In [10]:
dataset_path = "data"


def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` without line terminators."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().splitlines()


# One sample per line; texts and labels are paired by line position downstream.
text_train = _read_lines(f"{dataset_path}/train_texts.txt")
label_train = _read_lines(f"{dataset_path}/train_labels.txt")
text_test = _read_lines(f"{dataset_path}/test_texts.txt")
label_test = _read_lines(f"{dataset_path}/test_labels.txt")

# Quick sanity report: sample and label counts should match within each split.
print("📊 Số mẫu train:", len(text_train))
print("📊 Số nhãn train:", len(label_train))
print("📊 Số mẫu test:", len(text_test))
print("📊 Số nhãn test:", len(label_test))



📊 Số mẫu train: 10000
📊 Số nhãn train: 10000
📊 Số mẫu test: 2500
📊 Số nhãn test: 2500


**Neural Network**

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TF-IDF features: the vocabulary is learned from the training texts only and
# then re-applied unchanged to the test texts.
vectorizer = TfidfVectorizer(stop_words='english', max_features=20000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

# Map the raw labels to consecutive integer class ids.
label_encoder = LabelEncoder().fit(label_train)
y_train_enc = label_encoder.transform(label_train)
y_test_enc = label_encoder.transform(label_test)

# ``toarray()`` materialises the sparse matrices as dense arrays before they
# are converted to float32 feature tensors / int64 target tensors.
X_train_tensor = torch.tensor(X_train_vec.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_enc, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vec.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_enc, dtype=torch.long)

train_dataset, test_dataset = (
    TensorDataset(X_train_tensor, y_train_tensor),
    TensorDataset(X_test_tensor, y_test_tensor),
)

# Only the training batches are shuffled; evaluation keeps the file order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

class SimpleNN(nn.Module):
    """Feed-forward classifier: three ReLU hidden layers followed by a linear output layer.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width shared by the three hidden layers.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names and creation order are kept as-is: they determine the
        # state_dict keys and the order in which the layers are initialised.
        self.fc1, self.relu1 = nn.Linear(input_dim, hidden_dim), nn.ReLU()
        self.fc2, self.relu2 = nn.Linear(hidden_dim, hidden_dim), nn.ReLU()
        self.fc3, self.relu3 = nn.Linear(hidden_dim, hidden_dim), nn.ReLU()
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (unnormalised) class scores for the batch ``x``."""
        hidden = x
        stages = (
            (self.fc1, self.relu1),
            (self.fc2, self.relu2),
            (self.fc3, self.relu3),
        )
        for linear, activation in stages:
            hidden = activation(linear(hidden))
        return self.output(hidden)

# Network dimensions: one input per TF-IDF feature, one logit per encoded class.
input_dim = X_train_vec.shape[1]
hidden_dim = 128
output_dim = len(label_encoder.classes_)
model = SimpleNN(input_dim, hidden_dim, output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- Training ---
epochs = 2
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"📈 Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

# --- Evaluation on the held-out test set ---
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # The predicted class is the index of the largest logit in each row.
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# --- Metrics ---
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

print("\n✅ Đánh giá trên tập test:")
print(f"🎯 Accuracy :  {accuracy:.4f}")
print(f"🎯 Precision:  {precision:.4f}")
print(f"🎯 Recall   :  {recall:.4f}")
print(f"🎯 F1-score :  {f1:.4f}")
print("\n📋 Classification Report:")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

# --- Persist the weights together with the fitted preprocessing objects ---
# The path is held in one variable so the confirmation message cannot drift
# from the file that is actually written (the message used to name
# "text_classifier.pth" while the checkpoint went to "nn_model.pth").
checkpoint_path = 'nn_model.pth'
torch.save({
    'model_state_dict': model.state_dict(),
    'vectorizer': vectorizer,
    'label_encoder': label_encoder
}, checkpoint_path)
print(f"\n💾 Đã lưu mô hình vào {checkpoint_path}")

def predict(text):
    """Classify one raw text with the trained network.

    The text is vectorised with the fitted ``vectorizer``, scored by ``model``
    in eval mode, and the winning class id is mapped back to its original
    label through ``label_encoder``.
    """
    model.eval()
    features = torch.FloatTensor(vectorizer.transform([text]).toarray()).to(device)
    with torch.no_grad():
        logits = model(features)
    best_class = torch.max(logits, 1)[1]
    return label_encoder.inverse_transform(best_class.cpu().numpy())[0]


# Smoke test on a single hand-written example.
example_text = "How does inflation affect small businesses?"
predicted_label = predict(example_text)
print(f"\n📝 Input: {example_text}")
print(f"🔍 Predicted topic: {predicted_label}")


📈 Epoch 1/2, Loss: 0.7175
📈 Epoch 2/2, Loss: 0.1954

✅ Đánh giá trên tập test:
🎯 Accuracy :  0.8824
🎯 Precision:  0.8827
🎯 Recall   :  0.8824
🎯 F1-score :  0.8822

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.86      0.88       607
           1       0.93      0.97      0.95       627
           2       0.83      0.85      0.84       626
           3       0.86      0.85      0.85       640

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500


💾 Đã lưu mô hình vào text_classifier.pth

📝 Input: How does inflation affect small businesses?
🔍 Predicted topic: 2


**Naive Bayes**

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)
from sklearn.preprocessing import LabelEncoder

import joblib

# Encode the raw labels as consecutive integer class ids.
label_encoder = LabelEncoder().fit(label_train)
y_train_enc = label_encoder.transform(label_train)
y_test_enc = label_encoder.transform(label_test)

# TF-IDF features; the vocabulary is learned from the training texts only.
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

nb_model = MultinomialNB().fit(X_train_vec, y_train_enc)
y_pred = nb_model.predict(X_test_vec)

label_names = label_encoder.classes_

print("\n✅ Đánh giá mô hình Naive Bayes:")
print(f"🔹 Accuracy       : {accuracy_score(y_test_enc, y_pred):.4f}")
print(f"🔹 Precision (weighted): {precision_score(y_test_enc, y_pred, average='weighted'):.4f}")
print(f"🔹 Recall    (weighted): {recall_score(y_test_enc, y_pred, average='weighted'):.4f}")
print(f"🔹 F1-score  (weighted): {f1_score(y_test_enc, y_pred, average='weighted'):.4f}")
print(f"🔹 F1-score    (macro): {f1_score(y_test_enc, y_pred, average='macro'):.4f}")

print("\n📋 Classification Report:\n")
print(classification_report(y_test_enc, y_pred, target_names=label_names))


def predict_topic(text):
    """Return the original label the Naive Bayes model assigns to ``text``."""
    predicted_id = nb_model.predict(vectorizer.transform([text]))[0]
    return label_encoder.inverse_transform([predicted_id])[0]


# Persist the classifier and its preprocessing objects for the demo app.
joblib.dump(nb_model, "nb_model.pkl")
joblib.dump(vectorizer, "nb_vectorizer.pkl")
joblib.dump(label_encoder, "nb_label_encoder.pkl")

example = "How does inflation affect small businesses?"
print("\n📝 Input:", example)
print("🔍 Predicted topic:", predict_topic(example))



✅ Đánh giá mô hình Naive Bayes:
🔹 Accuracy       : 0.8788
🔹 Precision (weighted): 0.8784
🔹 Recall    (weighted): 0.8788
🔹 F1-score  (weighted): 0.8786
🔹 F1-score    (macro): 0.8788

📋 Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.88      0.89       607
           1       0.94      0.96      0.95       627
           2       0.83      0.83      0.83       626
           3       0.85      0.85      0.85       640

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500


📝 Input: How does inflation affect small businesses?
🔍 Predicted topic: 2


**Logistic Regression**

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF features; the vocabulary is learned from the training texts only.
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

# The label files hold one integer class id per line.
y_train = [int(raw_label) for raw_label in label_train]
y_test = [int(raw_label) for raw_label in label_test]

lr_model = LogisticRegression(max_iter=1000, solver='lbfgs').fit(X_train_vec, y_train)
y_pred = lr_model.predict(X_test_vec)

# Display names, one per distinct training label in ascending order.
label_names = [f"Topic {i}" for i in sorted(set(y_train))]

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n✅ Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_names))


def predict_topic(text):
    """Return the display name of the class predicted for ``text``.

    The predicted integer label is used directly as an index into
    ``label_names``.
    """
    predicted = lr_model.predict(vectorizer.transform([text]))[0]
    return label_names[int(predicted)]


# ``joblib`` and ``label_encoder`` are not created in this cell; they come from
# the Naive Bayes cell above.
joblib.dump(lr_model, "lr_model.pkl")
joblib.dump(vectorizer, "lr_vectorizer.pkl")
joblib.dump(label_encoder, "lr_label_encoder.pkl")

example = "How does inflation affect small businesses?"
print("\n📌 Predicted topic:", predict_topic(example))


✅ Accuracy: 0.8828

✅ Classification Report:
              precision    recall  f1-score   support

     Topic 0       0.88      0.89      0.89       607
     Topic 1       0.93      0.96      0.95       627
     Topic 2       0.86      0.83      0.85       626
     Topic 3       0.85      0.85      0.85       640

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500


📌 Predicted topic: Topic 3


**SGD**

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np


# TF-IDF features; the vocabulary is learned from the training texts only.
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

# The label files hold one integer class id per line.
y_train = [int(raw_label) for raw_label in label_train]
y_test = [int(raw_label) for raw_label in label_test]

# Linear model trained with SGD on the logistic loss; the seed is fixed.
sgd_model = SGDClassifier(loss="log_loss", random_state=0, n_jobs=-1).fit(X_train_vec, y_train)
y_pred = sgd_model.predict(X_test_vec)

print("✅ Accuracy (SGD):", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n")
print(classification_report(y_test, y_pred))

# ``joblib`` and ``label_encoder`` come from the Naive Bayes cell above.
joblib.dump(sgd_model, "sgd_model.pkl")
joblib.dump(vectorizer, "sgd_vectorizer.pkl")
joblib.dump(label_encoder, "sgd_label_encoder.pkl")

✅ Accuracy (SGD): 0.8828

📋 Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       607
           1       0.93      0.96      0.95       627
           2       0.87      0.83      0.85       626
           3       0.85      0.86      0.85       640

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500



['sgd_label_encoder.pkl']

**SVM**

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF features; the vocabulary is learned from the training texts only.
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

# The label files hold one integer class id per line.
y_train = [int(raw_label) for raw_label in label_train]
y_test = [int(raw_label) for raw_label in label_test]

# RBF-kernel support vector classifier.
svc_model = SVC(kernel='rbf', gamma='scale').fit(X_train_vec, y_train)
y_pred = svc_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ``joblib`` and ``label_encoder`` come from the Naive Bayes cell above.
joblib.dump(svc_model, "svc_model.pkl")
joblib.dump(vectorizer, "svc_vectorizer.pkl")
joblib.dump(label_encoder, "svc_label_encoder.pkl")

Accuracy: 0.8908
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.89       607
           1       0.94      0.96      0.95       627
           2       0.88      0.84      0.86       626
           3       0.85      0.87      0.86       640

    accuracy                           0.89      2500
   macro avg       0.89      0.89      0.89      2500
weighted avg       0.89      0.89      0.89      2500



['svc_label_encoder.pkl']

**Random Forest**

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np



# TF-IDF features; the vocabulary is learned from the training texts only.
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

# The label files hold one integer class id per line.
y_train = list(map(int, label_train))
y_test = list(map(int, label_test))

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=1500,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train_vec, y_train)

y_pred = rf_model.predict(X_test_vec)

# Build the report's display names here instead of reusing the global
# ``label_names``: that variable is assigned by other cells (Naive Bayes and
# Logistic Regression), so the report used to depend on which one ran last.
rf_label_names = [f"Topic {i}" for i in sorted(set(y_train))]

print("✅ Accuracy (Random Forest):", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=rf_label_names))

# ``joblib`` and ``label_encoder`` come from the Naive Bayes cell above.
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(vectorizer, "rf_vectorizer.pkl")
joblib.dump(label_encoder, "rf_label_encoder.pkl")

✅ Accuracy (Random Forest): 0.8392

📋 Classification Report:

              precision    recall  f1-score   support

     Topic 0       0.85      0.85      0.85       607
     Topic 1       0.90      0.93      0.91       627
     Topic 2       0.82      0.78      0.80       626
     Topic 3       0.79      0.80      0.80       640

    accuracy                           0.84      2500
   macro avg       0.84      0.84      0.84      2500
weighted avg       0.84      0.84      0.84      2500



['rf_label_encoder.pkl']

**Clustering**

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import joblib
# TF-IDF features; the vocabulary is learned from the training texts only.
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

label_train_int = np.array(label_train, dtype=int)
label_test_int = np.array(label_test, dtype=int)
available_labels = np.unique(label_train_int)

# One centroid per class: the mean TF-IDF vector of that class's training rows.
centroids = []
for label in available_labels:
    member_rows = np.flatnonzero(label_train_int == label)
    centroids.append(X_train_vec[member_rows].mean(axis=0))

# Stack the per-class means into a plain 2-D array (one row per class).
centroids_matrix = np.asarray(np.vstack(centroids))

# Each test document gets the label of its most cosine-similar centroid.
similarities = cosine_similarity(X_test_vec, centroids_matrix)
y_pred = available_labels[similarities.argmax(axis=1)]

print("✅ Accuracy:", accuracy_score(label_test_int, y_pred))
print("\n📄 Classification Report:\n", classification_report(label_test_int, y_pred, digits=4))

# Save vectorizer and centroids
joblib.dump(vectorizer, "cluster_vectorizer.pkl")
joblib.dump(centroids_matrix, "cluster_centroids.pkl")


✅ Accuracy: 0.8552

📄 Classification Report:
               precision    recall  f1-score   support

           0     0.8553    0.8764    0.8657       607
           1     0.9244    0.9362    0.9303       627
           2     0.8294    0.7843    0.8062       626
           3     0.8111    0.8250    0.8180       640

    accuracy                         0.8552      2500
   macro avg     0.8550    0.8555    0.8551      2500
weighted avg     0.8548    0.8552    0.8548      2500


✅ Vectorizer and centroids have been saved successfully!


**ProdLDA**

In [45]:


import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class ProdLDA(nn.Module):
    '''
        ProdLDA topic model.

        Autoencoding Variational Inference For Topic Models. ICLR 2017

        Akash Srivastava, Charles Sutton.

        An encoder maps a document vector of size ``vocab_size`` to the mean and
        log-variance of a ``num_topics``-dimensional Gaussian; a softmax of the
        (sampled) latent gives the topic proportions ``theta``; a linear decoder
        followed by batch-norm and softmax reconstructs a word distribution.

        Args:
            vocab_size: number of input features (vocabulary size).
            num_topics: dimensionality of the latent topic vector.
            en_units: width of the two encoder hidden layers.
            dropout: dropout probability used on the encoder output and on theta.
    '''
    def __init__(self, vocab_size, num_topics=50, en_units=200, dropout=0.4):
        super().__init__()

        self.num_topics = num_topics

        # Fixed prior mean / variance, computed from the constant vector ``a``.
        # They are stored as parameters (so they live in the state_dict) but are
        # frozen right below.
        self.a = 1 * np.ones((1, num_topics)).astype(np.float32)
        self.mu2 = nn.Parameter(torch.as_tensor((np.log(self.a).T - np.mean(np.log(self.a), 1)).T))
        self.var2 = nn.Parameter(torch.as_tensor((((1.0 / self.a) * (1 - (2.0 / num_topics))).T + (1.0 / (num_topics * num_topics)) * np.sum(1.0 / self.a, 1)).T))

        self.mu2.requires_grad = False
        self.var2.requires_grad = False

        # Encoder: two hidden layers, then separate heads for mean and log-variance.
        self.fc11 = nn.Linear(vocab_size, en_units)
        self.fc12 = nn.Linear(en_units, en_units)
        self.fc21 = nn.Linear(en_units, num_topics)
        self.fc22 = nn.Linear(en_units, num_topics)

        # Batch-norm layers whose scale is pinned to 1 and frozen; only the
        # weight is frozen here, the bias is left trainable.
        self.mean_bn = nn.BatchNorm1d(num_topics, eps=0.001, momentum=0.001, affine=True)
        self.mean_bn.weight.data.copy_(torch.ones(num_topics))
        self.mean_bn.weight.requires_grad = False

        self.logvar_bn = nn.BatchNorm1d(num_topics, eps=0.001, momentum=0.001, affine=True)
        self.logvar_bn.weight.data.copy_(torch.ones(num_topics))
        self.logvar_bn.weight.requires_grad = False

        self.decoder_bn = nn.BatchNorm1d(vocab_size, eps=0.001, momentum=0.001, affine=True)
        self.decoder_bn.weight.data.copy_(torch.ones(vocab_size))
        self.decoder_bn.weight.requires_grad = False

        self.fc1_drop = nn.Dropout(dropout)
        self.theta_drop = nn.Dropout(dropout)

        # Decoder: bias-free linear map from topics to vocabulary.
        self.fcd1 = nn.Linear(num_topics, vocab_size, bias=False)
        nn.init.xavier_uniform_(self.fcd1.weight)

    def get_beta(self):
        """Return the decoder weight transposed, shape ``(num_topics, vocab_size)``."""
        return self.fcd1.weight.T

    def _infer_theta(self, x):
        """Return ``(theta, mu, logvar)`` for ``x`` regardless of train/eval mode."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        theta = F.softmax(z, dim=1)
        theta = self.theta_drop(theta)
        return theta, mu, logvar

    def get_theta(self, x):
        """Return the topic proportions for ``x``.

        In training mode the tuple ``(theta, mu, logvar)`` is returned; in eval
        mode only ``theta`` is returned.
        """
        theta, mu, logvar = self._infer_theta(x)
        if self.training:
            return theta, mu, logvar
        else:
            return theta

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` in training mode; return ``mu`` in eval mode."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + (eps * std)
        else:
            return mu

    def encode(self, x):
        """Return the batch-normalised ``(mean, log-variance)`` of the latent for ``x``."""
        e1 = F.softplus(self.fc11(x))
        e1 = F.softplus(self.fc12(e1))
        e1 = self.fc1_drop(e1)
        return self.mean_bn(self.fc21(e1)), self.logvar_bn(self.fc22(e1))

    def decode(self, theta):
        """Map topic proportions to a distribution over the vocabulary."""
        d1 = F.softmax(self.decoder_bn(self.fcd1(theta)), dim=1)
        return d1

    def forward(self, x):
        """Return ``{'loss': scalar}`` for the batch ``x``.

        ``mu`` and ``logvar`` are taken from ``_infer_theta`` rather than from
        ``get_theta``: the latter returns a bare tensor in eval mode, so
        unpacking its result into three values used to fail outside training.
        """
        theta, mu, logvar = self._infer_theta(x)
        recon_x = self.decode(theta)
        loss = self.loss_function(x, recon_x, mu, logvar)
        return {'loss': loss}

    def loss_function(self, x, recon_x, mu, logvar):
        """Return the batch mean of reconstruction loss plus KL divergence to the fixed prior."""
        # Negative log-likelihood of the input counts under the reconstruction;
        # the small constant keeps log() finite.
        recon_loss = -(x * (recon_x + 1e-10).log()).sum(axis=1)
        var = logvar.exp()
        var_division = var / self.var2
        diff = mu - self.mu2
        diff_term = diff * diff / self.var2
        logvar_division = self.var2.log() - logvar
        KLD = 0.5 * ((var_division + diff_term + logvar_division).sum(axis=1) - self.num_topics)
        loss = (recon_loss + KLD).mean()
        return loss

In [46]:
from scipy import sparse
import os
import torch
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
# Load vocabulary
def load_vocab(vocab_path):
    """Read a one-entry-per-line vocabulary file and map each entry to its line index.

    Every line is stripped of surrounding whitespace. If an entry occurs more
    than once, the index of its last occurrence is kept.
    """
    word_to_index = {}
    with open(vocab_path, 'r', encoding='utf-8') as handle:
        for position, raw_line in enumerate(handle):
            word_to_index[raw_line.strip()] = position
    return word_to_index

# Load texts
def load_texts(text_path):
    """Return the lines of the UTF-8 file at ``text_path``, each stripped of surrounding whitespace."""
    with open(text_path, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

# Dataset location for the topic model. This rebinds the ``dataset_path``
# variable that the first cell set to "data".
dataset_path = "/kaggle/working/ECRTM/data/AGNews"

# Resolve the input files, then load the vocabulary and the raw texts.
vocab_path = os.path.join(dataset_path, "vocab.txt")
train_texts_path = os.path.join(dataset_path, "train_texts.txt")
test_texts_path = os.path.join(dataset_path, "test_texts.txt")

vocab_dict = load_vocab(vocab_path)
train_texts = load_texts(train_texts_path)
test_texts = load_texts(test_texts_path)

# Build bag-of-words count matrices restricted to the fixed vocabulary.
print("Creating BOW matrices with sklearn...")
vectorizer = CountVectorizer(vocabulary=vocab_dict, lowercase=True, stop_words='english')
X_train_sparse = vectorizer.fit_transform(train_texts)
X_test_sparse = vectorizer.transform(test_texts)

# Dense float tensors, placed on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.FloatTensor(X_train_sparse.toarray()).to(device)
X_test_tensor = torch.FloatTensor(X_test_sparse.toarray()).to(device)

# Model and optimizer
model = ProdLDA(vocab_size=X_train_tensor.shape[1], num_topics=50).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training: consecutive (unshuffled) mini-batches of 64 rows; the printed value
# is the sum of the per-batch losses over the epoch.
epochs = 500
model.train()
for epoch in range(epochs):
    total_loss = 0
    for start in range(0, X_train_tensor.size(0), 64):
        optimizer.zero_grad()
        loss = model(X_train_tensor[start:start + 64])['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Extract theta
def extract_theta(model, X_tensor, batch_size=64):
    """Run ``model.get_theta`` over ``X_tensor`` in mini-batches and return a NumPy array.

    The model is switched to eval mode (and left in eval mode) and gradients
    are disabled while the batches are processed.

    Args:
        model: object exposing ``eval()`` and ``get_theta(batch)``. ``get_theta``
            may return either a tensor or a tuple whose first item is the tensor.
        X_tensor: input tensor; rows are processed in consecutive slices.
        batch_size: number of rows per slice (defaults to 64, the value that was
            previously hard-coded).

    Returns:
        The per-batch results moved to the CPU, concatenated along the first
        dimension and converted with ``.numpy()``.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    theta_list = []
    with torch.no_grad():
        for start in range(0, X_tensor.size(0), batch_size):
            theta = model.get_theta(X_tensor[start:start + batch_size])
            # A model still in training mode may return (theta, mu, logvar).
            if isinstance(theta, tuple):
                theta = theta[0]
            theta_list.append(theta.cpu())
    return torch.cat(theta_list).numpy()

# Topic proportions of every document, used as features for the classifier.
X_train_theta = extract_theta(model, X_train_tensor)
X_test_theta = extract_theta(model, X_test_tensor)

# ``label_train`` / ``label_test`` are the label lists loaded by the first cell.
# NOTE(review): the texts used here were read from /kaggle/working/ECRTM/data/AGNews
# while the labels came from the first cell's "data" directory -- confirm both
# refer to the same split in the same order.
clf = SVC(kernel='rbf', gamma='scale')
clf.fit(X_train_theta, label_train)
y_pred = clf.predict(X_test_theta)

print("\nAccuracy:", accuracy_score(label_test, y_pred))
print("\nClassification Report:")
print(classification_report(label_test, y_pred))

# Save the artifacts once, after the classifier exists. (A copy of this save
# block used to sit before ``clf`` was created, which raised NameError on a
# fresh kernel.)
# 1. Save the ProdLDA model weights
torch.save(model.state_dict(), "prodlda.pth")

# 2. Save the SVM classifier
joblib.dump(clf, "prodlda_svm.pkl")

# 3. Save the CountVectorizer
joblib.dump(vectorizer, "prodlda_bow_vectorizer.pkl")

Creating BOW matrices with sklearn...
Epoch 1, Loss: 28851.8849
Epoch 2, Loss: 24750.3757
Epoch 3, Loss: 24566.9622
Epoch 4, Loss: 24401.4527
Epoch 5, Loss: 24253.1013
Epoch 6, Loss: 24109.1375
Epoch 7, Loss: 23986.9417
Epoch 8, Loss: 23878.4209
Epoch 9, Loss: 23774.5526
Epoch 10, Loss: 23675.8763
Epoch 11, Loss: 23592.5010
Epoch 12, Loss: 23521.2204
Epoch 13, Loss: 23450.5224
Epoch 14, Loss: 23385.7997
Epoch 15, Loss: 23329.9842
Epoch 16, Loss: 23281.7941
Epoch 17, Loss: 23231.9182
Epoch 18, Loss: 23196.4272
Epoch 19, Loss: 23150.4696
Epoch 20, Loss: 23126.2598
Epoch 21, Loss: 23088.6696
Epoch 22, Loss: 23063.9016
Epoch 23, Loss: 23037.3634
Epoch 24, Loss: 23019.5582
Epoch 25, Loss: 23001.4425
Epoch 26, Loss: 22988.4018
Epoch 27, Loss: 22974.8255
Epoch 28, Loss: 22952.2537
Epoch 29, Loss: 22958.6703
Epoch 30, Loss: 22946.7143
Epoch 31, Loss: 22931.9573
Epoch 32, Loss: 22921.7793
Epoch 33, Loss: 22920.2880
Epoch 34, Loss: 22908.8436
Epoch 35, Loss: 22904.4244
Epoch 36, Loss: 22878.6336

['prodlda_bow_vectorizer.pkl']

In [38]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Co

In [53]:
import gradio as gr
import numpy as np
import torch
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn

# Định nghĩa lại SimpleNN và ProdLDA nếu cần
class SimpleNN(nn.Module):
    """Feed-forward classifier re-declared so the saved weights can be loaded.

    Three ReLU-activated hidden layers of width ``hidden_dim`` followed by a
    linear layer producing ``output_dim`` logits. Attribute names must match
    the keys of the checkpoint's state_dict.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1, self.relu1 = nn.Linear(input_dim, hidden_dim), nn.ReLU()
        self.fc2, self.relu2 = nn.Linear(hidden_dim, hidden_dim), nn.ReLU()
        self.fc3, self.relu3 = nn.Linear(hidden_dim, hidden_dim), nn.ReLU()
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        hidden = self.relu3(self.fc3(hidden))
        return self.output(hidden)

class ProdLDA(nn.Module):
    """Inference-side re-declaration of the ProdLDA topic model.

    The layers mirror the training definition so the saved state_dict loads
    unchanged; only the encoder path is exercised here.

    Args:
        vocab_size: number of input features (vocabulary size).
        num_topics: dimensionality of the latent topic vector.
        en_units: width of the two encoder hidden layers.
        dropout: dropout probability used on the encoder output and on theta.
    """

    def __init__(self, vocab_size, num_topics=50, en_units=200, dropout=0.4):
        super().__init__()
        self.num_topics = num_topics

        # Fixed prior mean / variance computed from the constant vector ``a``;
        # kept as frozen parameters so they appear in the state_dict.
        self.a = 1 * np.ones((1, num_topics)).astype(np.float32)
        self.mu2 = nn.Parameter(torch.as_tensor((np.log(self.a).T - np.mean(np.log(self.a), 1)).T))
        self.var2 = nn.Parameter(torch.as_tensor((((1.0 / self.a) * (1 - (2.0 / num_topics))).T + (1.0 / (num_topics * num_topics)) * np.sum(1.0 / self.a, 1)).T))
        self.mu2.requires_grad = False
        self.var2.requires_grad = False

        # Encoder: two hidden layers, then separate heads for mean and log-variance.
        self.fc11 = nn.Linear(vocab_size, en_units)
        self.fc12 = nn.Linear(en_units, en_units)
        self.fc21 = nn.Linear(en_units, num_topics)
        self.fc22 = nn.Linear(en_units, num_topics)

        # Batch-norm layers whose scale is pinned to 1 and frozen.
        self.mean_bn = nn.BatchNorm1d(num_topics, eps=0.001, momentum=0.001, affine=True)
        self.mean_bn.weight.data.copy_(torch.ones(num_topics))
        self.mean_bn.weight.requires_grad = False
        self.logvar_bn = nn.BatchNorm1d(num_topics, eps=0.001, momentum=0.001, affine=True)
        self.logvar_bn.weight.data.copy_(torch.ones(num_topics))
        self.logvar_bn.weight.requires_grad = False
        self.decoder_bn = nn.BatchNorm1d(vocab_size, eps=0.001, momentum=0.001, affine=True)
        self.decoder_bn.weight.data.copy_(torch.ones(vocab_size))
        self.decoder_bn.weight.requires_grad = False

        self.fc1_drop = nn.Dropout(dropout)
        self.theta_drop = nn.Dropout(dropout)

        # Decoder weights are declared only so the checkpoint's keys match.
        self.fcd1 = nn.Linear(num_topics, vocab_size, bias=False)
        nn.init.xavier_uniform_(self.fcd1.weight)

    def get_theta(self, x):
        """Return topic proportions for ``x``.

        In training mode the tuple ``(theta, mu, logvar)`` is returned; in eval
        mode only ``theta`` is returned.
        """
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        theta = nn.functional.softmax(z, dim=1)
        theta = self.theta_drop(theta)
        if self.training:
            return theta, mu, logvar
        else:
            return theta

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` in training mode; return ``mu`` in eval mode."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + (eps * std)
        else:
            return mu

    def encode(self, x):
        """Return the batch-normalised ``(mean, log-variance)`` of the latent for ``x``."""
        e1 = nn.functional.softplus(self.fc11(x))
        e1 = nn.functional.softplus(self.fc12(e1))
        e1 = self.fc1_drop(e1)
        return self.mean_bn(self.fc21(e1)), self.logvar_bn(self.fc22(e1))

    def forward(self, x):
        """Return ``theta`` for ``x`` in both training and eval mode.

        ``get_theta`` returns a bare tensor in eval mode, so its result is only
        unpacked when it is actually a tuple; unconditionally unpacking three
        values used to fail once the model was put in eval mode.
        """
        inferred = self.get_theta(x)
        if isinstance(inferred, tuple):
            return inferred[0]
        return inferred

# Human-readable category names, indexed by integer class id.
label_names = ["World", "Sports", "Business", "Sci/Tech"]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved models and vectorizers.

# --- Neural network: one checkpoint holds weights, vectorizer and label encoder ---
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects (required here for the vectorizer/encoder); only load trusted files.
nn_ckpt = torch.load('nn_model.pth', map_location=device, weights_only=False)
nn_vectorizer, nn_label_encoder = nn_ckpt['vectorizer'], nn_ckpt['label_encoder']
input_dim = len(nn_vectorizer.get_feature_names_out())
hidden_dim = 128
output_dim = len(nn_label_encoder.classes_)
nn_model = SimpleNN(input_dim, hidden_dim, output_dim)
nn_model.load_state_dict(nn_ckpt['model_state_dict'])
nn_model = nn_model.to(device).eval()

# --- scikit-learn models: (model, vectorizer, label encoder) per classifier ---
nb_model, nb_vectorizer, nb_label_encoder = map(
    joblib.load, ('nb_model.pkl', 'nb_vectorizer.pkl', 'nb_label_encoder.pkl')
)
lr_model, lr_vectorizer, lr_label_encoder = map(
    joblib.load, ('lr_model.pkl', 'lr_vectorizer.pkl', 'lr_label_encoder.pkl')
)
svc_model, svc_vectorizer, svc_label_encoder = map(
    joblib.load, ('svc_model.pkl', 'svc_vectorizer.pkl', 'svc_label_encoder.pkl')
)
sgd_model, sgd_vectorizer, sgd_label_encoder = map(
    joblib.load, ('sgd_model.pkl', 'sgd_vectorizer.pkl', 'sgd_label_encoder.pkl')
)
rf_model, rf_vectorizer, rf_label_encoder = map(
    joblib.load, ('rf_model.pkl', 'rf_vectorizer.pkl', 'rf_label_encoder.pkl')
)

# --- ProdLDA: bag-of-words vectorizer, SVM on topic proportions, encoder weights ---
prodlda_vectorizer, prodlda_svm = map(
    joblib.load, ('prodlda_bow_vectorizer.pkl', 'prodlda_svm.pkl')
)
prodlda_model = ProdLDA(vocab_size=len(prodlda_vectorizer.vocabulary_), num_topics=50)
prodlda_model.load_state_dict(torch.load('prodlda.pth', map_location=device))
prodlda_model = prodlda_model.to(device).eval()

def prodlda_infer(text):
    """Classify ``text`` via ProdLDA topic proportions and the SVM trained on them.

    Returns the entry of ``label_names`` indexed by the SVM's predicted label.
    """
    counts = prodlda_vectorizer.transform([text]).toarray()
    with torch.no_grad():
        theta = prodlda_model.get_theta(torch.FloatTensor(counts).to(device))
    # get_theta returns a tuple when the model is in training mode.
    if isinstance(theta, tuple):
        theta = theta[0]
    predicted = prodlda_svm.predict(theta.cpu().numpy())[0]
    return label_names[int(predicted)]

# Nearest-centroid classifier: the fitted vectorizer and the per-class
# centroids are loaded from the files written by the clustering cell.
cluster_vectorizer = joblib.load('cluster_vectorizer.pkl')
cluster_centroids = joblib.load('cluster_centroids.pkl')


def cluster_infer(text):
    """Return the category whose centroid is most cosine-similar to ``text``.

    The row index of the best centroid is used directly as an index into
    ``label_names``.
    """
    scores = cosine_similarity(cluster_vectorizer.transform([text]), cluster_centroids)
    best_row = np.argmax(scores, axis=1)[0]
    return label_names[int(best_row)]

def infer(model_name, input_text):
    """Classify ``input_text`` with the model selected by ``model_name``.

    Args:
        model_name: one of the names offered in the UI dropdown.
        input_text: raw text to classify.

    Returns:
        The predicted category name from ``label_names``, or the string
        "Unknown model" when ``model_name`` matches none of the known models.
    """
    if model_name == "Neural Network":
        features = torch.FloatTensor(nn_vectorizer.transform([input_text]).toarray()).to(device)
        with torch.no_grad():
            class_index = torch.max(nn_model(features), 1)[1].item()
        return label_names[int(class_index)]

    # The scikit-learn models share one code path: vectorise, predict, map the
    # predicted integer to its category name. Lambdas defer the global lookups
    # so only the selected model's objects are touched.
    sklearn_pipelines = (
        ("Naive Bayes", lambda: (nb_vectorizer, nb_model)),
        ("Logistic Regression", lambda: (lr_vectorizer, lr_model)),
        ("SVM", lambda: (svc_vectorizer, svc_model)),
        ("SGD", lambda: (sgd_vectorizer, sgd_model)),
        ("Random Forest", lambda: (rf_vectorizer, rf_model)),
    )
    for known_name, resolve in sklearn_pipelines:
        if model_name == known_name:
            text_vectorizer, classifier = resolve()
            predicted = classifier.predict(text_vectorizer.transform([input_text]))[0]
            return label_names[int(predicted)]

    if model_name == "Prod LDA":
        return prodlda_infer(input_text)
    if model_name == "Cluster":
        return cluster_infer(input_text)
    return "Unknown model"

# Names offered in the dropdown; each one is handled by ``infer``.
model_choices = [
    "Neural Network", "Naive Bayes", "Logistic Regression", "SVM", "SGD", "Random Forest", "Prod LDA", "Cluster"
]

# Build the UI first, then start the server.
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Dropdown(choices=model_choices, label="Select Model"),
        gr.Textbox(lines=4, label="Enter News Content")
    ],
    outputs=gr.Label(label="Predicted Category"),
    title="AG News Text Classification",
    description="Choose a model and enter a news article to classify it into one of 4 categories: World, Sports, Business, Sci/Tech."
)
demo.launch()

* Running on local URL:  http://127.0.0.1:7863
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://3054e7223111dc8c61.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [56]:
rm /kaggle/working/archive.zip

In [58]:
# Recursively zip /kaggle/working/ into archive.zip; note the archive is written inside the directory being zipped.
!zip -r /kaggle/working/archive.zip /kaggle/working/

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/state.db (deflated 85%)
  adding: kaggle/working/prodlda_svm.pkl (deflated 53%)
  adding: kaggle/working/nn_model.pth (deflated 16%)
  adding: kaggle/working/nb_label_encoder.pkl (deflated 24%)
  adding: kaggle/working/svc_model.pkl (deflated 31%)
  adding: kaggle/working/sgd_model.pkl (deflated 5%)
  adding: kaggle/working/svc_vectorizer.pkl (deflated 74%)
  adding: kaggle/working/text_classifier.pth (deflated 16%)
  adding: kaggle/working/.gradio/ (stored 0%)
  adding: kaggle/working/.gradio/certificate.pem (deflated 24%)
  adding: kaggle/working/rf_vectorizer.pkl (deflated 74%)
  adding: kaggle/working/lr_vectorizer.pkl (deflated 74%)
  adding: kaggle/working/lr_label_encoder.pkl (deflated 24%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/rf_label_encoder.pkl (deflated 24%)
  adding: kaggle/working/sgd_vectorizer.pkl (deflated 74%)
  adding: kaggle/working/nb_vectorizer.pkl (deflated 