In [1]:
!git clone https://github.com/BobXWu/ECRTM.git

fatal: destination path 'ECRTM' already exists and is not an empty directory.


In [2]:
dataset_path = "/kaggle/working/ECRTM/data/AGNews"

with open(f"{dataset_path}/train_texts.txt", "r", encoding="utf-8") as f:
    text_train = f.read().splitlines()

with open(f"{dataset_path}/train_labels.txt", "r", encoding="utf-8") as f:
    label_train = f.read().splitlines()

with open(f"{dataset_path}/test_texts.txt", "r", encoding="utf-8") as f:
    text_test = f.read().splitlines()

with open(f"{dataset_path}/test_labels.txt", "r", encoding="utf-8") as f:
    label_test = f.read().splitlines()

print("📊 Số mẫu train:", len(text_train))
print("📊 Số nhãn train:", len(label_train))
print("📊 Số mẫu test:", len(text_test))
print("📊 Số nhãn test:", len(label_test))



📊 Số mẫu train: 10000
📊 Số nhãn train: 10000
📊 Số mẫu test: 2500
📊 Số nhãn test: 2500


**Neural Network**

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

vectorizer = TfidfVectorizer(stop_words='english', max_features=20000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(label_train)
y_test_enc = label_encoder.transform(label_test)

X_train_tensor = torch.FloatTensor(X_train_vec.toarray())
y_train_tensor = torch.LongTensor(y_train_enc)
X_test_tensor = torch.FloatTensor(X_test_vec.toarray())
y_test_tensor = torch.LongTensor(y_test_enc)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.relu3 = nn.ReLU()
        self.output = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.output(x)
        return x

input_dim = X_train_vec.shape[1]
hidden_dim = 128
output_dim = len(label_encoder.classes_)
model = SimpleNN(input_dim, hidden_dim, output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 4
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"📈 Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# --- Metrics ---
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

print("\n✅ Đánh giá trên tập test:")
print(f"🎯 Accuracy :  {accuracy:.4f}")
print(f"🎯 Precision:  {precision:.4f}")
print(f"🎯 Recall   :  {recall:.4f}")
print(f"🎯 F1-score :  {f1:.4f}")
print("\n📋 Classification Report:")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

torch.save({
    'model_state_dict': model.state_dict(),
    'vectorizer': vectorizer,
    'label_encoder': label_encoder
}, 'text_classifier.pth')
print("\n💾 Đã lưu mô hình vào text_classifier.pth")

def predict(text):
    model.eval()
    vec = vectorizer.transform([text])
    input_tensor = torch.FloatTensor(vec.toarray()).to(device)
    with torch.no_grad():
        outputs = model(input_tensor)
        _, pred = torch.max(outputs, 1)
    return label_encoder.inverse_transform(pred.cpu().numpy())[0]

example_text = "How does inflation affect small businesses?"
predicted_label = predict(example_text)
print(f"\n📝 Input: {example_text}")
print(f"🔍 Predicted topic: {predicted_label}")


📈 Epoch 1/4, Loss: 0.7328
📈 Epoch 2/4, Loss: 0.2328
📈 Epoch 3/4, Loss: 0.1280
📈 Epoch 4/4, Loss: 0.0640

✅ Đánh giá trên tập test:
🎯 Accuracy :  0.8528
🎯 Precision:  0.8534
🎯 Recall   :  0.8528
🎯 F1-score :  0.8530

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       607
           1       0.94      0.92      0.93       627
           2       0.82      0.81      0.82       626
           3       0.83      0.82      0.83       640

    accuracy                           0.85      2500
   macro avg       0.85      0.85      0.85      2500
weighted avg       0.85      0.85      0.85      2500


💾 Đã lưu mô hình vào text_classifier.pth

📝 Input: How does inflation affect small businesses?
🔍 Predicted topic: 2


**Naive Bayes**

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(label_train)
y_test_enc = label_encoder.transform(label_test)

vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

model = MultinomialNB()
model.fit(X_train_vec, y_train_enc)

y_pred = model.predict(X_test_vec)

label_names = label_encoder.classes_

print("\n✅ Đánh giá mô hình Naive Bayes:")
print(f"🔹 Accuracy       : {accuracy_score(y_test_enc, y_pred):.4f}")
print(f"🔹 Precision (weighted): {precision_score(y_test_enc, y_pred, average='weighted'):.4f}")
print(f"🔹 Recall    (weighted): {recall_score(y_test_enc, y_pred, average='weighted'):.4f}")
print(f"🔹 F1-score  (weighted): {f1_score(y_test_enc, y_pred, average='weighted'):.4f}")
print(f"🔹 F1-score    (macro): {f1_score(y_test_enc, y_pred, average='macro'):.4f}")

print("\n📋 Classification Report:\n")
print(classification_report(y_test_enc, y_pred, target_names=label_names))

def predict_topic(text):
    vec = vectorizer.transform([text])
    pred = model.predict(vec)[0]
    return label_encoder.inverse_transform([pred])[0]

example = "How does inflation affect small businesses?"
print("\n📝 Input:", example)
print("🔍 Predicted topic:", predict_topic(example))



✅ Đánh giá mô hình Naive Bayes:
🔹 Accuracy       : 0.8788
🔹 Precision (weighted): 0.8784
🔹 Recall    (weighted): 0.8788
🔹 F1-score  (weighted): 0.8786
🔹 F1-score    (macro): 0.8788

📋 Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.88      0.89       607
           1       0.94      0.96      0.95       627
           2       0.83      0.83      0.83       626
           3       0.85      0.85      0.85       640

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500


📝 Input: How does inflation affect small businesses?
🔍 Predicted topic: 2


**Logistic Regression**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)

X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

y_train = list(map(int, label_train))
y_test = list(map(int, label_test))

lr_model = LogisticRegression(max_iter=1000, solver='lbfgs') 
lr_model.fit(X_train_vec, y_train)

y_pred = lr_model.predict(X_test_vec)

label_names = [f"Topic {i}" for i in sorted(set(y_train))]

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n✅ Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_names))

def predict_topic(text):
    vec = vectorizer.transform([text])
    pred = lr_model.predict(vec)[0]
    return label_names[int(pred)]

example = "How does inflation affect small businesses?"
print("\n📌 Predicted topic:", predict_topic(example))


✅ Accuracy: 0.8828

✅ Classification Report:
              precision    recall  f1-score   support

     Topic 0       0.88      0.89      0.89       607
     Topic 1       0.93      0.96      0.95       627
     Topic 2       0.86      0.83      0.85       626
     Topic 3       0.85      0.85      0.85       640

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500


📌 Predicted topic: Topic 3


**SGD**

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np


vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

y_train = list(map(int, label_train))
y_test = list(map(int, label_test))

sgd_model = SGDClassifier(
    loss="log_loss",   
    random_state=0,
    n_jobs=-1           
)
sgd_model.fit(X_train_vec, y_train)

y_pred = sgd_model.predict(X_test_vec)



print("✅ Accuracy (SGD):", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n")
print(classification_report(y_test, y_pred))


✅ Accuracy (SGD): 0.8828

📋 Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       607
           1       0.93      0.96      0.95       627
           2       0.87      0.83      0.85       626
           3       0.85      0.86      0.85       640

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500



**SVM**

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

y_train = list(map(int, label_train))
y_test = list(map(int, label_test))

svc_model = SVC(kernel='rbf', gamma='scale') 
svc_model.fit(X_train_vec, y_train)

y_pred = svc_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8908
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.89       607
           1       0.94      0.96      0.95       627
           2       0.88      0.84      0.86       626
           3       0.85      0.87      0.86       640

    accuracy                           0.89      2500
   macro avg       0.89      0.89      0.89      2500
weighted avg       0.89      0.89      0.89      2500



**Random Forest**

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np



vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

y_train = list(map(int, label_train))
y_test = list(map(int, label_test))

rf_model = RandomForestClassifier(
    n_estimators=200,        
    max_depth=None,          
    min_samples_split=2,     
    random_state=42,
    n_jobs=-1                
)
rf_model.fit(X_train_vec, y_train)

y_pred = rf_model.predict(X_test_vec)

print("✅ Accuracy (Random Forest):", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_names))


✅ Accuracy (Random Forest): 0.8392

📋 Classification Report:

              precision    recall  f1-score   support

     Topic 0       0.85      0.85      0.85       607
     Topic 1       0.90      0.93      0.91       627
     Topic 2       0.82      0.78      0.80       626
     Topic 3       0.79      0.80      0.80       640

    accuracy                           0.84      2500
   macro avg       0.84      0.84      0.84      2500
weighted avg       0.84      0.84      0.84      2500



**Clustering**

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
X_test_vec = vectorizer.transform(text_test)

label_train_int = np.array(label_train, dtype=int)
label_test_int = np.array(label_test, dtype=int)
available_labels = np.unique(label_train_int)

centroids = []
for label in available_labels:
    group_indices = np.where(label_train_int == label)[0]
    group_vectors = X_train_vec[group_indices]
    centroid = group_vectors.mean(axis=0)
    centroids.append(centroid)

centroids_matrix = np.asarray(np.vstack(centroids))
similarities = cosine_similarity(X_test_vec, centroids_matrix)
y_pred = available_labels[np.argmax(similarities, axis=1)]


print("✅ Accuracy:", accuracy_score(label_test_int, y_pred))
print("\n📄 Classification Report:\n", classification_report(label_test_int, y_pred, digits=4))


✅ Accuracy: 0.8552

📄 Classification Report:
               precision    recall  f1-score   support

           0     0.8553    0.8764    0.8657       607
           1     0.9244    0.9362    0.9303       627
           2     0.8294    0.7843    0.8062       626
           3     0.8111    0.8250    0.8180       640

    accuracy                         0.8552      2500
   macro avg     0.8550    0.8555    0.8551      2500
weighted avg     0.8548    0.8552    0.8548      2500



**PROLDA**

In [10]:


import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class ProdLDA(nn.Module):
    '''
        Autoencoding Variational Inference For Topic Models. ICLR 2017

        Akash Srivastava, Charles Sutton.
    '''
    def __init__(self, vocab_size, num_topics=50, en_units=200, dropout=0.4):
        super().__init__()

        self.num_topics = num_topics

        self.a = 1 * np.ones((1, num_topics)).astype(np.float32)
        self.mu2 = nn.Parameter(torch.as_tensor((np.log(self.a).T - np.mean(np.log(self.a), 1)).T))
        self.var2 = nn.Parameter(torch.as_tensor((((1.0 / self.a) * (1 - (2.0 / num_topics))).T + (1.0 / (num_topics * num_topics)) * np.sum(1.0 / self.a, 1)).T))

        self.mu2.requires_grad = False
        self.var2.requires_grad = False

        self.fc11 = nn.Linear(vocab_size, en_units)
        self.fc12 = nn.Linear(en_units, en_units)
        self.fc21 = nn.Linear(en_units, num_topics)
        self.fc22 = nn.Linear(en_units, num_topics)

        self.mean_bn = nn.BatchNorm1d(num_topics, eps=0.001, momentum=0.001, affine=True)
        self.mean_bn.weight.data.copy_(torch.ones(num_topics))
        self.mean_bn.weight.requires_grad = False

        self.logvar_bn = nn.BatchNorm1d(num_topics, eps=0.001, momentum=0.001, affine=True)
        self.logvar_bn.weight.data.copy_(torch.ones(num_topics))
        self.logvar_bn.weight.requires_grad = False

        self.decoder_bn = nn.BatchNorm1d(vocab_size, eps=0.001, momentum=0.001, affine=True)
        self.decoder_bn.weight.data.copy_(torch.ones(vocab_size))
        self.decoder_bn.weight.requires_grad = False

        self.fc1_drop = nn.Dropout(dropout)
        self.theta_drop = nn.Dropout(dropout)

        self.fcd1 = nn.Linear(num_topics, vocab_size, bias=False)
        nn.init.xavier_uniform_(self.fcd1.weight)

    def get_beta(self):
        return self.fcd1.weight.T

    def get_theta(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        theta = F.softmax(z, dim=1)
        theta = self.theta_drop(theta)
        if self.training:
            return theta, mu, logvar
        else:
            return theta

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + (eps * std)
        else:
            return mu

    def encode(self, x):
        e1 = F.softplus(self.fc11(x))
        e1 = F.softplus(self.fc12(e1))
        e1 = self.fc1_drop(e1)
        return self.mean_bn(self.fc21(e1)), self.logvar_bn(self.fc22(e1))

    def decode(self, theta):
        d1 = F.softmax(self.decoder_bn(self.fcd1(theta)), dim=1)
        return d1

    def forward(self, x):
        theta, mu, logvar = self.get_theta(x)
        recon_x = self.decode(theta)
        loss = self.loss_function(x, recon_x, mu, logvar)
        return {'loss': loss}

    def loss_function(self, x, recon_x, mu, logvar):
        recon_loss = -(x * (recon_x + 1e-10).log()).sum(axis=1)
        var = logvar.exp()
        var_division = var / self.var2
        diff = mu - self.mu2
        diff_term = diff * diff / self.var2
        logvar_division = self.var2.log() - logvar
        KLD = 0.5 * ((var_division + diff_term + logvar_division).sum(axis=1) - self.num_topics)
        loss = (recon_loss + KLD).mean()
        return loss

In [11]:
from scipy import sparse
import os
train_bow_path = os.path.join(dataset_path, "train_bow.npz")
test_bow_path = os.path.join(dataset_path, "test_bow.npz")

X_train_sparse = sparse.load_npz(train_bow_path)
X_test_sparse = sparse.load_npz(test_bow_path)

X_train_tensor = torch.FloatTensor(X_train_sparse.toarray())
X_test_tensor = torch.FloatTensor(X_test_sparse.toarray())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = X_train_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)

In [12]:
model = ProdLDA(vocab_size=X_train_tensor.shape[1], num_topics=50).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
epochs = 200

model.train()
for epoch in range(epochs):
    total_loss = 0
    for i in range(0, X_train_tensor.size(0), 64):
        batch = X_train_tensor[i:i+64]
        output = model(batch)
        loss = output['loss']
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 30755.5206
Epoch 2, Loss: 26514.2245
Epoch 3, Loss: 26319.0188
Epoch 4, Loss: 26135.6382
Epoch 5, Loss: 25972.4401
Epoch 6, Loss: 25820.2441
Epoch 7, Loss: 25683.7093
Epoch 8, Loss: 25562.3176
Epoch 9, Loss: 25453.1503
Epoch 10, Loss: 25350.7052
Epoch 11, Loss: 25254.2374
Epoch 12, Loss: 25168.0053
Epoch 13, Loss: 25080.9723
Epoch 14, Loss: 25010.5262
Epoch 15, Loss: 24955.3282
Epoch 16, Loss: 24895.0938
Epoch 17, Loss: 24843.3042
Epoch 18, Loss: 24797.2644
Epoch 19, Loss: 24762.1695
Epoch 20, Loss: 24730.8981
Epoch 21, Loss: 24695.1783
Epoch 22, Loss: 24676.4764
Epoch 23, Loss: 24646.6774
Epoch 24, Loss: 24624.9722
Epoch 25, Loss: 24601.0133
Epoch 26, Loss: 24579.1179
Epoch 27, Loss: 24562.6606
Epoch 28, Loss: 24547.2502
Epoch 29, Loss: 24524.0017
Epoch 30, Loss: 24507.0742
Epoch 31, Loss: 24506.9647
Epoch 32, Loss: 24497.1184
Epoch 33, Loss: 24474.8368
Epoch 34, Loss: 24470.1998
Epoch 35, Loss: 24470.5223
Epoch 36, Loss: 24458.6033
Epoch 37, Loss: 24441.2555
Epoch 38, 

In [13]:
def extract_theta(model, X_tensor):
    model.eval()
    theta_list = []
    with torch.no_grad():
        for i in range(0, X_tensor.size(0), 64):
            batch = X_tensor[i:i+64]
            theta = model.get_theta(batch)
            if isinstance(theta, tuple):
                theta = theta[0]
            theta_list.append(theta.cpu())
    return torch.cat(theta_list).numpy()

X_train_theta = extract_theta(model, X_train_tensor)
X_test_theta = extract_theta(model, X_test_tensor)


In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

clf = SVC(kernel='rbf', gamma='scale')
clf.fit(X_train_theta, label_train)
y_pred = clf.predict(X_test_theta)

print("\nAccuracy:", accuracy_score(label_test, y_pred))
print("\nClassification Report:")
print(classification_report(label_test, y_pred))



Accuracy: 0.8164

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       607
           1       0.92      0.92      0.92       627
           2       0.75      0.72      0.74       626
           3       0.73      0.80      0.76       640

    accuracy                           0.82      2500
   macro avg       0.82      0.82      0.82      2500
weighted avg       0.82      0.82      0.82      2500



**Ensemble**

In [15]:
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

dataset_path = "/kaggle/working/ECRTM/data/AGNews"

with open(f"{dataset_path}/train_texts.txt", "r", encoding="utf-8") as f:
    text_train = f.read().splitlines()

with open(f"{dataset_path}/train_labels.txt", "r", encoding="utf-8") as f:
    label_train = f.read().splitlines()

vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(text_train)
y_train = list(map(int, label_train))

print("Training Naive Bayes...")
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)
with open('nb_model.pkl', 'wb') as f:
    pickle.dump(nb_model, f)
print("Saved Naive Bayes model")

print("Training SVM...")
svm_model = SVC(kernel='rbf', gamma='scale', probability=True)
svm_model.fit(X_train_vec, y_train)
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)
print("Saved SVM model")

print("Training Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_model.fit(X_train_vec, y_train)
with open('lr_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)
print("Saved Logistic Regression model")

print("Training SGD...")
sgd_model = SGDClassifier(loss="log_loss", max_iter=1000, random_state=0)
sgd_model.fit(X_train_vec, y_train)
with open('sgd_model.pkl', 'wb') as f:
    pickle.dump(sgd_model, f)
print("Saved SGD model")

print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_vec, y_train)
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
print("Saved Random Forest model")

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
print("Saved vectorizer")

print("\nAll models have been saved successfully!")

Training Naive Bayes...
Saved Naive Bayes model
Training SVM...
Saved SVM model
Training Logistic Regression...
Saved Logistic Regression model
Training SGD...
Saved SGD model
Training Random Forest...
Saved Random Forest model
Saved vectorizer

All models have been saved successfully!


In [16]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Co

In [34]:
import gradio as gr
import numpy as np
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import os

vocab_path = "/kaggle/working/ECRTM/data/AGNews/vocab.txt"
with open(vocab_path, 'r', encoding='utf-8') as f:
    vocab = [line.strip() for line in f.readlines()]

bow_vectorizer = CountVectorizer(vocabulary=dict(zip(vocab, range(len(vocab)))))

label_names = ["World", "Sports", "Business", "Sci/Tech"]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

checkpoint = torch.load('text_classifier.pth', weights_only=False)
nn_vectorizer = checkpoint['vectorizer']

input_dim = len(nn_vectorizer.get_feature_names_out())
hidden_dim = 128
output_dim = len(label_names)
nn_model = SimpleNN(input_dim, hidden_dim, output_dim)
nn_model.load_state_dict(checkpoint['model_state_dict'])
nn_model = nn_model.to(device)
nn_model.eval()

prodlda_model = ProdLDA(vocab_size=len(vocab), num_topics=50).to(device)
prodlda_model.eval()

model_dict = {
    "Naive Bayes": nb_model,
    "Logistic Regression": lr_model,
    "SVM": svc_model,
    "SGD": sgd_model,
    "Random Forest": rf_model,
    "Neural Network": nn_model,
    "Prod LDA": prodlda_model,  
    "Classification (Kmeans)": None
}

if model_dict["Classification (Kmeans)"] is None:
    X_train_vec = vectorizer.transform(text_train)
    label_train_int = np.array(label_train, dtype=int)

    centroids = []
    for label in range(4):
        group_indices = np.where(label_train_int == label)[0]
        group_vectors = X_train_vec[group_indices]
        centroid = group_vectors.mean(axis=0)
        centroids.append(centroid)
    centroids_matrix = np.asarray(np.vstack(centroids))
    model_dict["Classification (Kmeans)"] = centroids_matrix

def classify_news(model_name, input_text):
    if model_name == "Neural Network":
        vec = nn_vectorizer.transform([input_text])
        input_tensor = torch.FloatTensor(vec.toarray()).to(device)
        model = model_dict[model_name]
        with torch.no_grad():
            outputs = model(input_tensor)
            _, pred = torch.max(outputs, 1)
            pred = pred.item()
    elif model_name == "Prod LDA":
        bow = bow_vectorizer.transform([input_text])
        input_tensor = torch.FloatTensor(bow.toarray()).to(device)
        with torch.no_grad():
            theta = model_dict[model_name].get_theta(input_tensor)
            if isinstance(theta, tuple):
                theta = theta[0]
            pred = clf.predict(theta.cpu().numpy())[0]
    elif model_name == "Classification (Kmeans)":
        vec = vectorizer.transform([input_text])
        similarities = cosine_similarity(vec, model_dict["Classification (Kmeans)"])
        pred = np.argmax(similarities, axis=1)[0]
    else:
        vec = vectorizer.transform([input_text])
        model = model_dict[model_name]
        pred = model.predict(vec)[0]

    return label_names[int(pred)]

gr.Interface(
    fn=classify_news,
    inputs=[
        gr.Dropdown(choices=list(model_dict.keys()), label="Select Model"),
        gr.Textbox(lines=4, label="Enter News Content")
    ],
    outputs=gr.Label(label="Predicted Category"),
    title="AG News Text Classification",
    description="Choose a model and enter a news article to classify it into one of 4 categories: World, Sports, Business, Sci/Tech."
).launch()

* Running on local URL:  http://127.0.0.1:7869
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://bb1e2929381b7d9e1e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


