In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

In [2]:
df = pd.read_csv("cleaned_combined_articles.csv")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")
df.head()

Shape: (1348, 5)
Columns: Index(['id', 'title', 'link', 'content', 'gold_label'], dtype='object')


Unnamed: 0,id,title,link,content,gold_label
0,391,پاکستانی طلبا عالمی نیوکلیئر اولمپیاڈ اعزازات,https://jang.com.pk/news/1378483,پاکستانی طلبا بین الاقوامی نیوکلیئر سائنس اولم...,science-technology
1,76,ملک آج ڈالر قیمت رہی,https://urdu.geo.tv/latest/387502-,کراچی ملکی تبادلہ منڈیوں ڈالر قیمت اضافہ ہوگیا...,business
2,280,امریکی سینیٹ اسرائیل اسلحے فروخت روکنے متعلق ...,https://urdu.geo.tv/latest/387551-,واشنگٹن امریکی سینیٹ اسرائیل غزہ جنگ اسلحے فرو...,international
3,63,پاکستان سعودی عرب درآمدات کمی ایران انحصار بڑھ,https://urdu.geo.tv/latest/387804-,اسلام آباد رواں مالی سال پاکستان سعودی عرب در...,business
4,114,غزہ اسرائیل رہائشی بمباری فلسطینی شہید,https://urdu.samaa.tv/2087325042,عالمی فوجداری عدالت نیتن یاہو وارنٹ گرفتاری جا...,international


In [3]:
label_vector = df["gold_label"].unique()
print(label_vector)

['science-technology' 'business' 'international' 'sports' 'entertainment']


In [4]:
X = df[['title','content']]
X = X['title'] + X['content']
print(X.head())

0    پاکستانی طلبا عالمی نیوکلیئر اولمپیاڈ اعزازاتپ...
1    ملک آج ڈالر قیمت رہیکراچی ملکی تبادلہ منڈیوں ...
2    امریکی سینیٹ اسرائیل اسلحے فروخت روکنے متعلق  ...
3    پاکستان سعودی عرب درآمدات کمی ایران انحصار بڑ...
4    غزہ اسرائیل رہائشی بمباری  فلسطینی شہیدعالمی ف...
dtype: object


In [5]:
def one_hot_encode(labels, label_list):
    labels = labels.astype(str)
    label_dict = {label: idx for idx, label in enumerate(label_list)}
    labels_idx = [label_dict[label] for label in labels]
    num_classes = len(label_list)
    one_hot = np.zeros((len(labels), num_classes), dtype=int)
    for i, label in enumerate(labels_idx):
        one_hot[i, label] = 1
    return one_hot
y = df['gold_label']
y = torch.tensor(one_hot_encode(y, label_vector), dtype=torch.float64)

In [6]:


vectorizer = CountVectorizer()
bow_encodings = vectorizer.fit_transform(X)
print("Vocabulary size:", len(vectorizer.vocabulary_))

bow_dense = bow_encodings.toarray()
scaler = StandardScaler()
scaled_inputs = scaler.fit_transform(bow_dense)
print("Shape of scaled inputs:", scaled_inputs.shape)


Vocabulary size: 14865
Shape of scaled inputs: (1348, 14865)


In [7]:
X_tensor = torch.tensor(scaled_inputs, dtype=torch.float32)
print(X_tensor.shape)
y_tensor = torch.tensor(y, dtype=torch.float32)
print(y_tensor.shape)

torch.Size([1348, 14865])
torch.Size([1348, 5])


  y_tensor = torch.tensor(y, dtype=torch.float32)


In [8]:
X_train, X_temp, y_train, y_temp = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=2024)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=2024)

print(X_train.shape, X_val.shape, X_test.shape)

torch.Size([1078, 14865]) torch.Size([135, 14865]) torch.Size([135, 14865])


In [9]:
print(f"X_tensor[0]: {X_tensor[0]}")
print(f"y_tensor[0]: {y_tensor[0]}")

X_tensor[0]: tensor([-0.0472, -0.1051, -0.0272,  ..., -0.0272, -0.0445, -0.0272])
y_tensor[0]: tensor([1., 0., 0., 0., 0.])


In [10]:
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [11]:


# class TextClassifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(TextClassifier, self).__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         self.fc2 = nn.Linear(hidden_dim, output_dim)
#         self.dropout = nn.Dropout(0.7)
#         self.relu = nn.ReLU()

#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.dropout(x)
#         x = self.fc2(x)
#         return x

# input_dim = X_tensor.shape[1]
# hidden_dim = 64
# output_dim = y_tensor.shape[1]
# model = TextClassifier(input_dim, hidden_dim, output_dim)

# loss_fn = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.00001)

# num_epochs = 250

# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for batch in train_loader:
#         inputs, labels = batch
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = loss_fn(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()

#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for batch in val_loader:
#             inputs, labels = batch
#             outputs = model(inputs)
#             loss = loss_fn(outputs, labels)
#             val_loss += loss.item()

#     print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}")

# model.eval()
# test_loss = 0.0
# correct = 0
# total = 0
# with torch.no_grad():
#     for batch in test_loader:
#         inputs, labels = batch
#         outputs = model(inputs)
#         loss = loss_fn(outputs, labels)
#         test_loss += loss.item()
#         predicted = torch.sigmoid(outputs) > 0.5
#         correct += (predicted == labels).sum().item()
#         total += labels.size(0) * labels.size(1)

#     accuracy = correct / total * 100
#     print(f"Test Loss: {test_loss/len(test_loader):.4f}, Test Accuracy: {accuracy:.2f}%")


In [12]:
class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.7)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [13]:
class TextClassificationModel:
    def __init__(self, df, label_column, text_columns, batch_size=16, hidden_dim=64, lr=1e-5, num_epochs=250):
        self.df = df
        self.label_column = label_column
        self.text_columns = text_columns
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.lr = lr
        self.num_epochs = num_epochs
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.label_vector = df[label_column].unique()
        self.model = None
        self.train_loader = self.val_loader = self.test_loader = None
        self.input_dim = None
        self.vectorizer = CountVectorizer()
        self.output_dim = None
        self.loss_fn = nn.CrossEntropyLoss()

    def preprocess_data(self):
        df = self.df
        label_vector = self.label_vector
        X = df[self.text_columns[0]] + " " + df[self.text_columns[1]]

        bow_encodings = self.vectorizer.fit_transform(X)
        bow_dense = bow_encodings.toarray()
        scaler = StandardScaler()
        scaled_inputs = scaler.fit_transform(bow_dense)

        def one_hot_encode(labels, label_list):
            label_dict = {label: idx for idx, label in enumerate(label_list)}
            labels_idx = [label_dict[label] for label in labels.astype(str)]
            num_classes = len(label_list)
            one_hot = np.zeros((len(labels), num_classes), dtype=int)
            for i, label in enumerate(labels_idx):
                one_hot[i, label] = 1
            return one_hot

        y = df[self.label_column]
        y_encoded = torch.tensor(one_hot_encode(y, label_vector), dtype=torch.float32)

        X_tensor = torch.tensor(scaled_inputs, dtype=torch.float32)
        X_train, X_temp, y_train, y_temp = train_test_split(X_tensor, y_encoded, test_size=0.2, random_state=2024)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=2024)

        train_dataset = TensorDataset(X_train, y_train)
        val_dataset = TensorDataset(X_val, y_val)
        test_dataset = TensorDataset(X_test, y_test)

        self.train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        self.val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
        self.test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)

        self.input_dim = X_tensor.shape[1]
        self.output_dim = y_encoded.shape[1]

    def fit(self):
        self.model = TextClassifier(self.input_dim, self.hidden_dim, self.output_dim).to(self.device)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        for epoch in range(self.num_epochs):
            self.model.train()
            running_loss = 0.0
            for inputs, labels in self.train_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            val_loss = self._evaluate_loss(self.val_loader)
            print(f"Epoch {epoch+1}/{self.num_epochs}, Training Loss: {running_loss/len(self.train_loader)}, Validation Loss: {val_loss}")

    def _evaluate_loss(self, loader):
        self.model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for inputs, labels in loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, labels)
                total_loss += loss.item()
        return total_loss / len(loader)

    def evaluate(self):
        self.model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for inputs, labels in self.test_loader:
                inputs = inputs.to(self.device)
                outputs = self.model(inputs)
                predicted = torch.sigmoid(outputs).cpu() > 0.5
                all_preds.extend(predicted.numpy())
                all_labels.extend(labels.numpy())

        y_true = np.argmax(all_labels, axis=1)
        y_pred = np.argmax(all_preds, axis=1)

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='macro')
        recall = recall_score(y_true, y_pred, average='macro')
        f1 = f1_score(y_true, y_pred, average='macro')
        cm = confusion_matrix(y_true, y_pred)

        print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
        print(f"Confusion Matrix:\n{cm}")

    def predict(self, text):
        self.model.eval()
        bow_encodings = self.vectorizer.fit_transform([text])
        bow_dense = bow_encodings.toarray()
        scaler = StandardScaler()
        scaled_input = scaler.fit_transform(bow_dense)
        input_tensor = torch.tensor(scaled_input, dtype=torch.float32).to(self.device)

        with torch.no_grad():
            output = self.model(input_tensor)
            predicted = torch.sigmoid(output).cpu() > 0.5
        return predicted.numpy()

In [14]:
model = TextClassificationModel(df, label_column="gold_label", text_columns=["title", "content"])
model.preprocess_data()
model.fit()

Epoch 1/250, Training Loss: 1.673615681774476, Validation Loss: 1.6564298735724554
Epoch 2/250, Training Loss: 1.6287585409248577, Validation Loss: 1.635440958870782
Epoch 3/250, Training Loss: 1.579948612872292, Validation Loss: 1.6142146322462294
Epoch 4/250, Training Loss: 1.5321616547949173, Validation Loss: 1.5919889873928494
Epoch 5/250, Training Loss: 1.4530364134732414, Validation Loss: 1.571008907424079
Epoch 6/250, Training Loss: 1.4229250620393192, Validation Loss: 1.5496302843093872
Epoch 7/250, Training Loss: 1.3943469384137321, Validation Loss: 1.523538867632548
Epoch 8/250, Training Loss: 1.3412714705747717, Validation Loss: 1.4969407320022583
Epoch 9/250, Training Loss: 1.2900572436697342, Validation Loss: 1.4701298342810736
Epoch 10/250, Training Loss: 1.2509787959211014, Validation Loss: 1.4426380925708346
Epoch 11/250, Training Loss: 1.2086423425113453, Validation Loss: 1.4167453580432467
Epoch 12/250, Training Loss: 1.1782168500563677, Validation Loss: 1.39348487059

In [15]:
model.evaluate()

Accuracy: 0.8667, Precision: 0.8874, Recall: 0.8663, F1 Score: 0.8672
Confusion Matrix:
[[27  0  1  0  0]
 [ 1 21  0  0  0]
 [ 5  0 16  0  1]
 [ 2  0  0 29  0]
 [ 5  2  1  0 24]]
