In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [9]:
# Load the pre-cleaned article corpus and show a quick preview.
data = pd.read_csv('cleaned_combined_articles.csv')
print("Shape of the dataframe:", data.shape)
print("First 5 rows of the dataframe:")
print(data.head())
texts = data['content']

# One-hot encode the target. The class names are taken from the SAME dummy
# frame as the one-hot matrix so that labels_list[k] always names column k of
# `labels`. (data['gold_label'].unique() returns order of first appearance,
# which need not match the column order chosen by get_dummies.)
label_dummies = pd.get_dummies(data['gold_label'])
labels = label_dummies.values
labels_list = label_dummies.columns.to_numpy()
print("Labels:", labels_list)

Shape of the dataframe: (1348, 5)
First 5 rows of the dataframe:
    id                                              title  \
0  391      پاکستانی طلبا عالمی نیوکلیئر اولمپیاڈ اعزازات   
1   76                              ملک آج ڈالر قیمت رہی   
2  280  امریکی سینیٹ اسرائیل اسلحے فروخت روکنے متعلق  ...   
3   63    پاکستان سعودی عرب درآمدات کمی ایران انحصار بڑھ   
4  114            غزہ اسرائیل رہائشی بمباری  فلسطینی شہید   

                                 link  \
0    https://jang.com.pk/news/1378483   
1  https://urdu.geo.tv/latest/387502-   
2  https://urdu.geo.tv/latest/387551-   
3  https://urdu.geo.tv/latest/387804-   
4    https://urdu.samaa.tv/2087325042   

                                             content          gold_label  
0  پاکستانی طلبا بین الاقوامی نیوکلیئر سائنس اولم...  science-technology  
1  کراچی ملکی تبادلہ منڈیوں ڈالر قیمت اضافہ ہوگیا...            business  
2  واشنگٹن امریکی سینیٹ اسرائیل غزہ جنگ اسلحے فرو...       international  
3  اسلام آباد رواں م

In [3]:
def tokenize(text):
    """Lowercase ``text`` and split it into tokens on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

# Tokenize every document once; the token lists are reused when vectorizing.
tokenized_texts = [tokenize(document) for document in texts]

# Sorted list of distinct tokens, so each word gets a deterministic column index.
vocab = sorted({token for doc_tokens in tokenized_texts for token in doc_tokens})
vocab_ind = dict(zip(vocab, range(len(vocab))))
def vectorize(tokens, vocab_ind):
    """Return a bag-of-words count vector for ``tokens``.

    ``vocab_ind`` maps each known word to its position in the vector; the
    result has one float slot per entry of ``vocab_ind``. Tokens that are not
    in ``vocab_ind`` are ignored.
    """
    counts = np.zeros(len(vocab_ind))
    for word in tokens:
        column = vocab_ind.get(word)
        if column is None:
            # Out-of-vocabulary token: contributes nothing.
            continue
        counts[column] += 1
    return counts

# Document-term matrix: one bag-of-words count row per article.
vect_bow = np.array(
    [vectorize(doc_tokens, vocab_ind) for doc_tokens in tokenized_texts]
)

# Hold out a quarter of the articles for evaluation; the seed is fixed so the
# split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    vect_bow,
    labels,
    random_state=42,
    test_size=0.25,
)

print(train_x)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
class LogisticRegressionScratch:
    """One-vs-rest logistic regression trained with full-batch gradient descent.

    Every class has its own independent sigmoid output (scores are not
    normalised across classes). Prediction picks the class whose sigmoid
    score is highest.

    Parameters
    ----------
    lr : float
        Gradient-descent step size.
    epochs : int
        Number of full passes over the training data.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        # Populated by fit(): weights is (n_features, n_classes),
        # bias is (n_classes,). None means "not fitted yet".
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)), element-wise.

        Evaluated as exp(-log(1 + exp(-z))) through np.logaddexp, which does
        not overflow for large-magnitude inputs the way a direct exp(-z) does.
        """
        return np.exp(-np.logaddexp(0.0, -z))

    def fit(self, X, y):
        """Learn weights and bias from ``X`` and one-hot targets ``y``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples, n_classes)
            One-hot (0/1 or boolean) class indicators.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` or ``y`` is not 2-D, the row counts differ, or there are
            no training samples.
        """
        # Cast up front so boolean / integer one-hot targets subtract cleanly.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2 or y.ndim != 2:
            raise ValueError("X and y must both be 2-D (y one-hot encoded)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows")

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        n_classes = y.shape[1]
        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)

        for _ in range(self.epochs):
            linear_model = np.dot(X, self.weights) + self.bias
            # Residual between predicted probabilities and targets; shared by
            # both gradients, so compute it once per epoch.
            error = self.sigmoid(linear_model) - y

            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error, axis=0)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict(self, X):
        """Return the index of the highest-scoring class for each row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("model is not fitted; call fit() before predict()")
        X = np.asarray(X, dtype=float)
        probabilities = self.sigmoid(np.dot(X, self.weights) + self.bias)
        return np.argmax(probabilities, axis=1)


In [13]:
# Train the from-scratch classifier on the bag-of-words training split.
mod = LogisticRegressionScratch(lr=0.1, epochs=1000)
mod.fit(train_x, train_y)

# Collapse the one-hot test targets to class indices so they are comparable
# with the indices returned by predict().
y_test_ind = test_y.argmax(axis=1)
y_pred_ind = mod.predict(test_x)

acc = accuracy_score(y_true=y_test_ind, y_pred=y_pred_ind)
print(f"Accuracy: {acc:.2%}")
print("\nClassification Report:\n")
print(classification_report(y_true=y_test_ind, y_pred=y_pred_ind))

Accuracy: 95.85%

Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        70
           1       0.94      0.99      0.96        76
           2       0.94      0.94      0.94        66
           3       0.96      0.92      0.94        50
           4       0.99      0.95      0.97        75

    accuracy                           0.96       337
   macro avg       0.96      0.96      0.96       337
weighted avg       0.96      0.96      0.96       337

