In [7]:
import pandas as pd
import re
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# --- Load Dataset ---
data = pd.read_csv('/Users/tahafaisal/Desktop/ml-news-classification/data/FINAL_DATASET.csv')

# Load Urdu stopwords
with open('/Users/tahafaisal/Desktop/ml-news-classification/data/stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# --- Preprocessing Functions ---
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text

def normalize_unicode(text):
    return re.sub(r'[٠١٢٣٤٥٦٧٨٩]', '', text)  # Remove Urdu numerals

def normalize_urdu(text):
    text = re.sub(r'[؁؂؃؄؅؆؇؈؉؊؋،؛؟]', '', text)  # Remove Urdu punctuation
    text = re.sub(r'[آإأٱ]', 'ا', text)  # Normalize different forms of 'alif'
    text = re.sub(r'[ىېۍ]', 'ی', text)  # Normalize different forms of 'ye'
    text = re.sub(r'[ۀہ]', 'ہ', text)  # Normalize 'heh'
    text = re.sub(r'[ؤو]', 'و', text)  # Normalize 'waw'
    text = re.sub(r'[ءئ]', 'ی', text)  # Normalize 'hamza' with 'ye'
    return text

def tokenize_text(text):
    return re.findall(r'\w+', text)  # Extract words using regex

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stopwords]

def preprocess_urdu_text(text):
    text = clean_text(text)  # Clean text
    text = normalize_unicode(text)  # Normalize Unicode
    text = normalize_urdu(text)  # Normalize Urdu-specific characters
    tokens = tokenize_text(text)  # Tokenize text
    tokens = remove_stopwords(tokens)  # Remove stopwords
    return ' '.join(tokens)  # Return preprocessed text

# Apply preprocessing to dataset
data['title'] = data['title'].apply(preprocess_urdu_text)
data['content'] = data['content'].apply(preprocess_urdu_text)
data['combined'] = data['title'] + " " + data['content']

# --- Feature Extraction ---
# Use TF-IDF for feature representation
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=6000, ngram_range=(1, 2))  # Include unigrams and bigrams
X = vectorizer.fit_transform(data['combined']).toarray()

# Map string labels to numerical labels
unique_labels = np.unique(data['gold_label'])
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
inverse_label_mapping = {idx: label for label, idx in label_mapping.items()}

y = np.array([label_mapping[label] for label in data['gold_label']])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train XGBoost Classifier ---
from sklearn.svm import SVC

# Train a Support Vector Machine Classifier
svm_model = SVC(kernel='linear', C=1, probability=True)  # You can tune 'C' and kernel type
svm_model.fit(X_train, y_train)

# Predict on test data
y_pred = svm_model.predict(X_test)
y_pred_labels = [inverse_label_mapping[label] for label in y_pred]
y_test_labels = [inverse_label_mapping[label] for label in y_test]

# Evaluate the Model
print("SVM Classifier Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))


print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))


SVM Classifier Accuracy: 0.9816933638443935
Classification Report:
                     precision    recall  f1-score   support

          Business       1.00      0.97      0.99        76
     Entertainment       0.99      0.98      0.98        87
     International       0.95      0.99      0.97        99
Science-Technology       0.97      0.99      0.98        75
            Sports       1.00      0.98      0.99       100

          accuracy                           0.98       437
         macro avg       0.98      0.98      0.98       437
      weighted avg       0.98      0.98      0.98       437

Accuracy: 0.9816933638443935
Classification Report:
                     precision    recall  f1-score   support

          Business       1.00      0.97      0.99        76
     Entertainment       0.99      0.98      0.98        87
     International       0.95      0.99      0.97        99
Science-Technology       0.97      0.99      0.98        75
            Sports       1.00      