In [13]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
data = pd.read_csv('/Users/tahafaisal/Desktop/ml-news-classification/data/data5cleaned.csv')

# Load Urdu stopwords
with open('/Users/tahafaisal/Desktop/ml-news-classification/data/stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text

# Function to normalize Unicode characters
def normalize_unicode(text):
    return re.sub(r'[٠١٢٣٤٥٦٧٨٩]', '', text)  # Remove Urdu numerals

# Function to normalize Urdu-specific characters
def normalize_urdu(text):
    text = re.sub(r'[؁؂؃؄؅؆؇؈؉؊؋،؛؟]', '', text)  # Remove Urdu punctuation
    text = re.sub(r'[آإأٱ]', 'ا', text)  # Normalize different forms of 'alif'
    text = re.sub(r'[ىېۍ]', 'ی', text)  # Normalize different forms of 'ye'
    text = re.sub(r'[ۀہ]', 'ہ', text)  # Normalize 'heh'
    text = re.sub(r'[ؤو]', 'و', text)  # Normalize 'waw'
    text = re.sub(r'[ءئ]', 'ی', text)  # Normalize 'hamza' with 'ye'
    return text

# Function to tokenize text
def tokenize_text(text):
    return re.findall(r'\w+', text)  # Extract words using regex

# Function to remove stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stopwords]

# Function for custom lemmatization
def lemmatize_custom(word):
    if word.endswith('نا') or word.endswith('تے'):
        return word[:-2]  # Strip suffix
    elif word.endswith('ا') or word.endswith('ی'):
        return word[:-1]  # Remove singular/plural suffix
    return word

def lemmatize_text(tokens):
    return [lemmatize_custom(word) for word in tokens]

# Complete preprocessing function
def preprocess_urdu_text(text):
    text = clean_text(text)  # Clean text
    text = normalize_unicode(text)  # Normalize Unicode
    text = normalize_urdu(text)  # Normalize Urdu-specific characters
    tokens = tokenize_text(text)  # Tokenize text
    tokens = remove_stopwords(tokens)  # Remove stopwords
    tokens = lemmatize_text(tokens)  # Apply lemmatization
    return ' '.join(tokens)  # Return preprocessed text

# Apply preprocessing to dataset
data['title'] = data['title'].apply(preprocess_urdu_text)
data['content'] = data['content'].apply(preprocess_urdu_text)
data['combined'] = data['title'] + " " + data['content']

# Splitting data into training and testing sets
X = data['combined']
y = data['gold_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  # Unigrams and bigrams
X_train_vec = tfidf_vectorizer.fit_transform(X_train)
X_test_vec = tfidf_vectorizer.transform(X_test)

# SVM Model with Grid Search for Hyperparameter Tuning
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm = SVC()
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train_vec, y_train)

# Best parameters from Grid Search
print("Best Parameters:", grid_search.best_params_)

# Use the best model for testing
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_vec)

# Evaluation
print("\nFinal SVM Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.7s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.8s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.7s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   2.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   1.7s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   2.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   2.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   2.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   2.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   2.0s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   1.7s
[CV] END ...................C=0.1, gamma=auto, k

# Random KNN Implementation with SK Learn

In [12]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
data = pd.read_csv('/Users/tahafaisal/Desktop/ml-news-classification/data/data5cleaned.csv')

# Load Urdu stopwords
with open('/Users/tahafaisal/Desktop/ml-news-classification/data/stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Function to normalize Urdu text
def normalize_urdu(text):
    text = re.sub(r'[؁؂؃؄؅؆؇؈؉؊؋،؛؟]', '', text)  # Remove Urdu-specific punctuation
    text = re.sub(r'[آإأٱ]', 'ا', text)  # Normalize different forms of 'alif'
    text = re.sub(r'[ىېۍ]', 'ی', text)  # Normalize different forms of 'ye'
    text = re.sub(r'[ۀہ]', 'ہ', text)  # Normalize 'heh'
    text = re.sub(r'[ؤو]', 'و', text)  # Normalize 'waw'
    text = re.sub(r'[ءئ]', 'ی', text)  # Normalize 'hamza' with 'ye'
    return text

# Function to preprocess Urdu text
def preprocess_urdu_text(text):
    text = normalize_urdu(text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove any remaining punctuation
    text = ' '.join([word for word in text.split() if word not in stopwords])  # Remove stopwords
    return text

# Apply preprocessing to dataset
data['title'] = data['title'].apply(preprocess_urdu_text)
data['content'] = data['content'].apply(preprocess_urdu_text)
data['combined'] = data['title'] + " " + data['content']

# Splitting data into training and testing sets
X = data['combined']
y = data['gold_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  # Unigrams and bigrams
X_train_vec = tfidf_vectorizer.fit_transform(X_train)
X_test_vec = tfidf_vectorizer.transform(X_test)

# KNN Model with Grid Search for Hyperparameter Tuning
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Different values for k
    'weights': ['uniform', 'distance'],  # Weighting strategies
    'metric': ['euclidean', 'manhattan']  # Distance metrics
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train_vec.toarray(), y_train)  # KNN requires dense input

# Best parameters from Grid Search
print("Best Parameters:", grid_search.best_params_)

# Use the best model for testing
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test_vec.toarray())  # Convert sparse matrix to dense

# Evaluation
print("\nFinal KNN Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=5, wei

Traceback (most recent call last):
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packag

[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.9s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.9s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.8s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s


Traceback (most recent call last):
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packag

[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.8s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.0s


Traceback (most recent call last):
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packag

[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time=   0.9s
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time=   0.8s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=9, weights=uniform; total time=   0.0s


Traceback (most recent call last):
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tahafaisal/anaconda3/lib/python3.11/site-packag

[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   0.8s
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   0.8s
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}

Final KNN Model Performance:
Accuracy: 0.9748
Classification Report:
                     precision    recall  f1-score   support

          Business       0.97      1.00      0.99        76
     Entertainment       0.99      0.94      0.96        87
     International       0.95      0.98      0.97        99
Science-Technology       0.97      1.00      0.99        75
            Sports       0.99      0.96      0.97       100

          accuracy                           0.97       437
         macro avg  

 0.94622186 0.95022677        nan 0.73913713        nan 0.73399754
        nan 0.73570856        nan 0.75630127]
