In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the labelled articles; the statements below read the 'title',
# 'content' and 'gold_label' columns from this frame.
data = pd.read_csv('/Users/tahafaisal/Desktop/ml-news-classification/data/data5cleaned.csv')

# Load the stopword list (one entry per line) consulted by preprocessor().
# It is stored as a set because preprocessor() tests every word of every
# document against it; set membership is O(1) whereas a list is scanned
# linearly on each test. The filtering result is unchanged.
with open('/Users/tahafaisal/Desktop/ml-news-classification/data/stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = set(file.read().splitlines())

def preprocessor(text):
    """Return *text* without the words listed in the module-level ``stopwords``.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces, so runs of whitespace in the input are collapsed.
    """
    kept_words = []
    for token in text.split():
        if token in stopwords:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)

# Strip stopwords from both text columns, then merge them into one field
for text_column in ('title', 'content'):
    data[text_column] = data[text_column].apply(preprocessor)
data['combined'] = data['title'] + " " + data['content']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X = data['combined']
y = data['gold_label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag of Words Vectorization
class BagOfWords:
    """Minimal bag-of-words vectorizer over whitespace-separated tokens.

    ``fit`` learns a sorted vocabulary from a corpus; ``vectorize`` turns a
    single document into a list of per-word occurrence counts.
    """

    def __init__(self):
        # Maps each known word to its position in the count vector.
        self.vocabulary = {}
        # Length of every vector produced by vectorize().
        self.vocab_size = 0

    def fit(self, corpus):
        """Build the vocabulary from an iterable of document strings.

        Any previously learned vocabulary is replaced. Words are numbered
        in sorted order, starting at 0.
        """
        ordered_words = sorted(
            {token for document in corpus for token in document.split()}
        )
        self.vocabulary = dict(zip(ordered_words, range(len(ordered_words))))
        self.vocab_size = len(ordered_words)

    def vectorize(self, sentence):
        """Return the list of word counts for one document.

        The list has ``vocab_size`` entries; words that are not in the
        fitted vocabulary are ignored.
        """
        counts = [0] * self.vocab_size
        lookup = self.vocabulary.get
        for token in sentence.split():
            position = lookup(token)
            if position is not None:
                counts[position] += 1
        return counts

# Learn the vocabulary from the training documents only, then encode both splits
bow_model = BagOfWords()
bow_model.fit(X_train)

vector_train = list(map(bow_model.vectorize, X_train))
vector_test = list(map(bow_model.vectorize, X_test))

# Candidate classifiers, keyed by the display name used in the printed reports.
# The estimators that are seeded all share the single value below.
SEED = 42
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Support Vector Machine (SVM)": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=SEED),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(n_neighbors=5),
    "Neural Network (MLPClassifier)": MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=500,
        random_state=SEED,
    ),
}

# Fit each candidate on the training vectors and score it on the held-out set
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(vector_train, y_train)
    y_pred = model.predict(vector_test)

    # Keep the accuracy for the final comparison, then print the full report
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(
        f"\n{name} Model",
        f"Accuracy: {accuracy:.4f}",
        "Classification Report:",
        sep="\n",
    )
    print(classification_report(y_test, y_pred))

# Accuracy summary, listed in the order the models were trained
print("\nModel Comparison:")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")



Training Logistic Regression...

Logistic Regression Model
Accuracy: 0.9703
Classification Report:
                    precision    recall  f1-score   support

          Business       1.00      0.96      0.98        76
     Entertainment       0.98      0.97      0.97        87
     International       0.94      0.98      0.96        99
Science-Technology       0.94      0.99      0.96        75
            Sports       1.00      0.96      0.98       100

          accuracy                           0.97       437
         macro avg       0.97      0.97      0.97       437
      weighted avg       0.97      0.97      0.97       437


Training Naive Bayes...

Naive Bayes Model
Accuracy: 0.9840
Classification Report:
                    precision    recall  f1-score   support

          Business       1.00      0.97      0.99        76
     Entertainment       0.99      0.99      0.99        87
     International       0.96      0.99      0.98        99
Science-Technology       0.97   




AdaBoost Model
Accuracy: 0.7803
Classification Report:
                    precision    recall  f1-score   support

          Business       0.88      0.70      0.78        76
     Entertainment       0.94      0.72      0.82        87
     International       0.74      0.78      0.76        99
Science-Technology       0.54      0.91      0.68        75
            Sports       0.99      0.80      0.88       100

          accuracy                           0.78       437
         macro avg       0.82      0.78      0.78       437
      weighted avg       0.83      0.78      0.79       437


Training Extra Trees...

Extra Trees Model
Accuracy: 0.9680
Classification Report:
                    precision    recall  f1-score   support

          Business       0.99      0.95      0.97        76
     Entertainment       0.96      0.98      0.97        87
     International       0.93      0.99      0.96        99
Science-Technology       0.97      0.92      0.95        75
            Spor