In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datasets import load_dataset

In [None]:
# Load the "ag_news" dataset through the Hugging Face `datasets` library.
dataset = load_dataset("ag_news")

# Work on a fixed-size, reproducibly shuffled slice of the training split,
# then hold out 20% of that slice as a validation set.
SUBSET_SIZE = 20000
tiny_subset = dataset['train'].shuffle(seed=42).select(range(SUBSET_SIZE))
split = tiny_subset.train_test_split(test_size=0.2, seed=42)

train_dataset, val_dataset = split['train'], split['test']
# Evaluation uses the first 1000 rows of the provided test split (no shuffling).
test_dataset = dataset['test'].select(range(1000))

# Report the size of every split.
for split_name, split_data in (
    ("Train", train_dataset),
    ("Val", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_name}: {len(split_data)} samples")


Train: 16000 samples
Val: 4000 samples
Test: 1000 samples


In [None]:
def prepare_ml_data(train_ds=None, val_ds=None, test_ds=None):
    """Build plain Python text/label lists for the classical ML models.

    The train and validation splits are concatenated into a single training
    set (train rows first, then validation rows); the test split is returned
    separately.

    Args:
        train_ds: Training split. Must expose its ``'text'`` and ``'label'``
            columns through ``ds['text']`` / ``ds['label']``. Defaults to the
            notebook-level ``train_dataset``.
        val_ds: Validation split, same requirements. Defaults to the
            notebook-level ``val_dataset``.
        test_ds: Test split, same requirements. Defaults to the
            notebook-level ``test_dataset``.

    Returns:
        Tuple ``(ml_train_texts, ml_train_labels, test_texts, test_labels)``,
        each element a list.
    """
    # Only fall back to the notebook globals for arguments that were omitted,
    # so a bare ``prepare_ml_data()`` call keeps working as before.
    train_ds = train_dataset if train_ds is None else train_ds
    val_ds = val_dataset if val_ds is None else val_ds
    test_ds = test_dataset if test_ds is None else test_ds

    def _texts_and_labels(ds):
        # Read each column once instead of indexing every row twice
        # (once for 'text', once for 'label'). list() normalises whatever
        # sequence type the column access returns into a plain list.
        return list(ds['text']), list(ds['label'])

    train_texts, train_labels = _texts_and_labels(train_ds)
    val_texts, val_labels = _texts_and_labels(val_ds)
    test_texts, test_labels = _texts_and_labels(test_ds)

    # The classical models are fit on train + validation combined.
    ml_train_texts = train_texts + val_texts
    ml_train_labels = train_labels + val_labels

    print(f"ML Training samples: {len(ml_train_texts)}")
    print(f"ML Test samples: {len(test_texts)}")

    return ml_train_texts, ml_train_labels, test_texts, test_labels

# Materialise the text/label lists once; later cells read these module-level
# names. Note that train_texts/train_labels hold the train AND validation rows.
train_texts, train_labels, test_texts, test_labels = prepare_ml_data()

ML Training samples: 20000
ML Test samples: 1000


In [None]:
# Vectorizer settings, grouped so they can be read (and tuned) in one place.
tfidf_params = {
    'lowercase': True,
    'strip_accents': 'ascii',
    'stop_words': 'english',
    'ngram_range': (1, 2),   # unigrams and bigrams
    'min_df': 2,             # ignore terms seen in fewer than 2 documents
    'max_df': 0.8,           # ignore terms seen in more than 80% of documents
    'max_features': 15000,   # cap on vocabulary size
}
tfidf = TfidfVectorizer(**tfidf_params)

# Fit the vocabulary and IDF weights on the training texts only; the test
# texts are transformed with that already-fitted vectorizer.
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)

print(f" TF-IDF feature shape: {X_train.shape}")
print(f"   Vocabulary size: {len(tfidf.vocabulary_)}")



 TF-IDF feature shape: (20000, 15000)
   Vocabulary size: 15000


In [None]:
# Candidate classifiers keyed by display name. The dict's insertion order is
# the order in which the models are iterated when they are trained.
models = {
    "Logistic Regression": LogisticRegression(
        solver='liblinear', C=1.0, max_iter=1000, random_state=42
    ),
    "Multinomial Naive Bayes": MultinomialNB(alpha=0.1),
    # NOTE(review): probability=True adds fitting cost so that predict_proba
    # is available; nothing in the visible cells calls predict_proba --
    # confirm a downstream consumer needs it.
    "Support Vector Machine": SVC(
        kernel='linear', C=1.0, probability=True, random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, max_depth=20, min_samples_split=5,
        n_jobs=-1, random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
    ),
    "K-Nearest Neighbors": KNeighborsClassifier(
        n_neighbors=7, weights='distance', n_jobs=-1
    ),
}

In [None]:
# Per-model bookkeeping, every dict keyed by the display name from `models`.
results = {}            # name -> {'accuracy', 'precision', 'recall', 'f1'}
training_times = {}     # name -> seconds spent in fit()
prediction_times = {}   # name -> seconds spent in predict()
trained_models = {}     # name -> fitted estimator

print(f"\n Training {len(models)} ML models...\n")

for name, model in models.items():
    print(f" Training {name}...")

    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed intervals; time.time() is wall-clock time and can
    # jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    model.fit(X_train, train_labels)
    train_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - start_time

    # Precision/recall/F1 are averaged across classes weighted by support;
    # the fourth return value (support) is not needed here.
    accuracy = accuracy_score(test_labels, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        test_labels, y_pred, average='weighted'
    )

    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    # Keep the fitted estimator so it can be reused after the loop.
    trained_models[name] = model

    training_times[name] = train_time
    prediction_times[name] = pred_time

    print(f"     ✅ Accuracy: {accuracy:.4f}")
    print(f"     📊 F1-Score: {f1:.4f}")
    print(f"     ⏱️  Train Time: {train_time:.2f}s")
    print(f"     🚀 Predict Time: {pred_time:.2f}s")
    print("-" * 50)


 Training 6 ML models...

 Training Logistic Regression...
     ✅ Accuracy: 0.8870
     📊 F1-Score: 0.8864
     ⏱️  Train Time: 0.54s
     🚀 Predict Time: 0.00s
--------------------------------------------------
 Training Multinomial Naive Bayes...
     ✅ Accuracy: 0.8910
     📊 F1-Score: 0.8905
     ⏱️  Train Time: 0.01s
     🚀 Predict Time: 0.00s
--------------------------------------------------
 Training Support Vector Machine...
     ✅ Accuracy: 0.8930
     📊 F1-Score: 0.8925
     ⏱️  Train Time: 328.06s
     🚀 Predict Time: 3.11s
--------------------------------------------------
 Training Random Forest...
     ✅ Accuracy: 0.7990
     📊 F1-Score: 0.7953
     ⏱️  Train Time: 4.14s
     🚀 Predict Time: 0.07s
--------------------------------------------------
 Training Gradient Boosting...
     ✅ Accuracy: 0.8500
     📊 F1-Score: 0.8494
     ⏱️  Train Time: 199.47s
     🚀 Predict Time: 0.01s
--------------------------------------------------
 Training K-Nearest Neighbors...
     ✅ 

In [None]:
import os
import joblib

# Single definition of the output folder (presumably a mounted Google Drive
# in Colab, judging by the path -- confirm for other environments).
MODEL_DIR = '/content/drive/MyDrive/ml_models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Display name -> file stem: spaces become underscores and parentheses are
# dropped; every other character (hyphens included) is kept as-is.
_FILENAME_TABLE = str.maketrans({' ': '_', '(': None, ')': None})

for name, model in trained_models.items():
    filename = name.lower().translate(_FILENAME_TABLE)
    filepath = f'{MODEL_DIR}/{filename}.pkl'

    # Persist the fitted estimator.
    joblib.dump(model, filepath)
    print(f"     ✅ Saved: {name} → {filepath}")

# The fitted TF-IDF vectorizer is saved alongside the models: new text has to
# be transformed with it before any of the saved models can predict.
joblib.dump(tfidf, f'{MODEL_DIR}/tfidf_vectorizer.pkl')

     ✅ Saved: Logistic Regression → /content/drive/MyDrive/ml_models/logistic_regression.pkl
     ✅ Saved: Multinomial Naive Bayes → /content/drive/MyDrive/ml_models/multinomial_naive_bayes.pkl
     ✅ Saved: Support Vector Machine → /content/drive/MyDrive/ml_models/support_vector_machine.pkl
     ✅ Saved: Random Forest → /content/drive/MyDrive/ml_models/random_forest.pkl
     ✅ Saved: Gradient Boosting → /content/drive/MyDrive/ml_models/gradient_boosting.pkl
     ✅ Saved: K-Nearest Neighbors → /content/drive/MyDrive/ml_models/k-nearest_neighbors.pkl


['/content/drive/MyDrive/ml_models/tfidf_vectorizer.pkl']