Install required **libraries**

In [2]:
# Install required Python libraries
# datasets → to download AG News from HuggingFace
# nltk → for stopwords
# scikit-learn → ML models, TF-IDF, evaluation
# joblib → save/load trained models

!pip install datasets pandas scikit-learn nltk joblib




Import **dependencies**

In [3]:
# Import all libraries required for the project

import pandas as pd
import numpy as np
import re
from datasets import load_dataset                       # HuggingFace datasets
from sklearn.model_selection import train_test_split     # Split data
from sklearn.feature_extraction.text import TfidfVectorizer  # Text → TF-IDF
from sklearn.naive_bayes import MultinomialNB            # ML Model
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import joblib                                            # Save/Load model

# Fetch the NLTK stopword corpus into the local nltk_data directory
nltk.download("stopwords")

# English stopwords, unpacked into a set so the membership test
# performed for every token in clean_text stays fast
STOP = {*stopwords.words("english")}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Load AG News **dataset**

In [4]:
# Load the AG News dataset using HuggingFace `datasets`
# Each split provides a `text` column and an integer `label` column (0–3)

dataset = load_dataset("ag_news")

# Integer label id → human-readable category name
label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}


def _split_to_frame(split):
    """Convert one dataset split into a DataFrame ready for modelling.

    The `text` column is renamed to `article`, and a `category` column is
    derived from `label` through `label_map`.

    Raises
    ------
    ValueError
        If any label id has no entry in `label_map` (it would otherwise
        turn into a silent NaN category).
    """
    frame = pd.DataFrame(split).rename(columns={"text": "article"})
    frame["category"] = frame["label"].map(label_map)
    if frame["category"].isna().any():
        unknown = sorted(frame.loc[frame["category"].isna(), "label"].unique())
        raise ValueError(f"Labels missing from label_map: {unknown}")
    return frame


# Same conversion for both splits, so train and test cannot drift apart
train_df = _split_to_frame(dataset["train"])
test_df = _split_to_frame(dataset["test"])

# Display first few rows
train_df.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Unnamed: 0,article,label,category
0,Wall St. Bears Claw Back Into the Black (Reute...,2,Business
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2,Business
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2,Business
3,Iraq Halts Oil Exports from Main Southern Pipe...,2,Business
4,"Oil prices soar to all-time record, posing new...",2,Business


Function to clean **text**

In [5]:
# Function to preprocess and clean the article text
# Steps:
# 1. Convert to lowercase
# 2. Remove URLs
# 3. Remove special characters
# 4. Remove stopwords (the, is, an...)
# 5. Return cleaned text

def clean_text(text, stop_words=None):
    """Normalize raw article text into a space-separated string of tokens.

    Processing order: lowercase → drop `http...` URLs → replace every
    character outside `a-z`, `0-9` and whitespace with a space → split on
    whitespace → drop stopwords → rejoin with single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. `None` and float NaN (pandas' missing-value marker) are
        treated as an empty document; other non-string values are converted
        with `str()`.
    stop_words : collection of str, optional
        Tokens to remove. Defaults to the module-level `STOP` set.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives cleaning.
    """
    # Missing values would otherwise fail on `.lower()`
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = STOP

    text = str(text).lower()                                 # Lowercase
    text = re.sub(r"http\S+", "", text)                      # Remove URLs
    # Non-ASCII letters are also replaced here, since only a-z/0-9 are kept
    text = re.sub(r"[^a-z0-9\s]", " ", text)                 # Remove special chars
    return " ".join(tok for tok in text.split() if tok not in stop_words)


Apply text **cleaning**

In [6]:
# Run clean_text over every article in both splits and keep the result
# in a new `clean` column; the model is trained on this column
train_df["clean"], test_df["clean"] = (
    frame["article"].apply(clean_text) for frame in (train_df, test_df)
)

# Features (X) are the cleaned text, targets (y) are the category names
X_train, y_train = train_df["clean"], train_df["category"]
X_test, y_test = test_df["clean"], test_df["category"]


Convert text to TF-IDF **vectors**

In [7]:
# Turn the cleaned text into numeric TF-IDF features for the classifier
# ngram_range=(1, 2) → vocabulary terms are single words and word pairs
# max_features=25000 → keep at most 25,000 of those terms (unigrams + bigrams combined)

tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=25000)

# The vocabulary is learned from the training text only;
# the test text is projected onto that same vocabulary
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


Train a traditional ML model (**Naive Bayes**)

In [8]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF training features
# alpha=0.1 is the additive smoothing parameter passed to the estimator

model = MultinomialNB(alpha=0.1)
model.fit(X=X_train_tfidf, y=y_train)


Evaluate the model

In [9]:
# Category predictions for the held-out test split
pred = model.predict(X_test_tfidf)

# Overall accuracy, followed by per-category precision / recall / F1
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred))
print(classification_report(y_true=y_test, y_pred=pred))


Accuracy: 0.9038157894736842
              precision    recall  f1-score   support

    Business       0.88      0.85      0.86      1900
    Sci/Tech       0.87      0.89      0.88      1900
      Sports       0.95      0.98      0.97      1900
       World       0.92      0.89      0.91      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



Save **model** and **vectorizer**

In [10]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer to disk
# These will be used in Flask/FastAPI backend for prediction
# Note: clean_text is not stored in either file — the vectorizer was fitted on
# cleaned text, so the backend has to apply the same cleaning before transforming

joblib.dump(value=model, filename="news_classifier.joblib")
joblib.dump(value=tfidf, filename="tfidf_vectorizer.joblib")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!
