In [None]:
## ***************************** Classify News Headlines into Categories (Text) ********************************

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression #use  for classification
from sklearn.metrics import accuracy_score, classification_report #To evaluate the model's performance
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# ---- 1. Load the AG News dataset ----
ds = load_dataset("wangrongsheng/ag_news")

# ---- 2. Select the predefined train / test splits ----
# The loaded dataset is indexed by split name, so no extra splitting is done here.
train_data, test_data = ds['train'], ds['test']

# Every record exposes 'text' (the news item) and 'label' (its category:
# World, Sports, Business, Science/Technology).
X_train, y_train = train_data['text'], train_data['label']
X_test, y_test = test_data['text'], test_data['label']

# ---- 3. Preprocess the data ----
# Turn the raw text into a TF-IDF matrix with English stop words removed.
# The vectorizer is fitted on the training texts only; the test texts are
# then transformed with that same fitted vectorizer so both share one
# feature space.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ---- 4. Fit several classifiers on the same features and compare them ----
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

# Maps model name -> accuracy on the test split, filled in as each model finishes
accuracy_results = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name} model...")

    # Train on the TF-IDF training matrix, then predict the test matrix;
    # fit() returns the fitted estimator, so the two calls can be chained.
    y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)

    # Score the predictions and remember the result for the final ranking
    accuracy = accuracy_results[model_name] = accuracy_score(y_test, y_pred)

    # Report overall accuracy followed by the per-category breakdown
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"{model_name} Model Result:")
    print(
        classification_report(
            y_test,
            y_pred,
            target_names=["World", "Sports", "Business", "Science/Technology"],
        )
    )

# ---- 5. Rank the models by accuracy ----
# Sort the model names by their stored accuracy, best first, and print one
# line per model.
print("\nModel Accuracy Comparison:")
for model_name in sorted(accuracy_results, key=accuracy_results.get, reverse=True):
    accuracy = accuracy_results[model_name]
    print(f"{model_name}: {accuracy * 100:.2f}%")


Training Logistic Regression model...
Accuracy: 91.49%
Logistic Regression Model Result:
                    precision    recall  f1-score   support

             World       0.93      0.90      0.92      1900
            Sports       0.96      0.98      0.97      1900
          Business       0.89      0.88      0.88      1900
Science/Technology       0.89      0.90      0.89      1900

          accuracy                           0.91      7600
         macro avg       0.91      0.91      0.91      7600
      weighted avg       0.91      0.91      0.91      7600


Training Decision Tree model...
