In [60]:
import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, classification_report, precision_score, f1_score


import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\viewh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viewh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\viewh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# accesing News_Category_Dataset

file_path = 'C:\\Users\\viewh\\OneDrive\\Documents\\News_Category_Dataset_v3.json'
data = []

try:
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
except FileNotFoundError:
    print(f"File not found at the path: {file_path}")
except Exception as e:
    print(f"An error occurred: {str(e)}")

df = pd.DataFrame(data)
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


Exploratory Analysis

In [5]:
#remove empty cells
df = df[df['char_count'] != 0]

In [12]:
# count the number of categories
category_count = df['category'].nunique()
category_count

42

In [13]:
# Reduce categories to 16 for computational simplification
# Combine some features and remove others
df['category'] = df['category'].replace('COLLEGE', 'EDUCATION')
df['category'] = df['category'].replace('ARTS', 'ARTS & CULTURE')
df['category'] = df['category'].replace('CULTURE & ARTS', 'ARTS & CULTURE')
df['category'] = df['category'].replace('WELLNESS', 'HEALTHY LIVING')
df['category'] = df['category'].replace('PARENTS', 'PARENTING')
df['category'] = df['category'].replace('STYLE', 'STYLE & BEAUTY')
df['category'] = df['category'].replace('GREEN', 'ENVIRONMENT')
values_to_remove = ['WEIRD NEWS', 'FOOD & DRINK', 'U.S. NEWS', 'WORLD NEWS',  'MONEY', 'QUEER VOICES', 'BLACK VOICES', 'TECH', 'MEDIA', 'LATINO VOICES', 'COMEDY', 'IMPACT', 'TASTE', 'GOOD NEWS', 'THE WORLDPOST', 'WORLDPOST', 'WEDDINGS', 'DIVORCE', 'FIFTY']
df = df[~df['category'].isin(values_to_remove)]
df = df.reset_index(drop=True)

In [14]:
category_count = df['category'].nunique()
category_count

16

Text Preprocessing
Tokenize headline, remove stop words, punctuation and capitalization

In [16]:
# Assign 'headline' as the input feature and 'category' as the target
X = df['headline']
y = df['category']

# Split with the data for training and testing desired test_size
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_indices = []
test_indices = []

# Split the data while maintaining class distribution
for train_index, test_index in sss.split(X, y):
    train_indices.append(train_index)
    test_indices.append(test_index)

# Use the indices to create the training and testing sets
X_train = X[train_indices[0]]
X_test = X[test_indices[0]]
y_train = y[train_indices[0]]
y_test = y[test_indices[0]]

# Tokenization and Text Normalization
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

# Apply Tokenization to each row in X_train and store in X_train_preprocessed
num_samples = X_train.shape[0]
X_train_preprocessed = []

for i in range(num_samples):
    row = X_train.iloc[i]
    preprocessed_text = preprocess_text(row)
    X_train_preprocessed.append(preprocessed_text)

# Apply Tokenization to each row in X_test and store in X_test_preprocessed
num_samples = X_test.shape[0]
X_test_preprocessed = []

for i in range(num_samples):
    row = X_test.iloc[i]
    preprocessed_text = preprocess_text(row)
    X_test_preprocessed.append(preprocessed_text)

# Encode the target labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Vectorize the preprocessed text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english') 
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed)
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed)


In [78]:
#MULTINOMIAL 

# Define a grid of hyperparameters for MultinomialNB
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 10.0, 100.0], 
}
clf = MultinomialNB()

# GridSearchCV cross-validation and scoring 
grid_search_nb = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', refit=True)

nbstart = time.time()
grid_search_nb.fit(X_train_tfidf, y_train_encoded)
nbstop = time.time()

# Get the best model and its hyperparameters
best_clf = grid_search_nb.best_estimator_
nb_best_alpha = best_clf.alpha

# Predict categories for the training set
y_train_pred = best_clf.predict(X_train_tfidf)

# Predict categories for the test set
y_test_pred = best_clf.predict(X_test_tfidf)

# Train set results
nb_precision_train = precision_score(y_train_encoded, y_train_pred, average='weighted')
nb_recall_train = recall_score(y_train_encoded, y_train_pred, average='weighted')
nb_f1_train = f1_score(y_train_encoded, y_train_pred, average='weighted')
nb_train_accuracy = accuracy_score(y_train_encoded, y_train_pred)

# Test set results
nb_precision_test = precision_score(y_test_encoded, y_test_pred, average='weighted')
nb_recall_test = recall_score(y_test_encoded, y_test_pred, average='weighted')
nb_f1_test = f1_score(y_test_encoded, y_test_pred, average='weighted')
nb_test_accuracy = accuracy_score(y_test_encoded, y_test_pred)

class_names = label_encoder.classes_
nb_time = nbstop - nbstart

# Results
results_data = {
    'Metric': ['Best Alpha', 'Train Precision', 'Train Recall', 'Train F1-Score', 'Train Accuracy',
               'Test Precision', 'Test Recall', 'Test F1-Score', 'Test Accuracy', 'Grid Search Time'],
    'NB Results': [nb_best_alpha, nb_precision_train, nb_recall_train, nb_f1_train, nb_train_accuracy,
              nb_precision_test, nb_recall_test, nb_f1_test, nb_test_accuracy, nb_time]
}

results_df = pd.DataFrame(results_data)


# Calculate classification report metrics for train and test data
train_classification_report = classification_report(y_train_encoded, y_train_pred, target_names=class_names, output_dict=True)
test_classification_report = classification_report(y_test_encoded, y_test_pred, target_names=class_names, output_dict=True)

train_classification_df = pd.DataFrame(train_classification_report).transpose()
test_classification_df = pd.DataFrame(test_classification_report).transpose()

train_classification_df.rename(columns={'precision': 'Train Precision', 'recall': 'Train Recall', 'f1-score': 'Train F1-Score', 'support': 'Train Support'}, inplace=True)
test_classification_df.rename(columns={'precision': 'Test Precision', 'recall': 'Test Recall', 'f1-score': 'Test F1-Score', 'support': 'Test Support'}, inplace=True)

# Print Results

print("\nBest alpha:", nb_best_alpha)

print(results_df)

print("Classification Report by Category (Train Data) for the Best Hyperparameter:")
print(train_classification_df)

print("\nClassification Report by Category (Test Data) for the Best Hyperparameter:")
print(test_classification_df)



Best alpha: 0.1
             Metric  NB Results
0        Best Alpha    0.100000
1   Train Precision    0.826818
2      Train Recall    0.812990
3    Train F1-Score    0.805137
4    Train Accuracy    0.812990
5    Test Precision    0.697963
6       Test Recall    0.690125
7     Test F1-Score    0.667307
8     Test Accuracy    0.690125
9  Grid Search Time    2.162459
Classification Report by Category (Train Data) for the Best Hyperparameter:
                Train Precision  Train Recall  Train F1-Score  Train Support
ARTS & CULTURE         0.951368      0.598661        0.734886     3137.00000
BUSINESS               0.893784      0.546005        0.677891     4793.00000
CRIME                  0.891880      0.732281        0.804239     2850.00000
EDUCATION              0.935821      0.363268        0.523372     1726.00000
ENTERTAINMENT          0.816788      0.866657        0.840984    13889.00000
ENVIRONMENT            0.880489      0.597910        0.712193     3253.00000
HEALTHY LIVING  

In [79]:
#Logistic_Regression

# Define a grid of hyperparameters for Logistic Regression
param_grid = {'penalty': ['l1', 'l2', 'none'],
              'fit_intercept': [True, False]}

clf = LogisticRegression()

# GridSearchCV cross-validation and scoring
grid_search_lr = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', refit=True)

lrstart = time.time()
grid_search_lr.fit(X_train_tfidf, y_train_encoded)
lrstop = time.time()

# Get the best model and its hyperparameters
lr_best_clf = grid_search_lr.best_estimator_

# Predict categories for the training set
y_train_pred = lr_best_clf.predict(X_train_tfidf)

# Predict categories for the test set
y_test_pred = lr_best_clf.predict(X_test_tfidf)

# Train set metrics results
lr_precision_train = precision_score(y_train_encoded, y_train_pred, average='weighted')
lr_recall_train = recall_score(y_train_encoded, y_train_pred, average='weighted')
lr_f1_train = f1_score(y_train_encoded, y_train_pred, average='weighted')
lr_train_accuracy = accuracy_score(y_train_encoded, y_train_pred)

# Test set metrics results
lr_precision_test = precision_score(y_test_encoded, y_test_pred, average='weighted')
lr_recall_test = recall_score(y_test_encoded, y_test_pred, average='weighted')
lr_f1_test = f1_score(y_test_encoded, y_test_pred, average='weighted')
lr_test_accuracy = accuracy_score(y_test_encoded, y_test_pred)

# Print metrics by category 
lr_time = lrstop - lrstart
lr_best_para = grid_search_lr.best_params_


# Results
results_data = {
    'Metric': ['Best Parameters', 'Train Precision', 'Train Recall', 'Train F1-Score', 'Train Accuracy',
               'Test Precision', 'Test Recall', 'Test F1-Score', 'Test Accuracy', 'Grid Search Time'],
    'lr Results': [lr_best_para, lr_precision_train, lr_recall_train, lr_f1_train, lr_train_accuracy,
              lr_precision_test, lr_recall_test, lr_f1_test, lr_test_accuracy, lr_time]
}

results_df = pd.DataFrame(results_data)

# Calculate classification report metrics for train and test data
class_names = label_encoder.classes_
train_classification_report = classification_report(y_train_encoded, y_train_pred, target_names=class_names, output_dict=True)
test_classification_report = classification_report(y_test_encoded, y_test_pred, target_names=class_names, output_dict=True)

train_classification_df = pd.DataFrame(train_classification_report).transpose()
test_classification_df = pd.DataFrame(test_classification_report).transpose()

train_classification_df.rename(columns={'precision': 'Train Precision', 'recall': 'Train Recall', 'f1-score': 'Train F1-Score', 'support': 'Train Support'}, inplace=True)
test_classification_df.rename(columns={'precision': 'Test Precision', 'recall': 'Test Recall', 'f1-score': 'Test F1-Score', 'support': 'Test Support'}, inplace=True)

# Print Results

print("\nBest parameter:", lr_best_para)

print(results_df)

print("Classification Report by Category (Train Data) for the Best Hyperparameter:")
print(train_classification_df)

print("\nClassification Report by Category (Test Data) for the Best Hyperparameter:")
print(test_classification_df)


Best parameter: {'fit_intercept': False, 'penalty': 'l2'}
             Metric                                 lr Results
0   Best Parameters  {'fit_intercept': False, 'penalty': 'l2'}
1   Train Precision                                   0.804844
2      Train Recall                                    0.80663
3    Train F1-Score                                   0.802106
4    Train Accuracy                                    0.80663
5    Test Precision                                   0.724608
6       Test Recall                                   0.730887
7     Test F1-Score                                   0.723323
8     Test Accuracy                                   0.730887
9  Grid Search Time                                 261.468177
Classification Report by Category (Train Data) for the Best Hyperparameter:
                Train Precision  Train Recall  Train F1-Score  Train Support
ARTS & CULTURE         0.746430      0.549888        0.633260     3137.00000
BUSINESS          

In [80]:
# Decision Tree

# Define a grid of hyperparameters for DecisionTreeClassifier
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 1, 2, 3, 4],
    'min_samples_split': [1, 2, 3, 4]
}

clf = DecisionTreeClassifier()

# GridSearchCV cross-validation and scoring
grid_search_dt = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', refit=True)

dtstart = time.time()
grid_search_dt.fit(X_train_tfidf, y_train_encoded)
dtstop = time.time()

# Get the best model 
dt_best_clf = grid_search_dt.best_estimator_

# Predict categories for the training set
y_train_pred = dt_best_clf.predict(X_train_tfidf)

# Predict categories for the test set
y_test_pred = dt_best_clf.predict(X_test_tfidf)

# Train set metrics results
dt_train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
dt_precision_train = precision_score(y_train_encoded, y_train_pred, average='weighted')
dt_recall_train = recall_score(y_train_encoded, y_train_pred, average='weighted')
dt_f1_train = f1_score(y_train_encoded, y_train_pred, average='weighted')

# Test set metrics results
dt_test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
dt_precision_test = precision_score(y_test_encoded, y_test_pred, average='weighted')
dt_recall_test = recall_score(y_test_encoded, y_test_pred, average='weighted')
dt_f1_test = f1_score(y_test_encoded, y_test_pred, average='weighted')

# Print metrics by category within y_test

class_names = label_encoder.classes_
dt_time = dtstop - dtstart
dt_best_para = grid_search_dt.best_params_


# Results
results_data = {
    'Metric': ['Best Parameters', 'Train Precision', 'Train Recall', 'Train F1-Score', 'Train Accuracy',
               'Test Precision', 'Test Recall', 'Test F1-Score', 'Test Accuracy', 'Grid Search Time'],
    'dt Results': [dt_best_para, dt_precision_train, dt_recall_train, dt_f1_train, dt_train_accuracy,
              dt_precision_test, dt_recall_test, dt_f1_test, dt_test_accuracy, dt_time]
}

results_df = pd.DataFrame(results_data)


# Calculate classification report metrics for train and test data
train_classification_report = classification_report(y_train_encoded, y_train_pred, target_names=class_names, output_dict=True)
test_classification_report = classification_report(y_test_encoded, y_test_pred, target_names=class_names, output_dict=True)

train_classification_df = pd.DataFrame(train_classification_report).transpose()
test_classification_df = pd.DataFrame(test_classification_report).transpose()

train_classification_df.rename(columns={'precision': 'Train Precision', 'recall': 'Train Recall', 'f1-score': 'Train F1-Score', 'support': 'Train Support'}, inplace=True)
test_classification_df.rename(columns={'precision': 'Test Precision', 'recall': 'Test Recall', 'f1-score': 'Test F1-Score', 'support': 'Test Support'}, inplace=True)

# Print Results

print("\nBest parameter:", dt_best_para)

print(results_df)

print("Classification Report by Category (Train Data) for the Best Hyperparameter:")
print(train_classification_df)

print("\nClassification Report by Category (Test Data) for the Best Hyperparameter:")
print(test_classification_df)


Best parameter: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 3}
             Metric                                         dt Results
0   Best Parameters  {'criterion': 'gini', 'max_depth': None, 'min_...
1   Train Precision                                           0.976957
2      Train Recall                                           0.976288
3    Train F1-Score                                           0.976299
4    Train Accuracy                                           0.976288
5    Test Precision                                           0.600816
6       Test Recall                                           0.606396
7     Test F1-Score                                           0.601058
8     Test Accuracy                                           0.606396
9  Grid Search Time                                        4651.786371
Classification Report by Category (Train Data) for the Best Hyperparameter:
                Train Precision  Train Recall  Train F1-Score

In [81]:
# Ensemble Classifiers
lr_clf = LogisticRegression(fit_intercept=False, penalty='l2')
nb_clf = MultinomialNB(alpha=0.1)  

# Create the ensemble using VotingClassifier
ensemble_clf = VotingClassifier(estimators=[
    ('multinomial_nb', nb_clf),
    ('logistic_lr', lr_clf),
], voting='hard')  

enstart = time.time()
ensemble_clf.fit(X_train_tfidf, y_train_encoded)
enstop = time.time()

# Predict categories for the training set
y_train_pred = ensemble_clf.predict(X_train_tfidf)

# Predict categories for the test set
y_test_pred = ensemble_clf.predict(X_test_tfidf)

# Train set metrics results
en_precision_train = precision_score(y_train_encoded, y_train_pred, average='weighted')
en_recall_train = recall_score(y_train_encoded, y_train_pred, average='weighted')
en_f1_train = f1_score(y_train_encoded, y_train_pred, average='weighted')
en_train_accuracy = accuracy_score(y_train_encoded, y_train_pred)

# Test set metrics results
en_precision_test = precision_score(y_test_encoded, y_test_pred, average='weighted')
en_recall_test = recall_score(y_test_encoded, y_test_pred, average='weighted')
en_f1_test = f1_score(y_test_encoded, y_test_pred, average='weighted')
en_test_accuracy = accuracy_score(y_test_encoded, y_test_pred)

# Print metrics by category within y_test
class_names = label_encoder.classes_
en_time = enstop - enstart
en_best_para = 'N/A'


# Results
results_data = {
    'Metric': ['Best Parameters', 'Train Precision', 'Train Recall', 'Train F1-Score', 'Train Accuracy',
               'Test Precision', 'Test Recall', 'Test F1-Score', 'Test Accuracy', 'Grid Search Time'],
    'En Results': [en_best_para, en_precision_train, en_recall_train, en_f1_train, en_train_accuracy,
              en_precision_test, en_recall_test, en_f1_test, en_test_accuracy, en_time]
}

results_df = pd.DataFrame(results_data)


# Calculate classification report metrics for train and test data
train_classification_report = classification_report(y_train_encoded, y_train_pred, target_names=class_names, output_dict=True)
test_classification_report = classification_report(y_test_encoded, y_test_pred, target_names=class_names, output_dict=True)

train_classification_df = pd.DataFrame(train_classification_report).transpose()
test_classification_df = pd.DataFrame(test_classification_report).transpose()

train_classification_df.rename(columns={'precision': 'Train Precision', 'recall': 'Train Recall', 'f1-score': 'Train F1-Score', 'support': 'Train Support'}, inplace=True)
test_classification_df.rename(columns={'precision': 'Test Precision', 'recall': 'Test Recall', 'f1-score': 'Test F1-Score', 'support': 'Test Support'}, inplace=True)

# Print Results

print(results_df)

print("Classification Report by Category (Train Data) for the Best Hyperparameter:")
print(train_classification_df)

print("\nClassification Report by Category (Test Data) for the Best Hyperparameter:")
print(test_classification_df)

             Metric En Results
0   Best Parameters        N/A
1   Train Precision   0.821128
2      Train Recall   0.811605
3    Train F1-Score    0.80523
4    Train Accuracy   0.811605
5    Test Precision   0.720697
6       Test Recall    0.71099
7     Test F1-Score   0.698021
8     Test Accuracy    0.71099
9  Grid Search Time   7.119377
Classification Report by Category (Train Data) for the Best Hyperparameter:
                Train Precision  Train Recall  Train F1-Score  Train Support
ARTS & CULTURE         0.780022      0.689512        0.731980    3137.000000
BUSINESS               0.769698      0.686835        0.725910    4793.000000
CRIME                  0.805906      0.785263        0.795451    2850.000000
EDUCATION              0.753846      0.567787        0.647720    1726.000000
ENTERTAINMENT          0.771048      0.895457        0.828609   13889.000000
ENVIRONMENT            0.771410      0.719951        0.744792    3253.000000
HEALTHY LIVING         0.722597      0.90102

In [83]:
results_data_all= {
    'Metric': ['Best Alpha', 'Train Precision', 'Train Recall', 'Train F1-Score', 'Train Accuracy',
               'Test Precision', 'Test Recall', 'Test F1-Score', 'Test Accuracy', 'Grid Search Time'],
    'NB Results': [nb_best_alpha, nb_precision_train, nb_recall_train, nb_f1_train, nb_train_accuracy,
              nb_precision_test, nb_recall_test, nb_f1_test, nb_test_accuracy, nb_time],
    'lr Results': [lr_best_para, lr_precision_train, lr_recall_train, lr_f1_train, lr_train_accuracy,
              lr_precision_test, lr_recall_test, lr_f1_test, lr_test_accuracy, lr_time],
    'dt Results': [dt_best_para, dt_precision_train, dt_recall_train, dt_f1_train, dt_train_accuracy,
              dt_precision_test, dt_recall_test, dt_f1_test, dt_test_accuracy, dt_time],
     'En Results': [en_best_para, en_precision_train, en_recall_train, en_f1_train, en_train_accuracy,
              en_precision_test, en_recall_test, en_f1_test, en_test_accuracy, en_time]
}

results_df_all = pd.DataFrame(results_data_all)
print(results_df_all)

             Metric  NB Results                                 lr Results  \
0        Best Alpha    0.100000  {'fit_intercept': False, 'penalty': 'l2'}   
1   Train Precision    0.826818                                   0.804844   
2      Train Recall    0.812990                                    0.80663   
3    Train F1-Score    0.805137                                   0.802106   
4    Train Accuracy    0.812990                                    0.80663   
5    Test Precision    0.697963                                   0.724608   
6       Test Recall    0.690125                                   0.730887   
7     Test F1-Score    0.667307                                   0.723323   
8     Test Accuracy    0.690125                                   0.730887   
9  Grid Search Time    2.162459                                 261.468177   

                                          dt Results En Results  
0  {'criterion': 'gini', 'max_depth': None, 'min_...        N/A  
1        

Conclusion: When considering the headline as the target variable (y), logistic regression emerges as the optimal model, achieving the highest level of accuracy.