# -----------------------------------------------------------------------------

# Importing Core Libraries

# -----------------------------------------------------------------------------

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------------------------------------------------------

# Data Collection & Exploration

# -----------------------------------------------------------------------------

In [None]:
# Load the dataset into a DataFrame. The file is the original
# News_Categories.csv, renamed to dataset.csv for simplicity.
df = pd.read_csv('dataset.csv')

In [None]:
# Preview the first 5 rows of the dataset
df.head()

In [None]:
# Preview the last 5 rows of the dataset
df.tail()

In [None]:
# Size of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Summary of the dataset's columns
df.info()

In [None]:
# Statistical summary of the dataset
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

# -----------------------------------------------------------------------------

# Data Pre-processing & Preparation

# -----------------------------------------------------------------------------

In [None]:
# Drop rows with missing values, but only in the columns the model uses.
# 'authors', 'link' and 'date' are removed later in this notebook, so a
# missing value in one of them is no reason to throw the whole article away.
df = df.dropna(subset=['category', 'headline', 'short_description'])
# Remaining missing values per column (the three columns above are now at 0)
df.isnull().sum()

In [None]:
# Keep only the first occurrence of every fully identical row
df = df[~df.duplicated()]
# Number of duplicated rows left after the clean-up
df.duplicated().sum()

In [None]:
# Drop unnecessary columns: authors, link, date
df = df.drop(['authors', 'link', 'date'], axis=1)

In [None]:
# Display the updated dataset
df

In [None]:
# Merge headline and short_description into one column called 'text'
df['text'] = df['headline'] + ' ' + df['short_description']

# Drop headline and short_description columns because they are no longer needed
df = df.drop(['headline', 'short_description'], axis=1)

In [None]:
# Display the updated dataset
df

In [None]:
# Display the unique categories
df['category'].unique()

In [None]:
# Display the number of unique categories
df['category'].nunique()

In [None]:
# List of culture and arts categories
culture_arts_categories = ['ARTS', 'CULTURE & ARTS', 'ARTS & CULTURE']

# List of news categories
news_categories = ['WEIRD NEWS', 'WORLD NEWS', 'GOOD NEWS']

# List of voices categories
voices_categories = ['LATINO VOICES', 'BLACK VOICES', 'QUEER VOICES']

# Bundle each group into a single category: 'CULTURE & ARTS', 'NEWS' and
# 'VOICES' respectively
df['category'] = (
    df['category']
    .replace(culture_arts_categories, 'CULTURE & ARTS')
    .replace(news_categories, 'NEWS')
    .replace(voices_categories, 'VOICES')
)

In [None]:
# Merge categories with the same or similar meaning.
# The replacements run in the order written.
df['category'] = (
    df['category']
    .replace('STYLE & BEAUTY', 'STYLE')
    .replace('PARENTING', 'PARENTS')
    .replace('COLLEGE', 'EDUCATION')
    .replace('TASTE', 'FOOD & DRINK')
    .replace('DIVORCE', 'WEDDINGS')
    .replace('MONEY', 'BUSINESS')
    .replace('HEALTHY LIVING', 'WELLNESS')
    # 'THE WORLDPOST' first becomes 'WORLDPOST', which the next step then
    # folds into 'NEWS' together with the original 'WORLDPOST' rows
    .replace('THE WORLDPOST', 'WORLDPOST')
    .replace('WORLDPOST', 'NEWS')
)

In [None]:
# Drop trash categories
trash_categories = ['GREEN', 'FIFTY']
df = df[~df['category'].isin(trash_categories)]

In [None]:
# Display the unique categories
df['category'].unique()

In [None]:
# Display the number of unique categories
df['category'].nunique()

In [None]:
# Display the updated dataset
df

In [None]:
# Plot the count of each category before Downsampling
plt.figure(figsize=(10, 5))
# Pass the column through the x= keyword: in current seaborn releases the
# first positional parameter of countplot is `data`, so a bare Series is not
# read as the variable to count.
sns.countplot(x=df['category'])
plt.title('Count of Each Category before Downsampling')
# Rotate the category names so the long labels on the x axis do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Downsampling
from sklearn.utils import resample

# Upper bound on the number of samples kept per category
max_samples_per_category = 10000

# Cap every category at the limit. The categories are discovered from the
# data instead of being listed by hand, so:
#   * any category above the limit is downsampled, not just a fixed few;
#   * a category at or below the limit is passed through untouched, because
#     resample(replace=False) raises ValueError when asked for more rows than
#     the group contains.
capped_groups = []
for _, group in df.groupby('category', sort=False):
    if len(group) > max_samples_per_category:
        group = resample(group, replace=False,
                         n_samples=max_samples_per_category, random_state=42)
    capped_groups.append(group)

# Combine the per-category frames back into a single dataframe
df = pd.concat(capped_groups)

In [None]:
# Plot the count of each category after downsampling
plt.figure(figsize=(10, 5))
# Pass the column through the x= keyword: in current seaborn releases the
# first positional parameter of countplot is `data`, so a bare Series is not
# read as the variable to count.
sns.countplot(x=df['category'])
plt.title('Count of Each Category After Downsampling')
# Rotate the category names so the long labels on the x axis do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Tally how many samples every category has
category_counts = df['category'].value_counts()

# Collect the categories that have fewer than 6000 samples
categories_to_drop = category_counts.loc[category_counts.lt(6000)].index

# Keep only the rows whose category is not in that collection
df = df.loc[~df['category'].isin(categories_to_drop)]

In [None]:
# Plot the count of each category after dropping categories with less than 6000 samples
plt.figure(figsize=(10, 5))
# Pass the column through the x= keyword: in current seaborn releases the
# first positional parameter of countplot is `data`, so a bare Series is not
# read as the variable to count.
sns.countplot(x=df['category'])
plt.title('Count of Each Category')
# Rotate the category names so the long labels on the x axis do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Reset the index after the row filtering above; drop=True discards the old
# index instead of keeping it as a column
df = df.reset_index(drop=True)

In [None]:
# Display the updated dataset
df

# -----------------------------------------------------------------------------

# NLP

# -----------------------------------------------------------------------------

# Language Detection

In [None]:
# Import the library
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes the detected languages (and the rows kept) reproducible.
DetectorFactory.seed = 0

# Function to detect the language
def detect_language(text):
    """Return the language code langdetect reports for `text`.

    Returns 'unknown' when `text` is not a string or when langdetect cannot
    decide on a language and raises LangDetectException. Only that exception
    is caught, so KeyboardInterrupt and genuine programming errors still
    propagate instead of being silently swallowed.
    """
    if not isinstance(text, str):
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'
    
# Tag every row with the detected language of its 'text'
df['language'] = df['text'].map(detect_language)

# Keep the English rows only, remove the helper 'language' column again and
# renumber the index from zero
df = (
    df.loc[df['language'] == 'en']
    .drop(columns='language')
    .reset_index(drop=True)
)

# Display the updated dataset
df

# Normalization 

In [None]:
# Import the library
import re
from unidecode import unidecode

# Function that normalizes the 'text' column
def clean_text(text):
    """Normalize one text string.

    Steps, in order: transliterate to ASCII with unidecode, lowercase, remove
    @mentions, remove '#' characters (the word after the '#' is kept), remove
    http(s) URLs, turn line breaks into spaces, remove punctuation and
    underscores, remove digits.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The normalized text.
    """

    # Replace all diacritical marks with their corresponding characters.
    # This runs before lowercasing because a transliteration can itself
    # contain upper-case letters, which would otherwise survive.
    text = unidecode(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove mentions
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)

    # Remove hashtags
    text = re.sub(r'#', '', text)

    # Remove URLs
    text = re.sub(r'https?:\/\/\S+', '', text)

    # Replace line breaks with a space. Deleting them outright would glue the
    # last word of one line to the first word of the next.
    text = re.sub(r'[\r\n]+', ' ', text)

    # Remove punctuation and underscores
    text = re.sub(r'[^\w\s]|_', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Return the cleaned text
    return text

# Normalize every entry of the 'text' column with clean_text
df['text'] = df['text'].map(clean_text)

# Display the updated dataset
df

# Stopwords Removal

In [None]:
# Import the library
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# English stopword list as a set
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    """Return `text` without the whitespace-separated words found in stop_words.

    The remaining words are joined back together with single spaces.
    """
    kept_words = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_words)

# Strip the stopwords from every entry of the 'text' column
df['text'] = df['text'].map(remove_stopwords)

# Display the updated dataset
df

# Lemmatization

In [None]:
# Import the library
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# Function to lemmatize the text
def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of `text`.

    Each word is passed to WordNetLemmatizer.lemmatize with its default
    arguments; the results are joined back together with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, text.split()))

# Lemmatize every entry of the 'text' column
df['text'] = df['text'].map(lemmatize_text)

# Display the updated dataset
df

# -----------------------------------------------------------------------------

# Splitting Data

# -----------------------------------------------------------------------------

In [None]:
# Split the data into training, test sets
from sklearn.model_selection import train_test_split

# Split the data into X and Y
X = df['text'] # Feature
Y = df['category'] # Target

# Hold out 25% of the rows as the test set. stratify=Y keeps the share of each
# category the same in the training and test sets, so the per-category scores
# are not skewed by an unlucky random split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y)

# -----------------------------------------------------------------------------

# Vectorization

# -----------------------------------------------------------------------------

# BoW

In [None]:
# BoW Vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings
cv = CountVectorizer()

# Fit on the training text only, then encode the test text with the fitted
# vectorizer (the tuple is evaluated left to right)
X_train_bow, X_test_bow = cv.fit_transform(X_train), cv.transform(X_test)

# TF-IDF

In [None]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings
tv = TfidfVectorizer()

# Fit on the training text only, then encode the test text with the fitted
# vectorizer (the tuple is evaluated left to right)
X_train_tv, X_test_tv = tv.fit_transform(X_train), tv.transform(X_test)

# Hashing Vectorization

In [None]:
# Hashing Vectorization

from sklearn.feature_extraction.text import HashingVectorizer
# Hashing vectorizer with 2**20 output features and alternate_sign disabled
hv = HashingVectorizer(n_features=2**20, alternate_sign=False)

# Encode the training text first, then the test text (the tuple is evaluated
# left to right)
X_train_hv, X_test_hv = hv.fit_transform(X_train), hv.transform(X_test)

# -----------------------------------------------------------------------------

# Models

# -----------------------------------------------------------------------------

# Naive Bayes

In [None]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# One Multinomial Naive Bayes model per vectorization. fit() returns the
# estimator itself, so creation and training are chained.
nb_bow = MultinomialNB().fit(X_train_bow, Y_train)
nb_tv = MultinomialNB().fit(X_train_tv, Y_train)
nb_hv = MultinomialNB().fit(X_train_hv, Y_train)

# Test-set predictions of each model on its matching representation
Y_pred_nb_bow = nb_bow.predict(X_test_bow)
Y_pred_nb_tv = nb_tv.predict(X_test_tv)
Y_pred_nb_hv = nb_hv.predict(X_test_hv)

# Logistic Regression

In [None]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

# One Logistic Regression model (max_iter=1000) per vectorization. fit()
# returns the estimator itself, so creation and training are chained.
lr_bow = LogisticRegression(max_iter=1000).fit(X_train_bow, Y_train)
lr_tv = LogisticRegression(max_iter=1000).fit(X_train_tv, Y_train)
lr_hv = LogisticRegression(max_iter=1000).fit(X_train_hv, Y_train)

# Test-set predictions of each model on its matching representation
Y_pred_lr_bow = lr_bow.predict(X_test_bow)
Y_pred_lr_tv = lr_tv.predict(X_test_tv)
Y_pred_lr_hv = lr_hv.predict(X_test_hv)

# SVM

In [None]:
# SVM Classifier
from sklearn.svm import LinearSVC

# Initialize the Linear SVM Classifier.
# random_state is fixed, as it is for the other randomised steps of this
# notebook, so that reruns give the same model and the accuracies stay
# comparable (LinearSVC's solver can use a pseudo random generator).
svm_bow = LinearSVC(random_state=42)
svm_tv = LinearSVC(random_state=42)
svm_hv = LinearSVC(random_state=42)

# Train the models, each on its own vectorization of the training text
svm_bow.fit(X_train_bow, Y_train)
svm_tv.fit(X_train_tv, Y_train)
svm_hv.fit(X_train_hv, Y_train)

# Predictions using SVM Classifier, on the matching test representation
Y_pred_svm_bow = svm_bow.predict(X_test_bow)
Y_pred_svm_tv = svm_tv.predict(X_test_tv)
Y_pred_svm_hv = svm_hv.predict(X_test_hv)

# -----------------------------------------------------------------------------

# Model Evaluation

# -----------------------------------------------------------------------------

In [None]:
# Functions to evaluate the models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

def report(Y_test, Y_pred):
    """Print the classification report and the accuracy of `Y_pred`.

    Y_test holds the true labels, Y_pred the predicted labels.
    """
    summary = classification_report(Y_test, Y_pred)
    print('\nClassification Report:\n', summary)
    print('Accuracy:', accuracy_score(Y_test, Y_pred))

def validation(model, X, Y):
    """Print the mean accuracy of a 5-fold cross validation of `model` on (X, Y).

    NOTE(review): the callers in this file pass features that were already
    vectorized on the whole training set, so each validation fold has been
    seen by the vectorizer - confirm that this is acceptable.
    """
    fold_scores = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    print('Cross Validation Accuracy:', mean_accuracy)

def plot(Y_test, Y_pred):
    """Show the confusion matrix of the predictions as an annotated heatmap.

    Y_test holds the true labels (rows, 'Actual'), Y_pred the predicted
    labels (columns, 'Predicted').
    """
    # The tick labels must be in the same order as the rows and columns of the
    # matrix. Build one sorted label list from the labels that actually occur
    # and hand it to confusion_matrix explicitly, instead of labelling the
    # axes with categories in their order of appearance in the dataframe.
    labels = sorted(set(Y_test) | set(Y_pred))
    cm = confusion_matrix(Y_test, Y_pred, labels=labels)
    plt.figure(figsize=(10, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

def evaluation(model, X_train, Y_train, Y_test, Y_pred, vectorizer_name):
    """Run the full evaluation of one model and print/plot the results.

    In order: a header naming the vectorizer, the test-set report for
    (Y_test, Y_pred), the cross validation of `model` on (X_train, Y_train),
    the confusion matrix plot, and a separator line of 100 dashes.
    """
    print(f'\nModel using {vectorizer_name}:')
    report(Y_test, Y_pred)
    validation(model, X_train, Y_train)
    plot(Y_test, Y_pred)
    separator = "-" * 100
    print(separator)

In [None]:
# Evaluate the three Naive Bayes models, one per vectorization
print('NAIVE BAYES')
evaluation(model=nb_bow, X_train=X_train_bow, Y_train=Y_train,
           Y_test=Y_test, Y_pred=Y_pred_nb_bow, vectorizer_name='BoW')
evaluation(model=nb_tv, X_train=X_train_tv, Y_train=Y_train,
           Y_test=Y_test, Y_pred=Y_pred_nb_tv, vectorizer_name='TF-IDF')
evaluation(model=nb_hv, X_train=X_train_hv, Y_train=Y_train,
           Y_test=Y_test, Y_pred=Y_pred_nb_hv,
           vectorizer_name='Hashing Vectorizer')

In [None]:
print('LOGISTIC REGRESSION')
evaluation(lr_bow, X_train_bow, Y_train, Y_test, Y_pred_lr_bow, 'BoW')
evaluation(lr_tv, X_train_tv, Y_train, Y_test, Y_pred_lr_tv, 'TF-IDF')
evaluation(lr_hv, X_train_hv, Y_train, Y_test, Y_pred_lr_hv, 'Hashing Vectorizer')

In [None]:
print('SVM')
evaluation(svm_bow, X_train_bow, Y_train, Y_test, Y_pred_svm_bow, 'BoW')
evaluation(svm_tv, X_train_tv, Y_train, Y_test, Y_pred_svm_tv, 'TF-IDF')
evaluation(svm_hv, X_train_hv, Y_train, Y_test, Y_pred_svm_hv, 'Hashing Vectorizer')

# -----------------------------------------------------------------------------

# Results 

# -----------------------------------------------------------------------------

In [None]:
# Test-set accuracy of every model / vectorization pair, keyed by its label
vectorization_results = {
    label: accuracy_score(Y_test, predictions)
    for label, predictions in (
        ('Naive Bayes BoW', Y_pred_nb_bow),
        ('Naive Bayes TF-IDF', Y_pred_nb_tv),
        ('Naive Bayes Hashing Vectorizer', Y_pred_nb_hv),
        ('Logistic Regression BoW', Y_pred_lr_bow),
        ('Logistic Regression TF-IDF', Y_pred_lr_tv),
        ('Logistic Regression Hashing Vectorizer', Y_pred_lr_hv),
        ('SVM BoW', Y_pred_svm_bow),
        ('SVM TF-IDF', Y_pred_svm_tv),
        ('SVM Hashing Vectorizer', Y_pred_svm_hv),
    )
}

# Bar chart of the accuracies, one bar per model / vectorization pair
plt.figure(figsize=(10, 5))
plt.bar(vectorization_results.keys(), vectorization_results.values())
plt.title('Vectorization Results')
plt.xlabel('Models')
plt.ylabel('Accuracy')
# Write each accuracy, rounded to two decimals, at the top of its bar
for key, value in vectorization_results.items():
    plt.text(key, value, f'{value:.2f}', ha='center')
plt.xticks(rotation=45)
plt.show()