<a href="https://colab.research.google.com/github/usman619/news_category_classification/blob/main/NLP_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset source: [News Category Dataset](https://www.kaggle.com/datasets/rmisra/news-category-dataset) on Kaggle.

#**1. Importing Dependencies**

In [None]:
# Data importing and pre-processing
import pandas as pd
import numpy as np
# NLP
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# ML 
from sklearn.feature_extraction.text import CountVectorizer
# Google Drive
from google.colab import drive

#**2. Importing Dataset**

In [None]:
# Mount Google Drive at /content/gdrive so the dataset stored under "My Drive" can be read
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the dataset (JSON Lines: one record per line, hence lines=True).
# The path is absolute so that it matches the '/content/gdrive' mount point regardless
# of the current working directory; the relative form only resolved while cwd was /content.
df = pd.read_json("/content/gdrive/My Drive/NLP/project_dataset.json", lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [None]:
# Columns available in the raw dataset
df.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

In [None]:
# Shape of the raw dataset as (rows, columns)
df.shape

(209527, 6)

In [None]:
# Number of articles per category (the classes are heavily imbalanced)
df.category.value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [None]:
# Restrict the project to four categories and keep only the label and text columns.
selected_cat = ['SCIENCE', 'TECH', 'EDUCATION', 'WORLD NEWS']
data = (
    df.loc[df['category'].isin(selected_cat), ['category', 'short_description']]
    .reset_index(drop=True)
)

In [None]:
# Preview the filtered subset
data.head()

Unnamed: 0,category,short_description
0,WORLD NEWS,More than half a million people remained witho...
1,WORLD NEWS,White House officials say the crux of the pres...
2,WORLD NEWS,FIFA has come under pressure from several Euro...
3,WORLD NEWS,The incident underscores a growing wave of pro...
4,WORLD NEWS,Hurricane Fiona lashed the Turks and Caicos Is...


#**3. Preprocessing**

##**3.1 Downloading stopwords, WordNet etc**

In [None]:
# Fetch the NLTK resources used below: the stop-word lists and the WordNet corpus
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# English stop words, kept as a set for fast membership tests
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


##**3.2 Pre-Processing Function**



In [None]:
# Pre-process a text column of a DataFrame and store the result in a new column
def preprocess_text(df, column_name):
    """Clean the text in ``df[column_name]`` and store it as ``'preprocessed_' + column_name``.

    Each cell is lower-cased, stripped of ASCII punctuation (``string.punctuation``),
    filtered against the module-level ``stop_words`` set, and lemmatized with NLTK's
    ``WordNetLemmatizer`` (called without a part-of-speech tag).

    Args:
        df: DataFrame holding the text column. It is modified in place: the new
            column is added to this same object.
        column_name: Name of the column containing the raw text.

    Returns:
        The same ``df`` object, now carrying the additional pre-processed column.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    lemmatizer = WordNetLemmatizer()
    # Compile the punctuation pattern once instead of rebuilding it for every row.
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

    def preprocess_text_helper(text):
        # Missing descriptions (None / NaN) are not strings; treat them as empty
        # text instead of failing on ``.lower()``.
        if not isinstance(text, str):
            return ''

        # Lowercase, then remove punctuation.
        # Punctuation is stripped before the stop-word check, so a contraction such
        # as "don't" is compared against the stop-word set as "dont".
        text = punctuation_pattern.sub('', text.lower())

        # Drop stop words and lemmatize the remaining words in a single pass.
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in stop_words
        )

    df['preprocessed_' + column_name] = df[column_name].apply(preprocess_text_helper)

    return df


In [None]:
# Adds the 'preprocessed_short_description' column to `data`
data = preprocess_text(data, 'short_description')

##**3.3 Vectorization**



In [None]:
# One-time setup: uncomment and run these on a runtime where spaCy or the
# en_core_web_lg model is not installed yet.
# !pip install spacy
# !python -m spacy download en_core_web_lg

In [None]:
import importlib

import spacy
from spacy.cli import download as spacy_download

SPACY_MODEL = "en_core_web_lg"

# Load the large English pipeline; its document vectors are used as features below.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed (e.g. on a
    # fresh Colab runtime). Download it once and retry so that "Run all" works from
    # a clean kernel without manually un-commenting the install cell.
    spacy_download(SPACY_MODEL)
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    nlp = spacy.load(SPACY_MODEL)

In [None]:
# Embed every pre-processed description as its spaCy document vector.
# nlp.pipe processes the texts in batches and yields the Doc objects in input order,
# which is faster than calling nlp() once per row.
docs = nlp.pipe(data['preprocessed_short_description'])
data['vector'] = pd.Series([doc.vector for doc in docs], index=data.index)

##**3.4 Encoding the Category**

In [None]:
# Move the 'category' column to the end: pop() removes it from the frame and the
# assignment re-adds it as the last column. (The previously computed column index
# was never used, so it is no longer looked up.)
col_name = 'category'
data[col_name] = data.pop(col_name)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode each category name as an integer label in a new 'category_enc' column
le = LabelEncoder()
le.fit(data['category'])
data['category_enc'] = le.transform(data['category'])

In [None]:
# Preview the frame with the pre-processed text, vector and encoded label columns
data.head()

Unnamed: 0,short_description,preprocessed_short_description,vector,category,category_enc
0,More than half a million people remained witho...,half million people remained without water ser...,"[-2.5675826, 0.8016846, -2.1275575, 0.93179905...",WORLD NEWS,3
1,White House officials say the crux of the pres...,white house official say crux president visit ...,"[0.20682292, -0.018929992, -0.29956284, 0.6126...",WORLD NEWS,3
2,FIFA has come under pressure from several Euro...,fifa come pressure several european soccer fed...,"[0.062610626, 0.56664664, -1.6878421, 1.779222...",WORLD NEWS,3
3,The incident underscores a growing wave of pro...,incident underscore growing wave protest funer...,"[-0.66618574, 0.899985, -1.2139493, -0.0698380...",WORLD NEWS,3
4,Hurricane Fiona lashed the Turks and Caicos Is...,hurricane fiona lashed turk caicos island fore...,"[-0.78112, -0.029731652, -0.59545755, 0.493004...",WORLD NEWS,3


##**3.5 Saving the dataset**

In [None]:
# Number of samples per encoded category
data.category_enc.value_counts()

3    3299
1    2206
2    2104
0    1014
Name: category_enc, dtype: int64

In [None]:
# Persist the processed frame to Drive (the row index is not written).
# NOTE(review): the 'vector' column holds NumPy arrays, which a CSV stores as text;
# presumably they would have to be parsed back into arrays after pd.read_csv.
# Confirm before relying on the reload in section 4.1.
data.to_csv("gdrive/My Drive/NLP/preprocessed_project_dataset_1.csv", index=False)

#**4. Model Making**

##**4.1 Import the saved dataset**

In [None]:
# Optional: uncomment to resume from the saved CSV instead of re-running section 3.
# data = pd.read_csv("gdrive/My Drive/NLP/preprocessed_project_dataset_1.csv")
# data.head()

##**4.2 Splitting the dataset into Train and Test**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
features = data['vector'].values
labels = data['category_enc']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [None]:
# X_train is still a 1-D array here: one element (an embedding vector) per training sample
X_train.shape

(6898,)

In [None]:
# Stack the per-sample embedding vectors row-wise into 2-D matrices
# (one row per sample), the layout scikit-learn estimators expect
X_train_2d = np.vstack(X_train)
X_test_2d = np.vstack(X_test)

##**4.3 ML Model**

###**4.3.1 Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

In [None]:
# MultinomialNB needs non-negative features, so rescale every embedding dimension
# with a MinMaxScaler fitted on the training data only
scaler = MinMaxScaler()
scaler.fit(X_train_2d)
scaled_train_embed = scaler.transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

In [None]:
y_pred = clf.predict(scaled_test_embed)

# zero_division=0: a class that is never predicted is reported with precision 0
# instead of a flattering 1.0, so the macro/weighted averages are not inflated.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00       205
           1       0.71      0.38      0.50       445
           2       0.54      0.43      0.48       401
           3       0.54      0.94      0.69       674

    accuracy                           0.56      1725
   macro avg       0.70      0.44      0.42      1725
weighted avg       0.64      0.56      0.51      1725



###**4.3.2 KNN**

In [None]:
from  sklearn.neighbors import KNeighborsClassifier

In [None]:
# 5-nearest-neighbour classifier with Euclidean distance, trained on the unscaled embeddings.
# Note: this rebinds `clf`, replacing the Naive Bayes model from section 4.3.1.
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
clf.fit(X_train_2d, y_train)

In [None]:
# Predict on the held-out set and report per-class precision / recall / F1 for KNN
y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.42      0.64      0.51       205
           1       0.59      0.72      0.65       445
           2       0.66      0.47      0.55       401
           3       0.82      0.71      0.76       674

    accuracy                           0.65      1725
   macro avg       0.62      0.64      0.62      1725
weighted avg       0.68      0.65      0.65      1725



###**4.3.3 Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The default iteration budget (max_iter=100) was exhausted before the solver
# converged on these features (ConvergenceWarning), so allow more iterations.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_2d, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict on the held-out set and report per-class precision / recall / F1 for Logistic Regression
y_pred = model.predict(X_test_2d)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.71      0.59      0.64       205
           1       0.74      0.75      0.75       445
           2       0.69      0.72      0.70       401
           3       0.82      0.85      0.84       674

    accuracy                           0.76      1725
   macro avg       0.74      0.72      0.73      1725
weighted avg       0.76      0.76      0.76      1725

