In [17]:
import os
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import nltk
import string
from nltk.corpus import stopwords

In [11]:
## load dataset
df = pd.read_json("data/News_Category_Dataset_v3.json",lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [14]:
## select relevant no of data
display(df['category'].unique())
selected_cat=['POLITICS', 'ENTERTAINMENT', 'U.S. NEWS', 'WORLD NEWS']
data=df[['category','short_description']][df['category'].isin(selected_cat)].reset_index(drop=True)

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

Unnamed: 0,category,short_description
0,U.S. NEWS,Health experts said it is too early to predict...
1,U.S. NEWS,He was subdued by passengers and crew when he ...
2,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...
3,U.S. NEWS,The 63-year-old woman was seen working at the ...
4,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor..."


In [19]:
# label the category into int i.e preprocessing the data
encoder = LabelEncoder()
data['category'] = encoder.fit_transform(data['category'])

In [20]:
data.head()

Unnamed: 0,category,short_description
0,2,Health experts said it is too early to predict...
1,2,He was subdued by passengers and crew when he ...
2,2,Amy Cooper accused investment firm Franklin Te...
3,2,The 63-year-old woman was seen working at the ...
4,2,"""Who's that behind you?"" an anchor for New Yor..."


In [21]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [22]:
## clean_text

import string
from nltk.corpus import stopwords

def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

train['clean_text'] = train['short_description'].apply(clean_text)
test['clean_text'] = test['short_description'].apply(clean_text)

In [25]:
## remove_stopwords

stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    words = text.split(' ')
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

train['clean_text'] = train['clean_text'].apply(remove_stopwords)
test['clean_text'] = test['clean_text'].apply(remove_stopwords)

In [27]:
##  stemm_text

import nltk

stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

train['clean_text'] = train['clean_text'].apply(stemm_text)
test['clean_text'] = test['clean_text'].apply(stemm_text)

In [29]:
## preprocess_data

def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

train['clean_text'] = train['clean_text'].apply(preprocess_data)
test['clean_text'] = test['clean_text'].apply(preprocess_data)

In [31]:
train['clean_text']

34931    atlant citi — trump plaza casino hotel close w...
42887                            ron weasley get pilot nbc
55600    week nba commiss adam silver brought hammer do...
49846                                                     
4707     rescu team still tri reach bahamian communiti ...
                               ...                        
54343                                                     
38158                                     pay price extrem
860       screen actor guild award kick “hamilton” reun...
15795    continu campaign extermin rohingya peopl myanm...
56422    stronger  back black  love lose game  bodi sou...
Name: clean_text, Length: 46112, dtype: object

In [32]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
#fit_transform for train data
X_train = vectorizer.fit_transform(train['clean_text'])
y_train = train['category']
#transform for test data
X_test = vectorizer.transform(test['clean_text'])
y_test = test['category']

In [33]:
X_train

<46112x24433 sparse matrix of type '<class 'numpy.float64'>'
	with 393650 stored elements in Compressed Sparse Row format>

Model_Building Logestic Regression


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [35]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [36]:
model = LogisticRegression(random_state = 42)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f'acc:{acc}')

acc:0.7800065054754418


In [38]:
#Predictions on test set
test_pred = model.predict(X_test)
test_true= y_test

In [39]:
print(classification_report(test_true,test_pred,target_names=class_names,digits=4))

               precision    recall  f1-score   support

ENTERTAINMENT     0.8103    0.6225    0.7041      3417
     POLITICS     0.7720    0.9375    0.8467      7169
    U.S. NEWS     0.5000    0.0217    0.0415       277
   WORLD NEWS     0.6973    0.1940    0.3035       665

     accuracy                         0.7792     11528
    macro avg     0.6949    0.4439    0.4740     11528
 weighted avg     0.7725    0.7792    0.7538     11528



Model get trained with an accuracy of 79%

In [45]:
# dumping the model
import pickle

file = open('../artifacts/model.pkl', 'wb')
pickle.dump(model,file)
file.close()
