In [0]:
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

# Below libraries are for text processing using NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

  import pandas.util.testing as tm


In [0]:
news_articles = pd.read_json("News_Category_Dataset_v2.json", lines = True)

In [0]:
news_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [0]:
news_articles = news_articles[news_articles['date'] >= pd.Timestamp(2018,1,1)]
# news_articles

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
...,...,...,...,...,...,...
8578,MEDIA,Remember Don Lemon's Drunken NYE Rant Last Yea...,Rebecca Shapiro,https://www.huffingtonpost.com/entry/don-lemon...,The CNN host introduced the audience to his sp...,2018-01-01
8579,ENTERTAINMENT,Halle Berry Thanks Her Middle Finger For Getti...,Ron Dicker,https://www.huffingtonpost.com/entry/halle-ber...,"""Always standing up for me!""",2018-01-01
8580,POLITICS,Barack Obama Shares His Favorite Books And Son...,Rebecca Shapiro,https://www.huffingtonpost.com/entry/barack-ob...,"The former president shared the ""songs that go...",2018-01-01
8581,POLITICS,You Can Now Buy Legal Recreational Marijuana I...,Matt Ferner,https://www.huffingtonpost.com/entry/legal-mar...,The first state to legalize medical cannabis b...,2018-01-01


In [0]:
news_articles.sort_values('headline',inplace=True, ascending=False)
duplicated_articles_series = news_articles.duplicated('headline', keep = False)
news_articles = news_articles[~duplicated_articles_series]

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

In [0]:
news_articles.isna().sum() # check for blank cells

In [138]:
fig = go.Figure([go.Bar(x=news_articles["category"].value_counts().index, y=news_articles["category"].value_counts().values)])
fig.update_traces(marker_color='rgb(217, 179, 255)')
fig['layout'].update(title={"text" : 'Distribution of articles according to category','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'}, xaxis_title="Category",yaxis_title="Articles")
fig.update_layout(width=800,height=700)
fig

In [0]:
fig = go.Figure([go.Bar(x=news_articles_per_month.index.strftime("%b"), y=news_articles_per_month)])
fig.update_traces(marker_color='rgb(217, 179, 255)')
fig['layout'].update(title={"text" : 'Distribution of articles month-wise','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'}, xaxis_title="Month",yaxis_title="Number of articles")
fig.update_layout(width=500,height=500)
fig

In [0]:
news_articles.index = range(news_articles.shape[0])

In [0]:
news_articles["day and month"] = news_articles["date"].dt.strftime("%a") + "_" + news_articles["date"].dt.strftime("%b")

In [0]:
news_articles_temp = news_articles.copy()

In [0]:
stop_words = set(stopwords.words('english'))

In [0]:
for i in range(len(news_articles_temp["headline"])):
    string = ""
    for word in news_articles_temp["headline"][i].split():
        word = ("".join(e for e in word if e.isalnum()))
        word = word.lower()
        if not word in stop_words:
          string += word + " "  

In [0]:
lemmatizer = WordNetLemmatizer()

In [0]:
for i in range(len(news_articles_temp["headline"])):
    string = ""
    for w in word_tokenize(news_articles_temp["headline"][i]):
        string += lemmatizer.lemmatize(w,pos = "v") + " "
    news_articles_temp.at[i, "headline"] = string.strip()

In [0]:
headline_vectorizer = CountVectorizer()
headline_features   = headline_vectorizer.fit_transform(news_articles_temp['headline'])

In [0]:
headline_features.get_shape()

(8538, 11149)

In [0]:
pd.set_option('display.max_colwidth', -1)


Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.



In [0]:
def bag_of_words_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(headline_features,headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'headline':news_articles['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df.iloc[1:,]
bag_of_words_based_model(133, 11) # Change the row index for any other queried article

  (0, 3395)	1
  (0, 3892)	1
  (0, 3952)	1
  (0, 4025)	1
  (0, 6693)	1
  (0, 9717)	1
  (0, 10295)	1
  (0, 10982)	1
headline :  Woman Fired After Flipping Off Trump's Motorcade Sues Former Employer



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-01-11,Each And Both,2.828427
2,2018-04-02,The Trump Administration Is Suing California Again,2.828427
3,2018-01-16,The World According To Trump,3.0
4,2018-01-07,"Seriously, How Dumb Is Trump?",3.0
5,2018-04-29,How To Convict A Rapist,3.162278
6,2018-02-15,Trump's 'Infrastructure Week' Crumbles Again,3.162278
7,2018-01-14,What Common Core Won,3.162278
8,2018-01-17,What Is A 'Sunshine Baby'?,3.162278
9,2018-04-27,Donald Trump And The Next Crash,3.162278
10,2018-02-12,What You Should Know About Trump's Nihilist Budget,3.162278


In [0]:
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_articles_temp['headline'])

In [0]:
def tfidf_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'headline':news_articles['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df.iloc[1:,]
tfidf_based_model(133, 11)

headline :  Woman Fired After Flipping Off Trump's Motorcade Sues Former Employer



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-01-11,Each And Both,1.0
2,2018-05-21,The Supreme Court Just Made It A Lot Harder For You To Sue Your Employer,1.164079
3,2018-04-02,The Trump Administration Is Suing California Again,1.253832
4,2018-04-10,"Lou Dobbs Flips Out On Live TV, Urges Trump To 'Fire The SOB' Robert Mueller",1.258771
5,2018-04-26,Cardi B's Former Manager Sues Her For $10 Million,1.268644
6,2018-04-03,A Third Woman Is Suing To Break A Trump-Related Nondisclosure Agreement,1.274202
7,2018-02-24,Former RNC Chair Fires Back At Claim He Was Only Hired Because He Was Black,1.274791
8,2018-01-16,State Employer Side Payroll Taxes And Loser Liberalism,1.276724
9,2018-02-21,Democrats Flip Kentucky State House Seat Where Trump Won Overwhelmingly,1.281999
10,2018-01-09,Big Tax Game Hunting: Employer Side Payroll Taxes,1.285187
