In [1]:
import numpy as np
import pandas as pd
import os
import math
import time
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\venil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the news-category dataset; lines=True reads one JSON object per line.
news = pd.read_json(
    "News_Category_Dataset_v3.json",
    lines=True,
)

In [3]:
# Keep only the articles dated 1 January 2020 or later.
news = news.loc[news["date"] >= pd.Timestamp(year=2020, month=1, day=1)]

In [4]:
# (rows, columns) of the frame after the date filter.
news.shape

(5518, 6)

In [5]:
# Order the articles by headline (descending), then discard every article whose
# headline occurs more than once: keep=False flags *all* copies of a repeated
# headline, so none of them is retained.
news = news.sort_values(by="headline", ascending=False)
duplicated_articles_series = news.duplicated(subset="headline", keep=False)
news = news.loc[~duplicated_articles_series]
print("Total number of articles after removing duplicates:", len(news))

Total number of articles after removing duplicates: 5516


In [6]:
# Sorting and filtering left non-consecutive labels; relabel the rows 0..n-1 so
# that label-based and positional lookups agree.
news.index = pd.RangeIndex(len(news))

In [7]:
# Abbreviated weekday and abbreviated month of the publish date joined by "_"
# (the "%a" and "%b" strftime directives).
news["day and month"] = news["date"].dt.strftime("%a_%b")

In [8]:
# Relabel the rows 0..n-1 and display how many rows the index holds.
news.index = pd.RangeIndex(len(news))
len(news.index)

5516

In [9]:
# Summarise the columns: dtypes, non-null counts and memory usage.
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5516 entries, 0 to 5515
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   link               5516 non-null   object        
 1   headline           5516 non-null   object        
 2   category           5516 non-null   object        
 3   short_description  5516 non-null   object        
 4   authors            5516 non-null   object        
 5   date               5516 non-null   datetime64[ns]
 6   day and month      5516 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 301.8+ KB


In [10]:
# Preprocess a separate copy so the original headlines in `news` stay intact
# for display.
news_temp = news.copy(deep=True)

In [11]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = {*stopwords.words("english")}

In [12]:
def _strip_stopwords(headline):
    """Return `headline` lower-cased with punctuation and stop words removed.

    Each whitespace-separated word is reduced to its alphanumeric characters
    and lower-cased; words found in `stop_words` are dropped and the survivors
    are joined with single spaces.
    """
    kept_words = []
    for raw_word in headline.split():
        word = "".join(ch for ch in raw_word if ch.isalnum()).lower()
        # A punctuation-only token (e.g. "-") collapses to ""; skip it so it
        # does not leave a doubled space inside the cleaned headline.
        if word and word not in stop_words:
            kept_words.append(word)
    return " ".join(kept_words)


# Map over the column instead of indexing row by row, so the cleaning does not
# depend on the index labels being 0..n-1.
news_temp["headline"] = news_temp["headline"].map(_strip_stopwords)

In [13]:
# Single WordNet lemmatizer instance reused for every headline.
lemmatizer = WordNetLemmatizer()

In [14]:
# Tokenize each cleaned headline, lemmatize every token as a verb (pos="v")
# and rebuild the headline as one space-separated string.
news_temp["headline"] = [
    " ".join(
        lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(headline)
    )
    for headline in news_temp["headline"]
]

In [15]:
# Fit a TF-IDF vocabulary on the preprocessed headlines and encode them;
# row i of the resulting matrix corresponds to row i of `news_temp`.
tfidf_headline_vectorizer = TfidfVectorizer(min_df=0.0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(
    news_temp["headline"]
)

In [16]:
def tfidf_based_model(row_index, num_similar_items):
    """Recommend the articles whose TF-IDF headline vectors lie closest to one article.

    Prints the queried article's headline, then returns its nearest neighbours
    ranked by Euclidean distance between rows of `tfidf_headline_features`.

    Parameters
    ----------
    row_index : int
        Positional row of the queried article in `tfidf_headline_features`
        (and therefore in `news`). Negative values count from the end.
    num_similar_items : int
        Size of the neighbourhood *including* the queried article, so at most
        ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, nearest first, labelled 1..k, with the
        columns 'publish_date', 'headline', 'Euclidean similarity with the
        queried article' and 'link'. Despite its label, the third column holds
        Euclidean *distances* (smaller means more similar); the label is kept
        so existing consumers of the column keep working.

    Raises
    ------
    IndexError
        If `row_index` does not address a row of `tfidf_headline_features`.
    """
    n_articles = tfidf_headline_features.shape[0]
    if not -n_articles <= row_index < n_articles:
        raise IndexError(
            f"row_index {row_index} is out of range for {n_articles} articles"
        )
    query_pos = row_index % n_articles  # normalise negative positions

    couple_dist = pairwise_distances(
        tfidf_headline_features, tfidf_headline_features[query_pos]
    ).ravel()

    # Rank all articles by distance, then drop the query explicitly. Relying on
    # it being sorted first breaks when another article has an identical TF-IDF
    # vector (distance 0 tie): the wrong headline would be printed as the query
    # and the query itself could show up among the recommendations.
    ranked = np.argsort(couple_dist, kind="stable")
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]

    # `indices` are positions, so use .iloc rather than label-based lookups.
    df = pd.DataFrame(
        {
            'publish_date': news['date'].iloc[indices].values,
            'headline': news['headline'].iloc[indices].values,
            'Euclidean similarity with the queried article': couple_dist[indices],
            'link': news['link'].iloc[indices].values,
        },
        index=range(1, len(indices) + 1),
    )
    print("="*30,"Queried article details","="*30)
    print('headline : ',news['headline'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)

    return df
tfidf_based_model(4156, 11)

headline :  Deadly California Wildfire Was Ignited To Cover Up A Murder, Sheriff Says



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article,link
1,2020-09-06,California Wildfire Traps Campers In National ...,1.231473,https://www.huffpost.com/entry/california-wild...
2,2020-08-15,Bull Chases Firefighters Battling California W...,1.236348,https://www.huffpost.com/entry/bull-firefighte...
3,2022-03-07,4 Of 7 Killed In Iowa Tornadoes From Same Fami...,1.254576,https://www.huffpost.com/entry/iowa-tornado-fo...
4,2020-09-12,Parents Say Son Died In Wildfire Trying To Sav...,1.27083,https://www.huffpost.com/entry/oregon-wildfire...
5,2021-02-25,LA Sheriff Says Tiger Woods Crash ‘Purely An A...,1.271936,https://www.huffpost.com/entry/tiger-woods-cra...
6,2021-02-18,U.S. Needs To Brace Itself For More Deadly Sto...,1.272614,https://www.huffpost.com/entry/climate-change-...
7,2022-08-11,Steve Martin Says ‘Only Murders In The Buildin...,1.276902,https://www.huffpost.com/entry/steve-martin-re...
8,2022-08-21,Minneapolis Teacher Contract Race Language Ign...,1.290466,https://www.huffpost.com/entry/united-states-m...
9,2020-06-11,Tennessee Lawmakers Vote To Keep KKK Leader’s ...,1.29695,https://www.huffpost.com/entry/tennessee-capit...
10,2022-04-05,Sen. Josh Hawley Washes His Hands Of The 'Pro-...,1.298878,https://www.huffpost.com/entry/josh-hawley-ket...
