In [1]:
## import useful libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity,manhattan_distances,euclidean_distances
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import TfidfVectorizer
from dython.nominal import associations
import warnings
warnings.filterwarnings('ignore')

In [2]:
## load dataset
df = pd.read_json("data/News_Category_Dataset_v3.json",lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [10]:
df.isnull().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [12]:
df.describe()

Unnamed: 0,link,headline,category,short_description,authors,date
count,209527,209527,209527,209527.0,209527.0,209527
unique,209486,207996,42,187022.0,29169.0,3890
top,https://www.huffingtonpost.comhttps://www.wash...,Sunday Roundup,POLITICS,,,2014-03-25 00:00:00
freq,2,90,35602,19712.0,37418.0,100
first,,,,,,2012-01-28 00:00:00
last,,,,,,2022-09-23 00:00:00


In [4]:
# Select relevant columns for analysis
data = df[['category', 'headline', 'short_description']]

In [5]:
# Preprocess the data
data['text'] = data['headline'] + ' ' + data['short_description']

In [6]:
data.head()

Unnamed: 0,category,headline,short_description,text
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...,Over 4 Million Americans Roll Up Sleeves For O...
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...,"American Airlines Flyer Charged, Banned For Li..."
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,"""Until you have a dog you don't understand wha...",23 Of The Funniest Tweets About Cats And Dogs ...
3,PARENTING,The Funniest Tweets From Parents This Week (Se...,"""Accidentally put grown-up toothpaste on my to...",The Funniest Tweets From Parents This Week (Se...
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,Amy Cooper accused investment firm Franklin Te...,Woman Who Called Cops On Black Bird-Watcher Lo...


In [28]:
# Vectorize the text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text'])

In [32]:
# Function to find the most similar data using different similarity algorithms
def find_similar_data(query, top_n=5):
    # Vectorize the query
    query_vector = vectorizer.transform([query])

    # Calculate similarities using different algorithms
    cosine_sim = cosine_similarity(X, query_vector).flatten()
    euclidean_sim = euclidean_distances(X, query_vector).flatten()
    manhattan_sim = manhattan_distances(X, query_vector).flatten()
    # jaccard_sim = jaccard_score(X,query_vector).flatten()


    # Combine similarities from different algorithms
    similarity_scores = (cosine_sim + euclidean_sim + manhattan_sim ) / 4

    # Find the indices of top similar data points
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Return the top similar data points
    similar_data = data.iloc[top_indices]

    return similar_data

In [33]:
# Example usage
query = "New research on climate change"
similar_data = find_similar_data(query)
print(similar_data)

         category                          headline  \
109802  WORLDPOST  Weekend Roundup: Laughing at God   
66816    POLITICS                    Sunday Roundup   
63109    POLITICS                    Sunday Roundup   
107893   POLITICS                    Sunday Roundup   
64398    POLITICS                    Sunday Roundup   

                                        short_description  \
109802  The first principle of an open society is not ...   
66816   This week the nation watched as the #NeverTrum...   
63109   This week, the nation was reminded, in ways bo...   
107893  This week began with "The Horrible Call" final...   
64398   This week started off with the horror in Orlan...   

                                                     text  
109802  Weekend Roundup: Laughing at God The first pri...  
66816   Sunday Roundup This week the nation watched as...  
63109   Sunday Roundup This week, the nation was remin...  
107893  Sunday Roundup This week began with "The Horri...  
64

<generator object DataFrame.iterrows at 0x00000146932FD350>