In [2]:
import pandas as pd
import warnings
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.metrics import jaccard_score
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_json("../data/News_Category_Dataset_v3.json",lines=True)

In [4]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [5]:
# Preprocess the data
df['text'] = df['headline'] + ' ' + df['short_description']

In [6]:
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(len(df)):
    text = re.sub('[^a-zA-Z]', '', df['text'][i])
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [lemmatizer.lemmatize(word) for word in text if not word in set(
        stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

In [7]:
# Function to find the most similar data using different similarity algorithms
def find_similar_data(query, top_n=5):
    # Prepocess the query
    query = re.sub('[^a-zA-Z]', '', query)
    query = query.lower()
    query = nltk.word_tokenize(query)
    query = [lemmatizer.lemmatize(word) for word in query if not word in set(
        stopwords.words('english'))]
    query = ' '.join(query)
    corpus.append(query)
    # Vectorize the query
    cv = CountVectorizer(max_features=7000)
    X = cv.fit_transform(corpus)
    query_vector = X[-1]


    # Calculate similarities using different algorithms
    cosine_sim = cosine_similarity(X, query_vector).flatten()
    euclidean_sim = euclidean_distances(X, query_vector).flatten()
    manhattan_sim = manhattan_distances(X, query_vector).flatten()
    # Calculate Jaccard score for each row in the DataFrame
    jaccard_scores = []
    for row in range(X.shape[0]):
        jaccard_score_value = jaccard_score(X[row].toarray()[0], query_vector.toarray()[0])
        jaccard_scores.append(jaccard_score_value)

    # Combine similarities from different algorithms
    similarity_scores = (cosine_sim + euclidean_sim + manhattan_sim + jaccard_scores) / 4

    # Find the indices of top similar data points
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Return the top similar data points
    similar_data = df.iloc[top_indices]

    return similar_data

In [12]:
query = "geopolitics"
similar_data = find_similar_data(query)
similar_data

Unnamed: 0,link,headline,category,short_description,authors,date,text
53398,https://www.huffingtonpost.com/entry/alaska-me...,'RuPaul's Drag Race' Winner Alaska Just Met La...,QUEER VOICES,OMG!,James Michael Nichols,2016-10-21,'RuPaul's Drag Race' Winner Alaska Just Met La...
21893,https://www.huffingtonpost.com/entry/transgend...,Ryan Murphy's New Show Makes History By Castin...,QUEER VOICES,The NYC-based series will make history with it...,James Michael Nichols,2017-10-26,Ryan Murphy's New Show Makes History By Castin...
45570,https://www.huffingtonpost.com/entry/edward-sn...,Russia Extends Edward Snowden's Residency By A...,THE WORLDPOST,"In 2013, Snowden leaked classified information...",,2017-01-18,Russia Extends Edward Snowden's Residency By A...
146552,https://www.huffingtonpost.com/entry/festivus_...,Search 'Festivus' On Google For A Hilarious Su...,TECH,"Google is celebrating ""Festivus"" a little earl...",Alexis Kleinman,2013-11-30,Search 'Festivus' On Google For A Hilarious Su...
111681,https://www.huffingtonpost.com/entry/route-66-...,"Route 66, Ride for the Relay, Day Twelve",TRAVEL,Everyone was up early packing their bikes and ...,"Mary Anne Erickson, ContributorArtist, Photogr...",2014-12-27,"Route 66, Ride for the Relay, Day Twelve Every..."
