In [None]:
#!pip install numpy==1.24

In [67]:
import numpy as np 
import pandas as pd 
import json
import zipfile

df = pd.read_json('News_Category_Dataset_v3.json', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


## **Data Cleaning**

In [69]:
#Checking for missing values
df.isnull().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

Getting these values because: This can happen due to the presence of "missing" values that aren't technically NaN (the value used by Pandas to represent missing data), but are instead represented by other placeholder values, such as empty strings (''), whitespace, or special markers like None.

In [70]:
# Example to show the presence of missing values
df[209217:209224]

Unnamed: 0,link,headline,category,short_description,authors,date
209217,https://www.huffingtonpost.comhttp://flavorwir...,TV's Most Ill-Advised Weddings,WEDDINGS,Everyone knows that TV weddings are ratings bo...,,2012-01-31
209218,https://www.huffingtonpost.comhttp://www.busin...,The Super Bowl Is More Important Than Just Abo...,WEDDINGS,The Super Bowl is really important to American...,,2012-01-31
209219,https://www.huffingtonpost.comhttp://www.daily...,The Couple That Argues Together Stays Together...,WEDDINGS,A new survey is sure to confirm just what many...,,2012-01-31
209220,https://www.huffingtonpost.com/entry/candice-s...,Candice Swanepoel Prabal Gurung Spring 2012 Ca...,STYLE & BEAUTY,The twists and poses are typical for a high-fa...,Ellie Krupnick,2012-01-31
209221,https://www.huffingtonpost.com/entry/santorum-...,Santorum And The Politics Of Parenting,PARENTING,Never has a presidential candidate lived so se...,"Lisa Belkin, Contributor\nSenior Columnist, Th...",2012-01-31
209222,https://www.huffingtonpost.com/entry/justin-ti...,Justin Timberlake Birthday: A Look Back At 31 ...,STYLE & BEAUTY,"Happy birthday, Justin Timberlake! Since Timbe...",Michelle Manetti,2012-01-31
209223,https://www.huffingtonpost.com/entry/heidi-klu...,Heidi Klum and Seal: What Blew Up Their Marriage?,DIVORCE,"After almost seven years of marriage, Heidi Kl...","Dr. Jane Greer, Contributor\nSHRINK WRAP by Dr...",2012-01-31


In this case we have the some missing values in the author column

In [71]:
#Checking if there are any other mssing values
df.replace("", pd.NA, inplace=True)
df.isnull().sum()

link                     0
headline                 6
category                 0
short_description    19712
authors              37418
date                     0
dtype: int64

In [72]:
#Percentage of missing values
missing_percentage = df.isnull().mean()*100
print(missing_percentage)

link                  0.000000
headline              0.002864
category              0.000000
short_description     9.407857
authors              17.858319
date                  0.000000
dtype: float64


In [73]:
#Replacing the author column column with unknown
df['authors']=df['authors'].replace(pd.NA, "unknown")

#Drop the remaining columns with missing values
df.dropna(inplace=True)

#Checking if there are any other missing values
df.isnull().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [74]:
#Dropping the date and the link column

df.drop(columns=['link', 'date'], inplace=True)

df.columns

Index(['headline', 'category', 'short_description', 'authors'], dtype='object')

## **Text Preprocessing**

In [None]:
#!pip install -U pip setuptools wheel
#!pip install -U spacy

#!python -m spacy download en_core_web_sm

In [75]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["parser"])

In [76]:
#Combining the headline column and the short description column
#df['text'] = df['headline'] + " " + df['short_description']
df.head()

Unnamed: 0,headline,category,short_description,authors
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP"
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski


In [77]:
#Dropping the headline and short description column
df.drop(['authors'], inplace=True, axis=1)
df.head()

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [78]:
df = df.head(1000)

df.shape

(1000, 3)

In [79]:
df.head()

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [80]:
#Create a language preprocessing pipeline for the dataset

#Processing the text in batches using spacy's nlp.pipelines for faster execution
def preprocess_text_column(df, column):
    texts = df[column].astype(str).tolist() # Converting columns to alist of strings
    processed_texts = []
    
    
    #Using nlp for batch processing
    for doc in nlp.pipe(texts, batch_size=1000, disable=['parser'], n_process=-1):
        #Change text to lowercase
        doc = nlp(doc.text.lower())
        tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        processed_texts.append(" ".join(tokens))
        
    df[column]=processed_texts
    return df
        
#Columns to process
columns_to_process = ["headline", "category", "short_description"]

for column in columns_to_process:
    df = preprocess_text_column(df, column)
    
df.head()

Unnamed: 0,headline,category,short_description
0,4 million americans roll sleeve omicron target...,u.s news,health expert say early predict demand match 1...
1,american airlines flyer charge ban life punch ...,u.s news,subdue passenger crew flee aircraft confrontat...
2,23 funniest tweet cat dog week sept 17 23,comedy,dog understand eat
3,funniest tweet parent week sept 17 23,parenting,accidentally grow toothpaste toddler toothbrus...
4,woman call cop black bird watcher lose lawsuit...,u.s news,amy cooper accuse investment firm franklin tem...


In [81]:
df.to_json('data.json', index=False)

In [97]:
#df.to_csv('data.csv', index=False, header=True, sep=',')

## **Text Representation**

In [82]:
#Using one hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

#Extracting the text column
text = df[['headline', 'category', 'short_description']]

#Creating an instance of the onehotencoder
encoder = OneHotEncoder()

pipeline1 = Pipeline(steps=[('OneHotEncoder', OneHotEncoder())])


#Fitting and transforming the genres column
text_encoded = pipeline1.fit_transform(text.values.reshape(-1, 1))

In [83]:
text_encoded

<3000x2024 sparse matrix of type '<class 'numpy.float64'>'
	with 3000 stored elements in Compressed Sparse Row format>

In [84]:
import joblib

joblib.dump(pipeline1, "pipeline1.joblib")

from sklearn.neighbors import NearestNeighbors

#Creating a instance of the NearestNeighbors class
recommender = NearestNeighbors(metric='cosine')

#Fitting the encoded genres to the recommender
recommender.fit(text_encoded.toarray())

## **Making Recommendations**

In [85]:
# Number of recommendations to return
num_recommendations = 5

# Getting the recommendations
_, recommendations = recommender.kneighbors(text_encoded.toarray(), n_neighbors=num_recommendations)

# Ensure the indices are within bounds of the DataFrame
recommendations = [index for index in recommendations[0] if index < len(df)]

# Extracting the text from the recommendations
if recommendations:
    recommended_text_titles = df.iloc[recommendations][['category', 'headline', 'short_description']]
else:
    recommended_text_titles = pd.DataFrame(columns=['category', 'headline', 'short_description'])  # Empty if no valid indices




recommended_text_titles

text_encoded.toarray().shape

(3000, 2024)

## **Saving to a vector database**

In [None]:
#!pip install faiss-cpu
#!pip install numpy==1.25.2

In [86]:
import numpy as np
import faiss

#Create a sample numpy array
dimension = 2024
num_vectors = 3000

#Create a FAISS index
index = faiss.IndexFlatL2(dimension)

#Add vectors to the index
index.add(text_encoded.toarray())

#Save the index to disk
faiss.write_index(index, 'vector_database.index')
print(f'Index saved with {index.ntotal} vectors')

Index saved with 3000 vectors


In [87]:
#Query from database

#Load the saved index
index = faiss.read_index('vector_database.index')

#Create a query vector/ multiple query vectors
text_index = 0
num_queries = 1
query_vectors = text_encoded[text_index].toarray()

#Perform the search
k=5 #Number of nearest neighbors to retrieve
distances, indices = index.search(query_vectors, k)

#Print results
for i in range(num_queries):
    print(f'Query {i}:')
    for j in range(k):
        print(f"Neighbor {j}: Index {indices[i][j]}, Distance {distances[i][j]}")

#Create a query vector
num_queries = 1
query_vector = text_encoded[5].toarray()

#Perform the search
k=5 #Number of nearest neighbors to retrieve
distances, indices = index.search(query_vector, k)

#Print results
print(f'Query:')
for j in range (k):
    print(f'Neighbor {j}: Index {indices[0][j]}, Distance {distances[0][j]}')
    
    #Acces the corresponding Dataframe rows using the retrieved indices
    neighbor_rows = df.iloc[indices[0]]
    print(neighbor_rows)

# prompt: How to get the text representation of query_vector = text_encoded[0].toarray()  # Assuming you want to query with the first vector

query_vector_text = df.iloc[0][['headline', 'category', 'short_description']]
print(query_vector_text)

Query 0:
Neighbor 0: Index 0, Distance 0.0
Neighbor 1: Index 1, Distance 2.0
Neighbor 2: Index 2, Distance 2.0
Neighbor 3: Index 3, Distance 2.0
Neighbor 4: Index 4, Distance 2.0
Query:
Neighbor 0: Index 5, Distance 0.0
                                            headline   category  \
5    clean dead belk bathroom 4 day body find police   u.s news   
0  4 million americans roll sleeve omicron target...   u.s news   
1  american airlines flyer charge ban life punch ...   u.s news   
2          23 funniest tweet cat dog week sept 17 23     comedy   
3              funniest tweet parent week sept 17 23  parenting   

                                   short_description  
5  63 year old woman see work south carolina stor...  
0  health expert say early predict demand match 1...  
1  subdue passenger crew flee aircraft confrontat...  
2                                 dog understand eat  
3  accidentally grow toothpaste toddler toothbrus...  
Neighbor 1: Index 0, Distance 2.0
             

## **Using TF-IDF**

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [89]:
df

Unnamed: 0,headline,category,short_description
0,4 million americans roll sleeve omicron target...,u.s news,health expert say early predict demand match 1...
1,american airlines flyer charge ban life punch ...,u.s news,subdue passenger crew flee aircraft confrontat...
2,23 funniest tweet cat dog week sept 17 23,comedy,dog understand eat
3,funniest tweet parent week sept 17 23,parenting,accidentally grow toothpaste toddler toothbrus...
4,woman call cop black bird watcher lose lawsuit...,u.s news,amy cooper accuse investment firm franklin tem...
...,...,...,...
995,china weigh exit zero covid risk involve,world news,change appear imminent government continue pol...
996,russian missile hit near lviv airport strike c...,world news,world leader call investigation russia repeat ...
997,syria assad visit uae 1st trip arab country war,world news,assad office say meet sheikh mohamed bin rashi...
998,strong japan earthquake kill 4 injure 107,world news,7.4 magnitude temblor knock power cause extens...


In [90]:
vectorizer = TfidfVectorizer()
vector_df = vectorizer.fit_transform(df['headline'].values).toarray()
vector_df

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'df' is your DataFrame
vectorizer = TfidfVectorizer()
vector_df = vectorizer.fit_transform(df['category'].values).toarray()

# Get the words (features) from the vectorizer
words = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame for easier handling
tfidf_df = pd.DataFrame(vector_df, columns=words)

# Set the number of top words to retrieve
top_n = 5  # You can change this to any number of top words you'd like

# Extract the top words per document
top_words_per_doc = {}

for idx, row in tfidf_df.iterrows():
    # Get top n words based on TF-IDF score for the current document
    top_words = row.nlargest(top_n).index.tolist()
    top_words_per_doc[f"Document {idx+1}"] = top_words

# Display the top words for each document
for doc, top_words in top_words_per_doc.items():
    print(f"{doc}: {', '.join(top_words)}")

In [92]:
df

Unnamed: 0,headline,category,short_description
0,4 million americans roll sleeve omicron target...,u.s news,health expert say early predict demand match 1...
1,american airlines flyer charge ban life punch ...,u.s news,subdue passenger crew flee aircraft confrontat...
2,23 funniest tweet cat dog week sept 17 23,comedy,dog understand eat
3,funniest tweet parent week sept 17 23,parenting,accidentally grow toothpaste toddler toothbrus...
4,woman call cop black bird watcher lose lawsuit...,u.s news,amy cooper accuse investment firm franklin tem...
...,...,...,...
995,china weigh exit zero covid risk involve,world news,change appear imminent government continue pol...
996,russian missile hit near lviv airport strike c...,world news,world leader call investigation russia repeat ...
997,syria assad visit uae 1st trip arab country war,world news,assad office say meet sheikh mohamed bin rashi...
998,strong japan earthquake kill 4 injure 107,world news,7.4 magnitude temblor knock power cause extens...


In [96]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Combine 'headline' and 'short_description' for vectorization
df['combined_text'] = df['headline'] + ' ' + df['short_description']

# Vectorize the combined text
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(df['combined_text'].values)

# Calculate cosine similarity on the vectorized text
cosine_sim = cosine_similarity(vectorized_text, vectorized_text)

# Define recommendation function with category filtering
def get_recommendations(category, cosine_sim=cosine_sim, df=df):
    # Check if the article with the specified title and category exists
    matching_rows = df['category'] == category
    if matching_rows.empty:
        print("No article found with the specified category.")
        return
    
    # Get the index of the article that matches the category
    idx = matching_rows.index[0]

    # Get pairwise similarity scores for all articles with the selected article
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort articles by similarity score in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 5 most similar articles within the same category
    recommendations = [
        i for i in sim_scores if df.iloc[i[0]]['category'] == category and i[0] != idx
    ][:5]

    # Display recommended articles
    if recommendations:
        for i in recommendations:
            #print(f"Title: {df.iloc[i[0]]['headline']}")
            print(f"Category: {df.iloc[i[0]]['category']}")
            print(f"Description: {df.iloc[i[0]]['short_description']}\n")
    else:
        print("No similar articles found within the same category.")

# Test the recommendation function with a specific headline and category
get_recommendations("comedy")

Category: comedy
Description: sorry buddy give away favorite drink diet pepsi late night host say

Category: comedy
Description: instagram post che clarify leave snl confirm intend stay weekend update desk

Category: comedy
Description: coup yes way intentional plan coup say james austin johnson donald trump cold open

Category: comedy
Description: hear ad fresh cat food say cute kitty descend fierce desert cat look cat cat

Category: comedy
Description: dog understand eat



In [94]:
import joblib

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

# Save the cosine similarity matrix
joblib.dump(cosine_sim, 'cosine_sim.pkl')

['cosine_sim.pkl']

In [95]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Combine 'headline' and 'short_description' for vectorization
df['combined_text'] = df['headline'] + ' ' + df['short_description']

# Load the vectorizer and cosine similarity matrix
vectorizer = joblib.load('vectorizer.pkl')
cosine_sim = joblib.load('cosine_sim.pkl')

# Define recommendation function with category filtering
def get_recommendations(category, cosine_sim=cosine_sim, df=df):
    # Check if the article with the specified title and category exists
    matching_rows = df['category'] == category
    if matching_rows.empty:
        print("No article found with the specified category.")
        return
    
    # Get the index of the article that matches the category
    idx = matching_rows.index[0]

    # Get pairwise similarity scores for all articles with the selected article
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort articles by similarity score in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 5 most similar articles within the same category
    recommendations = [
        i for i in sim_scores if df.iloc[i[0]]['category'] == category and i[0] != idx
    ][:5]

    # Display recommended articles
    if recommendations:
        for i in recommendations:
            #print(f"Title: {df.iloc[i[0]]['headline']}")
            print(f"Category: {df.iloc[i[0]]['category']}")
            print(f"Description: {df.iloc[i[0]]['short_description']}\n")
    else:
        print("No similar articles found within the same category.")

# Test the recommendation function with a specific headline and category
get_recommendations("environment")

Category: environment
Description: hundred people block bridge london scale oil tanker mass protest demand stop new oil gas project

Category: environment
Description: zachary head sanctuary well life ahead

Category: environment
Description: kind way united nations secretary general antónio guterre say

Category: environment
Description: monday world public database fossil fuel production reserve emission launch

Category: environment
Description: 10 year people refer september 2022 storm benchmark storm



# **Deployment**

In [None]:
import streamlit as st
import numpy as np
import pandas as pd
#import faiss  # if required
#from your_model_file import load_your_model, recommend_news  # assuming these functions are defined

In [None]:
#!pip install streamlit

In [None]:
# Title and description
st.title("News Recommendation System")
st.write("Get personalized news recommendations based on your preferences.")

# User input (e.g., interest area)
user_input = st.text_input("Enter your interests or topics:")

In [None]:
@st.cache
def load_model():
    return load_your_model("model_path.pkl")  # replace with actual loading function

model = load_model()

if st.button("Recommend News"):
    recommendations = recommend_news(model, user_input)
    for rec in recommendations:
        st.write(f"Title: {rec['title']}\nDescription: {rec['description']}")