In [None]:
#!pip install numpy==1.24

In [None]:
import numpy as np 
import pandas as pd 
import json
import zipfile

# Read the line-delimited JSON dataset (one record per line) and preview it
dataset_path = 'News_Category_Dataset_v3.json'
df = pd.read_json(dataset_path, lines=True)
df.head()

In [None]:
# Summarise column names, dtypes and non-null counts of the loaded frame
df.info()

## **Data Cleaning**

In [None]:
# Count the NaN/None entries in every column
df.isna().sum()

`isnull()` reports these counts because some "missing" values are not technically NaN (the value pandas uses to represent missing data). They are stored as other placeholder values instead, such as empty strings (''), whitespace-only strings, or special markers like None.

In [None]:
# Peek at a small positional slice of rows to illustrate the missing values
df.iloc[209217:209224]

In this case, we have some missing values in the authors column.

In [None]:
# Turn empty strings into pd.NA so they are included in the missing-value counts
df = df.replace("", pd.NA)
df.isna().sum()

In [None]:
# Share of missing values per column, expressed as a percentage
missing_percentage = df.isna().mean().mul(100)
print(missing_percentage)

In [None]:
# Fill missing author names with a placeholder value
df['authors'] = df['authors'].fillna("unknown")

# Remove every row that still has a missing value in any other column
df = df.dropna()

# Confirm that no missing values remain
df.isna().sum()

In [None]:
# Remove the link and date columns; no later cell reads them
df = df.drop(columns=['link', 'date'])

df.columns

## **Text Preprocessing**

In [None]:
#!pip install -U pip setuptools wheel
#!pip install -U spacy

#!python -m spacy download en_core_web_sm

In [None]:
import spacy
# Load the small English spaCy model with its parser component disabled
nlp = spacy.load("en_core_web_sm", disable=["parser"])

In [None]:
#Combining the headline and short description columns is disabled at this point
#(the two columns are joined into 'combined_text' later, in the TF-IDF section)
#df['text'] = df['headline'] + " " + df['short_description']
df.head()

In [None]:
# Drop the authors column
df = df.drop(columns=['authors'])
df.head()

In [None]:
# Work on the first 1000 rows only.
# .copy() detaches the sample from the full frame: head() returns a slice, and
# the later column assignments on it would otherwise raise
# SettingWithCopyWarning.
df = df.head(1000).copy()

df.shape

In [None]:
# Preview the sampled rows before text preprocessing
df.head()

In [None]:
#Create a language preprocessing pipeline for the dataset

#Processing the text in batches using spacy's nlp.pipelines for faster execution
def preprocess_text_column(df, column):
    """Lowercase, lemmatize and strip stop words/punctuation from one column.

    The column is cast to strings, lowercased, run through the module-level
    spaCy pipeline ``nlp`` in batches, and then overwritten with the cleaned
    text (the kept lemmas joined by single spaces).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column. It is modified in place.
    column : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so callers can write
        ``df = preprocess_text_column(df, column)``.
    """
    # Lowercase up front so each text goes through spaCy exactly once.
    # Previously every Doc yielded by nlp.pipe was parsed a second time with
    # nlp(doc.text.lower()), which doubled the work and ran the second pass
    # one document at a time, outside the batched pipe.
    texts = [text.lower() for text in df[column].astype(str)]

    processed_texts = [
        " ".join(
            token.lemma_
            for token in doc
            if not token.is_stop and not token.is_punct
        )
        for doc in nlp.pipe(texts, batch_size=1000, disable=['parser'], n_process=-1)
    ]

    df[column] = processed_texts
    return df
        
# Text columns that go through the spaCy cleaning step
columns_to_process = ["headline", "category", "short_description"]

# Clean the columns one after another, carrying the frame forward each time
for text_column in columns_to_process:
    df = preprocess_text_column(df, text_column)

df.head()

In [None]:
# Persist the cleaned data as a list of records (one JSON object per article).
# The earlier call passed index=False together with the default 'columns'
# orient, a combination pandas rejects with a ValueError. The 'records'
# orient never writes the index, so no index argument is needed.
df.to_json('data.json', orient='records')

In [None]:
#df.to_csv('data.csv', index=False, header=True, sep=',')

## **Text Representation**

In [None]:
#Using one hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

#Extracting the text columns (one row per article)
text = df[['headline', 'category', 'short_description']]

#Creating an instance of the onehotencoder
encoder = OneHotEncoder()

# Reuse the encoder created above instead of building a second, unused one
pipeline1 = Pipeline(steps=[('OneHotEncoder', encoder)])


#Fitting and transforming the three text columns together.
# Each column is encoded as its own categorical feature, so row i of
# text_encoded describes row i of df. The previous reshape(-1, 1) flattened
# the frame into one column, which produced three encoded rows per article
# and made neighbour indices unusable as df positions.
text_encoded = pipeline1.fit_transform(text.values)

In [None]:
# Display the encoded matrix returned by the pipeline
text_encoded

In [None]:
import joblib

# Persist the fitted encoding pipeline so it can be reloaded without refitting
joblib.dump(pipeline1, "pipeline1.joblib")

from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour model using cosine distance
recommender = NearestNeighbors(metric='cosine')

# Fit the model on the dense form of the encoded text
dense_text_encoded = text_encoded.toarray()
recommender.fit(dense_text_encoded)

## **Making Recommendations**

In [None]:
# Number of recommendations to return
num_recommendations = 5

# Getting the recommendations for the first encoded row.
# Only recommendations[0] is used below, so asking for the neighbours of every
# row (as before) computed a full pairwise result that was then discarded.
_, recommendations = recommender.kneighbors(
    text_encoded[0].toarray(), n_neighbors=num_recommendations
)

# Ensure the indices are within bounds of the DataFrame
recommendations = [position for position in recommendations[0] if position < len(df)]

# Extracting the text from the recommendations
if recommendations:
    recommended_text_titles = df.iloc[recommendations][['category', 'headline', 'short_description']]
else:
    recommended_text_titles = pd.DataFrame(columns=['category', 'headline', 'short_description'])  # Empty if no valid indices




recommended_text_titles

# The sparse matrix already knows its shape; no need to densify it first
text_encoded.shape

## **Saving to a vector database**

In [None]:
#!pip install faiss-cpu
#!pip install numpy==1.25.2

In [None]:
import numpy as np
import faiss

# FAISS works on C-contiguous float32 vectors, so convert the encoded matrix
# explicitly instead of handing over whatever dtype the encoder produced.
vectors = np.ascontiguousarray(text_encoded.toarray(), dtype='float32')

# Take the index geometry from the data rather than hard-coding it, so the
# cell keeps working when the sample size or the vocabulary changes.
num_vectors, dimension = vectors.shape

#Create a FAISS index
index = faiss.IndexFlatL2(dimension)

#Add vectors to the index
index.add(vectors)

#Save the index to disk
faiss.write_index(index, 'vector_database.index')
print(f'Index saved with {index.ntotal} vectors')

In [None]:
#Query from database

#Load the saved index
index = faiss.read_index('vector_database.index')

#Create a query vector/ multiple query vectors
text_index = 0
# FAISS searches expect C-contiguous float32 input, so cast explicitly
query_vectors = np.ascontiguousarray(text_encoded[text_index].toarray(), dtype='float32')
# One query per row of query_vectors
num_queries = query_vectors.shape[0]

#Perform the search
k=5 #Number of nearest neighbors to retrieve
distances, indices = index.search(query_vectors, k)

#Print results
for i in range(num_queries):
    print(f'Query {i}:')
    for j in range(k):
        print(f"Neighbor {j}: Index {indices[i][j]}, Distance {distances[i][j]}")

#Create a query vector
num_queries = 1
query_vector = np.ascontiguousarray(text_encoded[5].toarray(), dtype='float32')

#Perform the search
k=5 #Number of nearest neighbors to retrieve
distances, indices = index.search(query_vector, k)

#Print results
print('Query:')
for j in range(k):
    print(f'Neighbor {j}: Index {indices[0][j]}, Distance {distances[0][j]}')

#Access the corresponding DataFrame rows using the retrieved indices.
# This is done once, after the loop (it used to run and print on every
# iteration). Indices that do not address a row of df are skipped: FAISS
# returns -1 when it finds fewer than k neighbours, and df.iloc[-1] would
# silently return the last row.
valid_neighbors = [int(position) for position in indices[0] if 0 <= position < len(df)]
neighbor_rows = df.iloc[valid_neighbors]
print(neighbor_rows)

# Show the text fields of the first article in df
query_vector_text = df.iloc[0][['headline', 'category', 'short_description']]
print(query_vector_text)

## **Using TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Display the preprocessed frame before vectorizing
df

In [None]:
# TF-IDF representation of the headline column, materialised as a dense array
vectorizer = TfidfVectorizer()
headline_tfidf = vectorizer.fit_transform(df['headline'].values)
vector_df = headline_tfidf.toarray()
vector_df

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'df' is your DataFrame
vectorizer = TfidfVectorizer()
vector_df = vectorizer.fit_transform(df['category'].values).toarray()

# Get the words (features) from the vectorizer
words = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame for easier handling
tfidf_df = pd.DataFrame(vector_df, columns=words)

# Set the number of top words to retrieve
top_n = 5  # You can change this to any number of top words you'd like

# Extract the top words per document
top_words_per_doc = {}

for idx, row in tfidf_df.iterrows():
    # Only words that actually occur in the document (score > 0) qualify.
    # Without this filter nlargest pads short documents with zero-score words
    # that do not appear in them at all.
    present_words = row[row > 0]
    top_words = present_words.nlargest(top_n).index.tolist()
    top_words_per_doc[f"Document {idx+1}"] = top_words

# Display the top words for each document
for doc, top_words in top_words_per_doc.items():
    print(f"{doc}: {', '.join(top_words)}")

In [None]:
# Display the frame again before building the combined-text features
df

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Build one text field per article from its headline and short description
df['combined_text'] = df['headline'].str.cat(df['short_description'], sep=' ')

# Fit TF-IDF on the combined text
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(df['combined_text'].values)

# Pairwise cosine similarity between all articles (Y defaults to X)
cosine_sim = cosine_similarity(vectorized_text)

# Define recommendation function with category filtering
def get_recommendations(category, cosine_sim=cosine_sim, df=df):
    """Print up to five articles of ``category`` most similar to its first article.

    The first row of ``df`` whose ``category`` equals the requested one is the
    seed article. The other rows of the same category are ranked by their
    similarity to that seed, and the category and short description of the
    top five are printed.

    Parameters
    ----------
    category : str
        Category value to match exactly against ``df['category']``.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix addressed by row position in ``df``.
    df : pandas.DataFrame, optional
        Frame with ``category`` and ``short_description`` columns.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Row positions (0-based) of the articles in the requested category.
    # The earlier check tested .empty on the full-length boolean mask, which is
    # never empty for a non-empty frame, and then used the first index label of
    # df as the seed regardless of category. cosine_sim is positional, so
    # positions are used here, not index labels.
    matching_positions = (df['category'] == category).to_numpy().nonzero()[0]
    if len(matching_positions) == 0:
        print("No article found with the specified category.")
        return

    # Seed article: the first one that belongs to the category
    idx = int(matching_positions[0])

    # Similarity of every other same-category article to the seed
    sim_scores = [
        (int(position), cosine_sim[idx][position])
        for position in matching_positions
        if position != idx
    ]

    # Highest similarity first; keep the top 5
    recommendations = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:5]

    # Display recommended articles
    if recommendations:
        for i in recommendations:
            #print(f"Title: {df.iloc[i[0]]['headline']}")
            print(f"Category: {df.iloc[i[0]]['category']}")
            print(f"Description: {df.iloc[i[0]]['short_description']}\n")
    else:
        print("No similar articles found within the same category.")

# Test the recommendation function with a specific category
get_recommendations("comedy")

In [None]:
import joblib

# Persist the fitted TF-IDF vectorizer
vectorizer_path = 'vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_path)

# Persist the pairwise cosine similarity matrix
cosine_sim_path = 'cosine_sim.pkl'
joblib.dump(cosine_sim, cosine_sim_path)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Rebuild the combined text field from the headline and short description
df['combined_text'] = df['headline'].str.cat(df['short_description'], sep=' ')

# Restore the saved vectorizer and similarity matrix, in that order
vectorizer, cosine_sim = (
    joblib.load(artifact_path)
    for artifact_path in ('vectorizer.pkl', 'cosine_sim.pkl')
)

# Define recommendation function with category filtering
def get_recommendations(category, cosine_sim=cosine_sim, df=df):
    """Print up to five articles of ``category`` most similar to its first article.

    The first row of ``df`` whose ``category`` equals the requested one is the
    seed article. The other rows of the same category are ranked by their
    similarity to that seed, and the category and short description of the
    top five are printed.

    Parameters
    ----------
    category : str
        Category value to match exactly against ``df['category']``.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix addressed by row position in ``df``.
    df : pandas.DataFrame, optional
        Frame with ``category`` and ``short_description`` columns.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Row positions (0-based) of the articles in the requested category.
    # The earlier check tested .empty on the full-length boolean mask, which is
    # never empty for a non-empty frame, and then used the first index label of
    # df as the seed regardless of category. cosine_sim is positional, so
    # positions are used here, not index labels.
    matching_positions = (df['category'] == category).to_numpy().nonzero()[0]
    if len(matching_positions) == 0:
        print("No article found with the specified category.")
        return

    # Seed article: the first one that belongs to the category
    idx = int(matching_positions[0])

    # Similarity of every other same-category article to the seed
    sim_scores = [
        (int(position), cosine_sim[idx][position])
        for position in matching_positions
        if position != idx
    ]

    # Highest similarity first; keep the top 5
    recommendations = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:5]

    # Display recommended articles
    if recommendations:
        for i in recommendations:
            #print(f"Title: {df.iloc[i[0]]['headline']}")
            print(f"Category: {df.iloc[i[0]]['category']}")
            print(f"Description: {df.iloc[i[0]]['short_description']}\n")
    else:
        print("No similar articles found within the same category.")

# Test the recommendation function with a specific category
get_recommendations("environment")

# **Deployment**

In [None]:
import streamlit as st
import numpy as np
import pandas as pd
#import faiss  # if required
#from your_model_file import load_your_model, recommend_news  # assuming these functions are defined

In [None]:
#!pip install streamlit

In [None]:
# Page title and a one-line description of the app
st.title("News Recommendation System")
st.write("Get personalized news recommendations based on your preferences.")

# Free-text box for the user's interests; its value is stored in user_input
user_input = st.text_input("Enter your interests or topics:")

In [None]:
# st.cache is deprecated; st.cache_resource is its replacement for objects
# such as models. Fall back to st.cache on Streamlit versions that predate it.
_cache_model = getattr(st, "cache_resource", None) or st.cache


@_cache_model
def load_model():
    """Load the recommendation model once and reuse it across reruns.

    Returns
    -------
    object
        Whatever ``load_your_model`` returns for ``"model_path.pkl"``.
    """
    # NOTE(review): load_your_model is not defined in the visible notebook (its
    # import is commented out above) — this raises NameError until the real
    # loader is supplied.
    return load_your_model("model_path.pkl")  # replace with actual loading function

model = load_model()

if st.button("Recommend News"):
    # NOTE(review): recommend_news is likewise a placeholder that is not
    # defined in the visible notebook; each result is read via its 'title'
    # and 'description' keys below.
    recommendations = recommend_news(model, user_input)
    for rec in recommendations:
        st.write(f"Title: {rec['title']}\nDescription: {rec['description']}")