In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np

In [20]:
# Connect to the MongoDB server listening on localhost:27017.
client = MongoClient(host="localhost", port=27017)

In [21]:
# Select the database and the collection that the following cell loads
# into a DataFrame.
DATABASE_NAME, ARTICLES_COLLECTION = "NewsDatabase", "FullArticles"
db = client[DATABASE_NAME]
collection = db[ARTICLES_COLLECTION]

In [22]:
# Load every document of the collection into a DataFrame.
#
# The projection {"_id": 0} makes the server omit MongoDB's "_id" field, so
# there is no column to drop afterwards -- which also means an empty
# collection no longer raises KeyError on a missing "_id" column.
# list(cursor) is used instead of Cursor.to_list() because the latter is only
# available in recent PyMongo releases.
df = pd.DataFrame(list(collection.find({}, {"_id": 0})))

In [23]:
# Preview the loaded articles (the rendered table elides the middle rows).
df

Unnamed: 0,title,link,news
0,Will launch door-to-door signature campaign fo...,https://www.thehindu.com/news/national/jammu-a...,Chief Minister Omar Abdullah on Friday (August...
1,"Farmers, yoga volunteers special guests for P...",https://www.thehindu.com/news/national/farmers...,Recognising their contributions in the fields ...
2,‘Super Six’ poll promises in Andhra Pradesh ‘s...,https://www.thehindu.com/news/national/andhra-...,Slamming the previous YSRCP government for all...
3,Water level of River Yamuna to breach the ‘war...,https://www.thehindu.com/news/cities/Delhi/wat...,The water level of the Yamuna river in Delhi b...
4,I-Day celebrations | Telangana CM requests Cen...,https://www.thehindu.com/news/national/telanga...,Telangana Chief Minister A. Revanth Reddy has ...
...,...,...,...
503,Predator fly unleashes chaotic ant wars that c...,https://www.sciencedaily.com/releases/2025/08/...,To help manage agricultural practices with few...
504,Hubble spots a nearly invisible galaxy hiding ...,https://www.sciencedaily.com/releases/2025/08/...,This NASA/ESA Hubble Space Telescope Picture o...
505,NASA’s Curiosity Mars rover just learned how t...,https://www.sciencedaily.com/releases/2025/08/...,New capabilities allow the rover to do science...
506,Stark images show water's role in human strife...,https://www.newscientist.com/article/mg2673556...,As part of the exhibition Thirst: In Search of...


In [24]:
# Take an independent copy of the article bodies. Without .copy(), `data`
# aliases the df["news"] column, so the element-wise writes performed on
# `data` further down would write through to `df` (or trigger pandas'
# chained-assignment warnings, depending on the pandas version).
data = df["news"].copy()
data

0      Chief Minister Omar Abdullah on Friday (August...
1      Recognising their contributions in the fields ...
2      Slamming the previous YSRCP government for all...
3      The water level of the Yamuna river in Delhi b...
4      Telangana Chief Minister A. Revanth Reddy has ...
                             ...                        
503    To help manage agricultural practices with few...
504    This NASA/ESA Hubble Space Telescope Picture o...
505    New capabilities allow the rover to do science...
506    As part of the exhibition Thirst: In Search of...
507    From DNA sequencing to rainbows, the world of ...
Name: news, Length: 508, dtype: object

In [25]:
import re
import string
from nltk.stem import PorterStemmer

def preprocess_article(text, stemmer, punct_table):
    """Normalise one article body for bag-of-words vectorisation.

    Deletes the characters in ``string.punctuation``, removes runs of digits,
    splits on whitespace and stems every remaining token.

    Args:
        text: Raw article text. Non-string values (e.g. NaN for a missing
            body) produce an empty string instead of raising.
        stemmer: Object exposing ``stem(word) -> str``.
        punct_table: ``str.translate`` table that deletes punctuation.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    text_no_punct = text.translate(punct_table)
    text_no_numbers = re.sub(r'\d+', '', text_no_punct)
    return ' '.join(stemmer.stem(word) for word in text_no_numbers.split())


# Build the loop-invariant helpers once rather than once per article.
stemmer = PorterStemmer()
punct_table = str.maketrans('', '', string.punctuation)

# Produce a fresh Series in one pass. The previous element-by-element
# `data[i] = ...` wrote into a Series that aliased df["news"] and used
# label-based lookup with positional counters; mapping avoids both problems
# and leaves the raw text in `df` untouched.
data = df["news"].map(lambda art: preprocess_article(art, stemmer, punct_table))

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# Term-count matrix over the preprocessed articles; English stop words are
# dropped by the vectorizer.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data)

# Show the learned vocabulary and the (dense) count matrix.
vocabulary = vectorizer.get_feature_names_out()
dense_counts = X.toarray()
print(vocabulary)
print(dense_counts)

['aadhaar' 'aaduvom' 'aaduvoma' ... 'čad' 'šeško' '𝙏𝙝𝙚𝙎𝙁𝙉𝙞𝙣𝙚𝙧𝙨']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [27]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of rows of X (Y=None compares X with
# itself; the result is returned as a dense array).
similarity_matrix = cosine_similarity(X, Y=None, dense_output=True)

In [28]:
# Show the article-by-article similarity matrix (NumPy elides the middle of
# large arrays when printing).
print(similarity_matrix)

[[1.         0.12253833 0.25601676 ... 0.0519553  0.02911643 0.03641867]
 [0.12253833 1.         0.28452132 ... 0.05364799 0.04073328 0.02426141]
 [0.25601676 0.28452132 1.         ... 0.1094837  0.07444541 0.09720892]
 ...
 [0.0519553  0.05364799 0.1094837  ... 1.         0.09178087 0.13490207]
 [0.02911643 0.04073328 0.07444541 ... 0.09178087 1.         0.08508807]
 [0.03641867 0.02426141 0.09720892 ... 0.13490207 0.08508807 1.        ]]


In [29]:
# Switch to the user collection and fetch one document that has a
# "likedArticles" field.
collection = db["User"]
source = collection.find_one({"likedArticles": {"$exists": True}})

# find_one returns None when no document matches, and the field itself may be
# stored as null; fall back to an empty list in both cases so the later
# membership test does not fail.
likedArticles = (source or {}).get("likedArticles") or []

In [30]:
# Fetch one document that has a "dislikedArticles" field.
# NOTE(review): this is an independent query, so with several user documents
# it may match a different user than the liked-articles lookup -- confirm
# that a single user document is intended.
source = collection.find_one({"dislikedArticles": {"$exists": True}})

# find_one returns None when no document matches, and the field itself may be
# stored as null; fall back to an empty list in both cases.
dislikedArticles = (source or {}).get("dislikedArticles") or []

In [39]:
def _mean_count_vector(counts, row_indexes):
    """Return the column-wise mean of ``counts`` over ``row_indexes``.

    Falls back to an all-zero row of shape (1, n_features) when
    ``row_indexes`` is empty, because the mean over zero rows is undefined.
    """
    if len(row_indexes) == 0:
        return np.zeros((1, counts.shape[1]))
    return counts[row_indexes].mean(axis=0)


# Rows of df (and therefore of X) whose link the user liked / disliked.
likedIndexes = df[df["link"].isin(likedArticles)].index
dislikedIndexes = df[df["link"].isin(dislikedArticles)].index

# Both guards now test the *matched rows*. The disliked branch previously
# tested len(dislikedArticles), so a non-empty dislike list with no matching
# article averaged over zero rows.
meanlikedIndexes = _mean_count_vector(X, likedIndexes)
meandislikedIndexes = _mean_count_vector(X, dislikedIndexes)
    

In [40]:
# User profile: mean vector of liked articles minus mean vector of disliked
# ones, so terms from disliked articles pull the profile down.
user_profile = np.subtract(meanlikedIndexes, meandislikedIndexes)
user_profile

matrix([[0.        , 0.06666667, 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [41]:
TOP_N = 10  # number of recommendations to report

# Score every article against the user profile. cosine_similarity accepts the
# sparse count matrix directly, so X does not need to be densified first.
# np.asarray turns the profile into a plain 2-D ndarray.
similarities = cosine_similarity(np.asarray(user_profile), X).flatten()

# Never recommend an article the user already liked. -inf is used as the
# sentinel instead of -1 because the profile can contain negative components
# (liked minus disliked), which makes -1 a reachable cosine value.
similarities[np.asarray(likedIndexes, dtype=int)] = -np.inf
# NOTE(review): disliked articles are not masked out here and can still be
# recommended if they score highly -- confirm whether that is intended.

# Rank once (best first) and reuse the ordering for both indices and scores.
ranking = similarities.argsort()[::-1]
top_indices = ranking[:TOP_N]
top_indices_scores = similarities[top_indices]
print("Recommended articles:", top_indices)
print("scores",top_indices_scores)


Recommended articles: [414  54 116 203 346 178 398 328  17  67]
scores [0.06234231 0.04944711 0.04487081 0.04350086 0.04242098 0.03772511
 0.03722355 0.03603633 0.03407653 0.03373471]


In [42]:
# List the titles of the articles the user liked, numbered from 1.
for rank, row_pos in enumerate(likedIndexes, start=1):
    liked_title = df["title"].iloc[row_pos]
    print(rank, liked_title)

1 PM used Independence Day speech to target 'new enemy' in form of infiltrators: TMC 
2 Hyderabad | Show-cause notices issued to Oakridge International School for parking school vehicles on public roads
3 307 people evacuated from Hyderabad’s low-lying areas as water gushes from Himayatsagar 
4 How Tamil cinema plunged into freedom movement after the advent of talkies 
5 Golconda Fort gets rainproof makeover for Independence Day celebrations; security tightened across Hyderabad
6 India vs Australia hockey; Gukesh at St. Louis; Chennai Grand Masters concludes on Independence Day: Indian Sports LIVE, August 15
7 Which is chess' most important format? Frequent switching raises questions
8 FIDE Chess rankings for August: Divya Deshmukh reaches career-high rank of 15
9 Cornell  researchers build first ‘microwave brain’ on a chip
10 The disappearing planet next door has astronomers intrigued
11 I-Day celebrations | CM Revanth reflects on achievements, plan to transform Telangana
12 The surpr

In [44]:
# List the titles of the recommended articles, best match first, numbered from 1.
for rank, row_pos in enumerate(top_indices, start=1):
    recommended_title = df["title"].iloc[row_pos]
    print(rank, recommended_title)

1 A planet the size of Saturn could orbit the nearest sun-like star
2 78 Years of Freedom: A collection of stories commemorating the progress of nation
3 Telangana to soon have Tourist police across major attractions in State 
4 Students’ unions demand A.P. government to conduct elections in educational institutions
5 How NASA’s Lunar Trailblazer was lost before reaching the Moon
6 Telangana Rain | Half day for schools in GHMC limits, full holiday in five districts on August 13 and 14
7 These ants are one of the most effective teams in the natural world
8 Voyager missed it, but now we know Uranus has a fiery secret
9 Omar attends Independence Day event in Srinagar, first elected CM in eight years to unfurl tricolour in J&K
10 Tricolours made by special needs children to flutter at Madin campus


In [45]:
# Close the MongoDB client opened at the top of the notebook.
client.close()