In [1]:
import pandas as pd
# Location of the news dataset; the name must match the dataset file on disk
DATA_PATH = 'news_data.csv'
# Read the CSV into a DataFrame
df = pd.read_csv(DATA_PATH)
# Preview the first 5 rows as a quick check that the file loaded correctly
df.head(5)

Unnamed: 0,Source,Title,Published,Author,Link,Summary,Published_Datetime
0,Wired,"Best Travel Cameras (2026), Tested and Reviewed","Mon, 12 Jan 2026 12:00:00 +0000",Scott Gilbertson,https://www.wired.com/gallery/best-travel-came...,We’ve found the best travel cameras—from point...,2026-01-12 12:00:00+00:00
1,Wired,"10 Best Pet Cameras (2026), Tested With Our Pets","Mon, 12 Jan 2026 11:34:00 +0000",Molly Higgins,https://www.wired.com/gallery/best-pet-cameras/,"Whether you’re near or far, keep an eye on you...",2026-01-12 11:34:00+00:00
2,Ars Technica,The most fascinating monitors at CES 2026,"Mon, 12 Jan 2026 11:30:06 +0000",Scharon Harding,https://arstechnica.com/gadgets/2026/01/the-mo...,"Big sizes, big resolution, and big ideas.",2026-01-12 11:30:06+00:00
3,Wired,Hyte X50 PC Case Review: Computers Should Be Cute,"Mon, 12 Jan 2026 11:00:00 +0000",Brad Bourque,https://www.wired.com/review/hyte-x50/,"Hyte’s rounded glass case brings a fun, bubbly...",2026-01-12 11:00:00+00:00
4,TechCrunch,These Gen Zers just raised $11.75M to put Afri...,"Mon, 12 Jan 2026 08:00:00 +0000",Dominic-Madori Davis,https://techcrunch.com/2026/01/12/these-gen-ze...,African defense company Terra comes out of ste...,2026-01-12 08:00:00+00:00


In [5]:
# Work on a copy so the originally loaded `df` stays untouched
df_processed = df.copy()
# Replace missing titles and summaries with empty strings.
# String concatenation with a missing value yields a missing value, so a single
# absent Title would otherwise turn that row's combined text into NaN, which is
# not valid text input for the TF-IDF step that consumes this column.
df_processed['Title'] = df_processed['Title'].fillna('')
df_processed['Summary'] = df_processed['Summary'].fillna('')
# Combine Title and Summary into a single text column
df_processed['combined_features'] = df_processed['Title'] + " " + df_processed['Summary']
# Look at the first entry of the new column
print(df_processed['combined_features'].iloc[0])
# Show the first 5 rows to confirm the new column is present
df_processed.head()

Best Travel Cameras (2026), Tested and Reviewed We’ve found the best travel cameras—from point-and-shoot to full-frame—to help you bring home the perfect vacation photos.


Unnamed: 0,Source,Title,Published,Author,Link,Summary,Published_Datetime,combined_features
0,Wired,"Best Travel Cameras (2026), Tested and Reviewed","Mon, 12 Jan 2026 12:00:00 +0000",Scott Gilbertson,https://www.wired.com/gallery/best-travel-came...,We’ve found the best travel cameras—from point...,2026-01-12 12:00:00+00:00,"Best Travel Cameras (2026), Tested and Reviewe..."
1,Wired,"10 Best Pet Cameras (2026), Tested With Our Pets","Mon, 12 Jan 2026 11:34:00 +0000",Molly Higgins,https://www.wired.com/gallery/best-pet-cameras/,"Whether you’re near or far, keep an eye on you...",2026-01-12 11:34:00+00:00,"10 Best Pet Cameras (2026), Tested With Our Pe..."
2,Ars Technica,The most fascinating monitors at CES 2026,"Mon, 12 Jan 2026 11:30:06 +0000",Scharon Harding,https://arstechnica.com/gadgets/2026/01/the-mo...,"Big sizes, big resolution, and big ideas.",2026-01-12 11:30:06+00:00,The most fascinating monitors at CES 2026 Big ...
3,Wired,Hyte X50 PC Case Review: Computers Should Be Cute,"Mon, 12 Jan 2026 11:00:00 +0000",Brad Bourque,https://www.wired.com/review/hyte-x50/,"Hyte’s rounded glass case brings a fun, bubbly...",2026-01-12 11:00:00+00:00,Hyte X50 PC Case Review: Computers Should Be C...
4,TechCrunch,These Gen Zers just raised $11.75M to put Afri...,"Mon, 12 Jan 2026 08:00:00 +0000",Dominic-Madori Davis,https://techcrunch.com/2026/01/12/these-gen-ze...,African defense company Terra comes out of ste...,2026-01-12 08:00:00+00:00,These Gen Zers just raised $11.75M to put Afri...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The text to vectorize: one combined Title + Summary string per article
corpus = df_processed['combined_features']
# Build the TF-IDF vectorizer; stop_words='english' makes it ignore
# common English words such as 'the', 'is', 'and'
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the corpus and encode every article as a row of numbers
tfidf_matrix = tfidf.fit_transform(corpus)
# Report the matrix dimensions: (number of articles, number of distinct terms kept)
print(f"Matrix Shape: {tfidf_matrix.shape}")

Matrix Shape: (100, 1252)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity: a square table in which every article is
# compared to every other article (and to itself)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Let's test it! Pick the first article (index 0)
# and find the 3 most similar articles.
article_index = 0
# Pair every article position with its similarity to the selected article
sim_scores = list(enumerate(cosine_sim[article_index]))
# Sort them based on similarity score (highest first)
sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)
# Take the top 3, excluding the selected article by its index rather than by
# assuming it is the first entry: a duplicate article with a tied score, or an
# article whose TF-IDF row is all zeros, means the article itself is not
# guaranteed to sort to position 0.
top_3 = [idx for idx, _ in sim_scores if idx != article_index][:3]
print(f"Selected Article: {df['Title'].iloc[article_index]}")
print("---")
print("Recommendations:")
for i in top_3:
    print(f"- {df['Title'].iloc[i]}")

Selected Article: Best Travel Cameras (2026), Tested and Reviewed
---
Recommendations:
- Best Bird Feeders With Cameras, Tested and Reviewed (2026)
- 10 Best Pet Cameras (2026), Tested With Our Pets
- Best Side-Sleeper Mattresses 2026: Picked by a Sleep Science Coach
