In [14]:
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# 2.1 Load raw article metadata (tab-separated, no header row, every column
#     read as a string) and
# 2.2 replicate the preprocessing from Notebook 01: drop rows missing a title
#     or an abstract, then renumber the index from 0 so positional row i can
#     be paired with row i of the TF-IDF matrix.
news = (
    pd.read_csv(
        '../data/news.tsv/news.tsv',
        sep='\t',
        header=None,
        names=['newsID', 'category', 'subcategory', 'title', 'abstract',
               'url', 'entities', 'abstract_entities'],
        dtype=str,
    )
    .dropna(subset=['title', 'abstract'])
    .reset_index(drop=True)
)

# 2.3 Load saved TF-IDF matrix and user profile vector.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that were written by this project's own notebooks.
with open('../results/tfidf_matrix.pkl', 'rb') as f:
    tfidf_matrix = pickle.load(f)
with open('../results/profile_vector.pkl', 'rb') as f:
    profile_vector = pickle.load(f)

# 2.4 Verify that the matrix and the metadata describe the same rows.
# Similarity scores are attached to `news` purely by position, so a row-count
# mismatch means the two artifacts were built from different filtered data.
# Truncating both to the shorter length would NOT restore the pairing (the
# rows that differ can sit anywhere, not just at the end) and would silently
# attach scores to the wrong articles, so fail loudly instead.
n_matrix_rows = tfidf_matrix.shape[0]
n_articles = news.shape[0]
if n_matrix_rows != n_articles:
    raise ValueError(
        f"TF-IDF matrix has {n_matrix_rows} rows, news DataFrame has "
        f"{n_articles} rows. The rows cannot be paired by position; "
        "re-run Notebook 01 to regenerate ../results/tfidf_matrix.pkl from "
        "the same preprocessing used here."
    )

In [16]:
# Score every article against the user profile. cosine_similarity returns a
# 2-D array (profile rows x article rows); flattening it yields a 1-D array
# of scores (one per article when the profile is a single row).
profile_vs_articles = cosine_similarity(profile_vector, tfidf_matrix)
similarities = profile_vs_articles.flatten()

# Store the scores next to the article metadata as a new column
news['similarity'] = similarities

In [17]:
# Rank the articles from most to least similar and print the ten best matches
ranked_news = news.sort_values(by='similarity', ascending=False)
top10 = ranked_news.head(10)
display_columns = ['newsID', 'title', 'similarity']
print(top10[display_columns])

       newsID                                              title  similarity
18377  N34710  Everything We Know About Dinosaur Evolution Ju...    0.295583
12442  N23287  This Arctic Expedition Is Freezing a Ship for ...    0.292887
5878   N14589  Scientists just witnessed the birth of a heavy...    0.290391
28043  N46932  Falling Atoms Are Helping NASA Measure Earth's...    0.290175
31088  N18480  Could Ocean Cleanup's New Interceptor Help Sol...    0.287964
21864   N6738  Azure goes Quantum at Microsoft Ignite 2019, a...    0.276855
9679   N64837  Scientists and researchers reveal 13 dark tech...    0.257430
12694  N11176  Why scientists are so excited about "quantum s...    0.254188
4924   N23131              For girls in science, the time is now    0.253235
46674  N16211  Microsoft AI helps diagnose cervical cancer fa...    0.246589


In [18]:
# Write the similarity scores array to disk with pickle
scores_path = '../results/similarity_scores.pkl'
with open(scores_path, 'wb') as scores_file:
    pickle.dump(similarities, scores_file)
print("Saved similarity_scores.pkl into ../results/")

Saved similarity_scores.pkl into ../results/
