In [1]:
import pandas as pd

In [2]:
behaviors_columns = [
    "Impression ID", "User ID", "Time", "History", "Impressions"
]
behaviors = pd.read_csv('./MINDsmall_train/behaviors.tsv', sep='\t', names=behaviors_columns, header=0)
behaviors.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions
0,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
1,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
2,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
3,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...
4,6,U19739,11/11/2019 6:52:13 PM,N39074 N14343 N32607 N32320 N22007 N442 N19001...,N21119-1 N53696-0 N33619-1 N25722-0 N2869-0


In [3]:
news_columns = [
    "News ID", "Category", "SubCategory", "Title", "Abstract", "URL", 
    "Title Entities", "Abstract Entities"
]
news = pd.read_csv('./MINDsmall_train/news.tsv', sep='\t', names=news_columns, header=0)
news = news.fillna('')
news.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
1,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
2,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
3,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."
4,N2073,sports,football_nfl,Should NFL be able to fine players for critici...,Several fines came down against NFL players fo...,https://assets.msn.com/labs/mind/AAJ4lap.html,"[{""Label"": ""National Football League"", ""Type"":...","[{""Label"": ""National Football League"", ""Type"":..."


In [4]:
len(news), len(behaviors)

(51281, 156964)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Preprocessing the news dataset to combine title and abstract as the input text
news['text'] = news['Title'] + " " + news['Abstract']

# Initializing TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the model and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(news['text'])

# Computing cosine similarity between all pairs of news articles
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get top N most similar articles
def get_recommendations(article_id, cosine_sim=cosine_sim, top_n=5):
    sim_scores = list(enumerate(cosine_sim[article_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    article_indices = [i[0] for i in sim_scores]
    return news.iloc[article_indices]

In [13]:
# Example: Get top 5 similar articles to the first article
recommendations = get_recommendations(0, cosine_sim, top_n=5)
print(recommendations[['News ID', 'Title']])

      News ID                                              Title
153    N60584  Those Grueling Workouts May Not Help You Lose ...
6171   N47331  Discouraged From Trying to Lose Belly Fat and ...
6439   N56301            10 Ways to Burn Belly Fat in 10 Minutes
14022  N29276  3 Foods You Should Eat at Breakfast If You Wan...
291    N16032  If You Have a Slow Metabolism, Here Are 5 Doct...


In [15]:
news.iloc[0]

News ID                                                         N19639
Category                                                        health
SubCategory                                                 weightloss
Title                                    50 Worst Habits For Belly Fat
Abstract             These seemingly harmless habits are holding yo...
URL                      https://assets.msn.com/labs/mind/AAB19MK.html
Title Entities       [{"Label": "Adipose tissue", "Type": "C", "Wik...
Abstract Entities    [{"Label": "Adipose tissue", "Type": "C", "Wik...
text                 50 Worst Habits For Belly Fat These seemingly ...
Name: 0, dtype: object

In [17]:
import numpy as np
from sklearn.metrics import roc_auc_score

In [23]:
# Preprocess impression data into a list of (user history, impression list)
def parse_behaviors(behaviors):
    data = []
    for _, row in behaviors.iterrows():
        history = row['History'].split() if pd.notna(row['History']) else []
        impressions = row['Impressions'].split()
        labels, news_ids = [], []
        for imp in impressions:
            nid, label = imp.split('-')
            news_ids.append(nid)
            labels.append(int(label))
        data.append((history, news_ids, labels))
    return data
parsed_data = parse_behaviors(behaviors)

In [42]:
# Map News ID to its row index in news dataframe
news_index = {nid: idx for idx, nid in enumerate(news['News ID'])}

# Function to rank candidate news articles based on content similarity to user history
def rank_candidates(user_history_ids, candidate_ids):
    # Convert News IDs to indices
    try:
        user_history_indices = [news_index[nid] for nid in user_history_ids if nid in news_index]
        candidate_indices = [news_index[nid] for nid in candidate_ids if nid in news_index]
    except KeyError as e:
        print(f"KeyError: {e} not found in news_index")
        return [0] * len(candidate_ids)  # or some fallback

    # Get TF-IDF vectors
    user_history_vectors = tfidf_matrix[user_history_indices]
    candidate_vectors = tfidf_matrix[candidate_indices]

    # Compute user profile (mean of history vectors)
    user_profile = user_history_vectors.mean(axis=0)

    # Ensure proper shape
    if isinstance(user_profile, np.matrix):
        user_profile = np.asarray(user_profile)

    # Compute cosine similarity
    scores = cosine_similarity(user_profile, candidate_vectors).flatten()
    return scores

# Evaluation metrics
def get_metrics(labels, scores, k=10):
    labels = np.array(labels)
    scores = np.array(scores)
    
    # Sort by scores
    sorted_indices = np.argsort(scores)[::-1]
    sorted_labels = labels[sorted_indices]

    auc = roc_auc_score(labels, scores) if len(set(labels)) > 1 else None

    # MRR
    mrr = 0
    for rank, label in enumerate(sorted_labels, start=1):
        if label == 1:
            mrr = 1 / rank
            break

    # nDCG@K
    def dcg(scores):
        return sum([(2**rel - 1) / np.log2(idx + 2) for idx, rel in enumerate(scores)])

    def ndcg_at_k(k):
        ideal_labels = sorted(labels, reverse=True)
        return dcg(sorted_labels[:k]) / (dcg(ideal_labels[:k]) or 1)

    return auc, mrr, ndcg_at_k(5), ndcg_at_k(10)

In [44]:
total_auc, total_mrr, total_ndcg5, total_ndcg10 = [], [], [], []

for history, candidate_ids, labels in parsed_data:
    if not history or not candidate_ids:
        continue
    scores = rank_candidates(history, candidate_ids)
    auc, mrr, ndcg5, ndcg10 = get_metrics(labels, scores)

    if auc is not None:
        total_auc.append(auc)
    total_mrr.append(mrr)
    total_ndcg5.append(ndcg5)
    total_ndcg10.append(ndcg10)

# Report final results
print("Evaluation Results:")
print(f"AUC: {np.mean(total_auc):.4f}")
print(f"MRR: {np.mean(total_mrr):.4f}")
print(f"nDCG@5: {np.mean(total_ndcg5):.4f}")
print(f"nDCG@10: {np.mean(total_ndcg10):.4f}")

Evaluation Results:
AUC: 0.5908
MRR: 0.3381
nDCG@5: 0.3175
nDCG@10: 0.3732
