In [24]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score

In [36]:
# Read the MIND-large training split: the impression log and the article
# metadata. Both files are header-less TSVs, so column names are supplied here.
data_path = "data/MINDlarge_train/"

behaviors = pd.read_csv(
    data_path + "behaviors.tsv",
    sep="\t",
    header=None,
    names=["impression_id", "user_id", "time", "history", "impressions"],
)

news = pd.read_csv(
    data_path + "news.tsv",
    sep="\t",
    header=None,
    names=[
        "news_id",
        "category",
        "subcategory",
        "title",
        "abstract",
        "url",
        "title_entities",
        "abstract_entities",
    ],
)

In [3]:
# Preview the first rows of the impression log to check the column layout.
behaviors.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


In [4]:
# Preview the first rows of the article metadata table.
news.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [9]:
# Summarize row count, column dtypes and memory use of the impression log.
behaviors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2232748 entries, 0 to 2232747
Data columns (total 5 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   impression_id  int64 
 1   user_id        object
 2   time           object
 3   history        object
 4   impressions    object
dtypes: int64(1), object(4)
memory usage: 85.2+ MB


In [13]:
# Summarize column dtypes and non-null counts of the article metadata.
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101527 entries, 0 to 101526
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   news_id            101527 non-null  object
 1   category           101527 non-null  object
 2   subcategory        101527 non-null  object
 3   title              101527 non-null  object
 4   abstract           96112 non-null   object
 5   url                101527 non-null  object
 6   title_entities     101524 non-null  object
 7   abstract_entities  101521 non-null  object
dtypes: object(8)
memory usage: 6.2+ MB


In [14]:
# Number of impressions whose user has no recorded click history.
# Series.sum() counts the True values in vectorized code instead of iterating
# every row in the interpreter; int() keeps the displayed value a plain integer.
int(behaviors["history"].isna().sum())

46065

In [6]:
# Represent every article by the TF-IDF weights of its title. max_features=500
# caps the vocabulary, so each article becomes a 500-dimensional sparse row.
vectorizer = TfidfVectorizer(max_features=500)
title_corpus = news["title"]
tfidf_matrix = vectorizer.fit_transform(title_corpus)  # one row per article, in `news` order

# Row position of each article inside tfidf_matrix, keyed by its news_id.
news_id_to_idx = dict(zip(news["news_id"], range(len(news))))

In [16]:
# Sanity check of the matrix dimensions: (number of articles, number of TF-IDF features).
tfidf_matrix.shape

(101527, 500)

In [17]:
# Width of the TF-IDF feature space; later cells use it to size the all-zero
# profile given to users without usable history.
# NOTE(review): this is a snapshot - it is not refreshed when tfidf_matrix is reassigned.
num_features = tfidf_matrix.shape[1]

In [21]:
# Build one content profile per user: the mean TF-IDF vector of every article
# that appears in any of that user's logged click histories.
user_articles = defaultdict(set)  # user_id -> set of row indices into tfidf_matrix

# Walking the two columns directly avoids DataFrame.iterrows(), which builds a
# Series object per row and is very slow on millions of impressions.
for user, history in zip(behaviors["user_id"], behaviors["history"]):
    if pd.isna(history):  # this impression carries no click history
        continue
    # `history` is a whitespace-separated list of news ids. Ids that have no row
    # in tfidf_matrix are ignored. A malformed (non-string) value now raises
    # instead of silently aborting the loop and leaving the profiles incomplete.
    user_articles[user] |= {
        news_id_to_idx[nid] for nid in history.split() if nid in news_id_to_idx
    }

# Size the cold-start vector from the matrix itself so it always matches the
# feature space the profiles are compared against.
profile_dim = tfidf_matrix.shape[1]

user_profiles = {}  # user_id -> dense 1-D profile vector
for user in behaviors["user_id"].unique():
    article_indices = user_articles[user]
    if article_indices:
        article_vectors = tfidf_matrix[list(article_indices)].toarray()
        user_profiles[user] = np.mean(article_vectors, axis=0)
    else:
        user_profiles[user] = np.zeros(profile_dim)  # cold-start fallback

In [23]:
def recommend(u, k=10, catalog=None):
    """Return the ``k`` articles whose title vectors are most similar to user ``u``.

    Similarity is the cosine between the user's profile vector and every row
    of ``tfidf_matrix``.

    Parameters
    ----------
    u : hashable
        User id. A user without an entry in ``user_profiles`` gets an all-zero
        profile, so the resulting ranking carries no preference information.
    k : int, default 10
        Number of articles to return.
    catalog : pandas.DataFrame, optional
        Article table whose row order matches ``tfidf_matrix``. Defaults to
        ``news``. Once ``tfidf_matrix`` has been refit on a different table
        (for example ``combined_news``), pass that table here; otherwise the
        selected row positions may not exist in ``news``.

    Returns
    -------
    pandas.DataFrame
        The ``news_id`` and ``title`` columns of the selected rows, most
        similar first.
    """
    if catalog is None:
        catalog = news

    profile = user_profiles.get(u)
    if profile is None:
        # Size the fallback from the matrix so it always matches its width.
        profile = np.zeros(tfidf_matrix.shape[1])

    similarities = cosine_similarity(profile.reshape(1, -1), tfidf_matrix)[0]
    top_indices = similarities.argsort()[::-1][:k]  # descending similarity
    return catalog.iloc[top_indices][["news_id", "title"]]




In [44]:
def get_scores(u, impression):
    """Score every candidate article of one impression for user ``u``.

    Parameters
    ----------
    u : hashable
        User id. A user without an entry in ``user_profiles`` is scored with
        an all-zero profile.
    impression : sequence of str
        News ids shown in the impression, in display order.

    Returns
    -------
    numpy.ndarray
        One cosine-similarity score per entry of ``impression``, in the same
        order, so the result can be paired positionally with the click labels.
        Ids that have no row in ``tfidf_matrix`` receive a score of 0.0
        instead of being dropped, which would shift every later score onto
        the wrong label.
    """
    profile = user_profiles.get(u)
    if profile is None:
        profile = np.zeros(tfidf_matrix.shape[1])
    user_vector = profile.reshape(1, -1)

    scores = np.zeros(len(impression))

    # (position in impression, row in tfidf_matrix) for every id we can look up.
    known = [
        (pos, news_id_to_idx[nid])
        for pos, nid in enumerate(impression)
        if nid in news_id_to_idx
    ]
    if not known:
        # Nothing to compare against; cosine_similarity rejects empty input.
        return scores

    positions, rows = zip(*known)
    article_vectors = tfidf_matrix[list(rows)]  # one row per known candidate
    scores[list(positions)] = cosine_similarity(user_vector, article_vectors)[0]
    return scores

In [45]:
def mrr_score(labels, scores):
    """Reciprocal rank of the best-ranked clicked item.

    Candidates are ordered by descending ``scores``; the result is
    ``1 / rank`` (1-based) of the first candidate whose label equals 1, or
    0.0 when no candidate has label 1.
    """
    order = np.argsort(scores)[::-1]
    hit_ranks = (
        position
        for position, idx in enumerate(order, start=1)
        if labels[idx] == 1
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit

def ndcg_score(labels, scores, k=5):
    """Normalized discounted cumulative gain over the top ``k`` candidates.

    Candidates are ordered by descending ``scores``. Each of the first ``k``
    contributes ``label / log2(rank + 1)`` with a 1-based rank. The sum is
    divided by the DCG of an ideal ordering in which ``min(sum(labels), k)``
    items of gain 1 occupy the top ranks. Returns 0.0 when that ideal DCG is
    not positive.
    """
    top_k = np.argsort(scores)[::-1][:k]

    dcg = 0.0
    for discount_arg, idx in enumerate(top_k, start=2):
        dcg += labels[idx] / np.log2(discount_arg)

    n_ideal_hits = min(sum(labels), k)
    ideal_dcg = sum(
        [1.0 / np.log2(discount_arg) for discount_arg in range(2, n_ideal_hits + 2)]
    )

    if ideal_dcg > 0:
        return dcg / ideal_dcg
    return 0.0

In [29]:
# Lists to collect scores for all impressions
# Per-impression metric accumulators: the evaluation loop appends one value per
# scored impression, and the results cell reports their means.
all_auc, all_mrr, all_ndcg5, all_ndcg10= [], [], [], []

In [28]:
# Turn the raw training impression log into (user, candidate ids, click labels) triples.
users = behaviors["user_id"]

# Each impression cell is a whitespace-separated list of "<news_id>-<label>" tokens.
raw_impressions = behaviors["impressions"]

# Candidate news ids, in display order.
impressions = raw_impressions.map(
    lambda cell: [token.split("-")[0] for token in cell.split()]
)

# Matching click labels as integers.
labels = raw_impressions.map(
    lambda cell: [int(token.split("-")[1]) for token in cell.split()]
)

# One (user_id, [news_id, ...], [label, ...]) tuple per impression.
user_impression_data = list(zip(users, impressions, labels))


In [32]:
# Score every impression and accumulate the per-impression ranking metrics.
for user, impression, label in user_impression_data:
    n_clicked = sum(label)
    # Cheap checks first: a single candidate cannot be ranked, and an impression
    # without a click has no positive to rank.
    if len(impression) < 2 or n_clicked == 0:
        continue

    scores = get_scores(user, impression)
    if len(scores) != len(label):
        # The scorer returned fewer scores than candidates (for example for ids
        # it could not look up). Such scores cannot be matched to labels by
        # position, so the impression is skipped rather than evaluated on
        # misaligned pairs.
        continue

    # ROC AUC needs both classes; it is undefined when every candidate was
    # clicked. Testing for that explicitly replaces a blanket
    # `except Exception: pass`, so genuine errors are no longer hidden.
    if n_clicked < len(label):
        all_auc.append(roc_auc_score(label, scores))
    all_mrr.append(mrr_score(label, scores))
    all_ndcg5.append(ndcg_score(label, scores, k=5))
    all_ndcg10.append(ndcg_score(label, scores, k=10))

In [33]:
# Report each metric averaged over all evaluated impressions.
for metric_label, per_impression_values in (
    ("AUC:", all_auc),
    ("MRR:", all_mrr),
    ("nDCG@5:", all_ndcg5),
    ("nDCG@10:", all_ndcg10),
):
    print(metric_label, np.mean(per_impression_values))

AUC: 0.5433194661590481
MRR: 0.2880567787139299
nDCG@5: 0.2694343397608446
nDCG@10: 0.32650412394963335


In [35]:
# Article metadata of the dev split, read with the same column layout as the
# training news table.
dev_news = pd.read_csv(
    "data/MINDlarge_dev/news.tsv",
    sep="\t",
    header=None,
    names=[
        "news_id",
        "category",
        "subcategory",
        "title",
        "abstract",
        "url",
        "title_entities",
        "abstract_entities",
    ],
)


In [38]:
# Union of the train and dev article tables with one row per news_id; the first
# occurrence is kept, so a train row wins over a duplicate dev row.
# reset_index gives the result a clean 0..n-1 index. Without it the concatenated
# frame keeps both inputs' original labels, so labels repeat and no longer equal
# the row positions that the TF-IDF matrix and news_id_to_idx are built from.
combined_news = (
    pd.concat([news, dev_news])
    .drop_duplicates(subset=["news_id"])
    .reset_index(drop=True)
)

In [39]:
# Refit TF-IDF on the titles of the combined train+dev catalogue so that
# articles appearing only in the dev split also get a vector.
vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = vectorizer.fit_transform(combined_news["title"])  # one row per article, in `combined_news` order

# Keep the cached feature count in sync with the refitted matrix; later cells
# still read `num_features` when building zero vectors.
num_features = tfidf_matrix.shape[1]

# Row position of each article inside tfidf_matrix, keyed by its news_id.
news_id_to_idx = {nid: i for i, nid in enumerate(combined_news["news_id"])}

In [40]:
# Rebuild the user profiles against the refitted TF-IDF matrix. Only the
# TRAINING impression log (`behaviors`) contributes click histories.
user_articles = defaultdict(set)  # user_id -> set of row indices into tfidf_matrix

# Walking the two columns directly avoids DataFrame.iterrows(), which builds a
# Series object per row and is very slow on millions of impressions.
for user, history in zip(behaviors["user_id"], behaviors["history"]):
    if pd.isna(history):  # this impression carries no click history
        continue
    # `history` is a whitespace-separated list of news ids. Ids that have no row
    # in tfidf_matrix are ignored. A malformed (non-string) value now raises
    # instead of silently aborting the loop and leaving the profiles incomplete.
    user_articles[user] |= {
        news_id_to_idx[nid] for nid in history.split() if nid in news_id_to_idx
    }

# Take the width from the current matrix rather than from `num_features`, which
# was computed from an earlier matrix and may be stale after the refit.
profile_dim = tfidf_matrix.shape[1]

user_profiles = {}  # user_id -> dense 1-D profile vector
for user in behaviors["user_id"].unique():
    article_indices = user_articles[user]
    if article_indices:
        article_vectors = tfidf_matrix[list(article_indices)].toarray()
        user_profiles[user] = np.mean(article_vectors, axis=0)
    else:
        user_profiles[user] = np.zeros(profile_dim)  # cold-start fallback

In [41]:
# Start from empty accumulators so the dev-set evaluation does not mix with the
# values appended during the training-set evaluation.
all_auc, all_mrr, all_ndcg5, all_ndcg10= [], [], [], []

In [42]:
# Impression log of the dev split, read with the same column layout as the
# training log.
behaviors_dev = pd.read_csv(
    "data/MINDlarge_dev/behaviors.tsv",
    sep="\t",
    header=None,
    names=["impression_id", "user_id", "time", "history", "impressions"],
)


In [43]:
# Turn the raw dev impression log into (user, candidate ids, click labels) triples.
users = behaviors_dev["user_id"]

# Each impression cell is a whitespace-separated list of "<news_id>-<label>" tokens.
raw_impressions = behaviors_dev["impressions"]

# Candidate news ids, in display order.
impressions = raw_impressions.map(
    lambda cell: [token.split("-")[0] for token in cell.split()]
)

# Matching click labels as integers.
labels = raw_impressions.map(
    lambda cell: [int(token.split("-")[1]) for token in cell.split()]
)

# One (user_id, [news_id, ...], [label, ...]) tuple per impression.
user_impression_data = list(zip(users, impressions, labels))

In [46]:
def get_scores(u, impression):
    """Score every candidate article of one impression for user ``u``.

    Parameters
    ----------
    u : hashable
        User id. A user without an entry in ``user_profiles`` is scored with
        an all-zero profile.
    impression : sequence of str
        News ids shown in the impression, in display order.

    Returns
    -------
    numpy.ndarray
        One cosine-similarity score per entry of ``impression``, in the same
        order, so the result can be paired positionally with the click labels.
        Ids that have no row in ``tfidf_matrix`` receive a score of 0.0
        instead of being dropped, which would shift every later score onto
        the wrong label.
    """
    profile = user_profiles.get(u)
    if profile is None:
        # Size the fallback from the current matrix; a cached feature count
        # could be stale after the matrix is refit.
        profile = np.zeros(tfidf_matrix.shape[1])
    user_vector = profile.reshape(1, -1)

    scores = np.zeros(len(impression))

    # (position in impression, row in tfidf_matrix) for every id we can look up.
    known = [
        (pos, news_id_to_idx[nid])
        for pos, nid in enumerate(impression)
        if nid in news_id_to_idx
    ]
    if not known:
        # Nothing to compare against; cosine_similarity rejects empty input.
        return scores

    positions, rows = zip(*known)
    article_vectors = tfidf_matrix[list(rows)]  # one row per known candidate
    scores[list(positions)] = cosine_similarity(user_vector, article_vectors)[0]
    return scores

def mrr_score(labels, scores):
    """Reciprocal rank of the best-ranked clicked item.

    Candidates are ordered by descending ``scores``; the result is
    ``1 / rank`` (1-based) of the first candidate whose label equals 1, or
    0.0 when no candidate has label 1.
    """
    order = np.argsort(scores)[::-1]
    hit_ranks = (
        position
        for position, idx in enumerate(order, start=1)
        if labels[idx] == 1
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit

def ndcg_score(labels, scores, k=5):
    """Normalized discounted cumulative gain over the top ``k`` candidates.

    Candidates are ordered by descending ``scores``. Each of the first ``k``
    contributes ``label / log2(rank + 1)`` with a 1-based rank. The sum is
    divided by the DCG of an ideal ordering in which ``min(sum(labels), k)``
    items of gain 1 occupy the top ranks. Returns 0.0 when that ideal DCG is
    not positive.
    """
    top_k = np.argsort(scores)[::-1][:k]

    dcg = 0.0
    for discount_arg, idx in enumerate(top_k, start=2):
        dcg += labels[idx] / np.log2(discount_arg)

    n_ideal_hits = min(sum(labels), k)
    ideal_dcg = sum(
        [1.0 / np.log2(discount_arg) for discount_arg in range(2, n_ideal_hits + 2)]
    )

    if ideal_dcg > 0:
        return dcg / ideal_dcg
    return 0.0

In [47]:
# Score every dev impression and accumulate the per-impression ranking metrics.
for user, impression, label in user_impression_data:
    n_clicked = sum(label)
    # Cheap checks first: a single candidate cannot be ranked, and an impression
    # without a click has no positive to rank.
    if len(impression) < 2 or n_clicked == 0:
        continue

    scores = get_scores(user, impression)
    if len(scores) != len(label):
        # The scorer returned fewer scores than candidates (for example for ids
        # it could not look up). Such scores cannot be matched to labels by
        # position, so the impression is skipped rather than evaluated on
        # misaligned pairs.
        continue

    # ROC AUC needs both classes; it is undefined when every candidate was
    # clicked. Testing for that explicitly replaces a blanket
    # `except Exception: pass`, so genuine errors are no longer hidden.
    if n_clicked < len(label):
        all_auc.append(roc_auc_score(label, scores))
    all_mrr.append(mrr_score(label, scores))
    all_ndcg5.append(ndcg_score(label, scores, k=5))
    all_ndcg10.append(ndcg_score(label, scores, k=10))

In [48]:
# Report each dev-set metric averaged over all evaluated impressions.
for metric_label, per_impression_values in (
    ("AUC:", all_auc),
    ("MRR:", all_mrr),
    ("nDCG@5:", all_ndcg5),
    ("nDCG@10:", all_ndcg10),
):
    print(metric_label, np.mean(per_impression_values))

AUC: 0.5463529382705986
MRR: 0.2813361078733785
nDCG@5: 0.2611318886150245
nDCG@10: 0.3225301851463481
