In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, ndcg_score

# load behaviors data
def load_behaviors_sample(file_path, sample_frac=0.1, random_state=None):
    behaviors = pd.read_csv(file_path, sep='\t', header=None,
                            names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
    behaviors_sample = behaviors.sample(frac=sample_frac, random_state=random_state).reset_index(drop=True)
    return behaviors_sample
 
# build interaction matrix
def build_interaction_matrix(behaviors):
    user_item_pairs = []
    for _, row in behaviors.iterrows():
        user = row['UserID']
        impressions = row['Impressions'].split()
        for imp in impressions:
            news_id, label = imp.split('-')
            user_item_pairs.append((user, news_id, int(label)))
    interaction_df = pd.DataFrame(user_item_pairs, columns=['UserID', 'NewsID', 'Label'])
    return interaction_df

# load news
def load_news(file_path):
    news_df = pd.read_csv(file_path, sep='\t', header=None,
                          names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Entity', 'AbstractEntities'])
    return news_df

# category one hot encoding
def encode_news_category(news_df):
    categories = pd.get_dummies(news_df['Category'])
    categories.index = news_df['NewsID']
    news_category_map = {news_id: categories.loc[news_id].values for news_id in news_df['NewsID']}
    category_dim = categories.shape[1]
    return news_category_map, category_dim

# load entity embedding
def load_entity_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            key = parts[0]
            vec = np.array([float(x) for x in parts[1:]])
            embeddings[key] = vec
    return embeddings

# compute news entity vectors
def compute_news_entity_vectors(news_df, entity_embeddings):
    news_entity_vecs = {}
    for row in news_df.itertuples():
        entity_ids = str(row.Entity).split(';') if pd.notna(row.Entity) else []
        vecs = [entity_embeddings[eid] for eid in entity_ids if eid in entity_embeddings]
        if vecs:
            mean_vec = np.mean(vecs, axis=0)
            news_entity_vecs[row.NewsID] = mean_vec
        else:
            news_entity_vecs[row.NewsID] = np.zeros_like(next(iter(entity_embeddings.values())))
    return news_entity_vecs, len(next(iter(entity_embeddings.values())))

# basic SVD
def svd_basic(interaction_df, num_factors=8, num_iter=5, lr=0.01, reg=0.5):
    users = interaction_df['UserID'].unique()
    items = interaction_df['NewsID'].unique()
    user_index = {u: i for i, u in enumerate(users)}
    item_index = {i: j for j, i in enumerate(items)}
    num_users = len(users)
    num_items = len(items)

    P = np.random.normal(scale=1./num_factors, size=(num_users, num_factors))
    Q = np.random.normal(scale=1./num_factors, size=(num_items, num_factors))

    for _ in range(num_iter):
        for row in interaction_df.itertuples():
            u, i, r = user_index[row.UserID], item_index[row.NewsID], row.Label
            pred = np.dot(P[u], Q[i])
            err = r - pred
            P[u] += lr * (err * Q[i] - reg * P[u])
            Q[i] += lr * (err * P[u] - reg * Q[i])
    return P, Q, user_index, item_index

# SVD + category
def svd_with_category(interaction_df, news_category_map, category_dim, num_factors=8, num_iter=5, lr=0.01, reg=0.5):
    users = interaction_df['UserID'].unique()
    items = interaction_df['NewsID'].unique()
    user_index = {u: i for i, u in enumerate(users)}
    item_index = {i: j for j, i in enumerate(items)}

    num_users = len(users)
    num_items = len(items)
    total_dim = num_factors + category_dim

    P = np.random.normal(scale=1./total_dim, size=(num_users, total_dim))
    Q = np.random.normal(scale=1./num_factors, size=(num_items, num_factors))

    for _ in range(num_iter):
        for row in interaction_df.itertuples():
            u, i, r = user_index[row.UserID], item_index[row.NewsID], row.Label
            cat_vec = news_category_map[row.NewsID]
            Qi_extended = np.concatenate((Q[i], cat_vec))
            pred = np.dot(P[u], Qi_extended)
            err = r - pred
            P[u] += lr * (err * Qi_extended - reg * P[u])
            Q[i] += lr * (err * P[u][:num_factors] - reg * Q[i])
    return P, Q, user_index, item_index, news_category_map

# SVD + Category + Entity
def svd_with_category_entity(interaction_df, news_category_map, news_entity_vecs, category_dim, entity_dim,
                              num_factors=8, num_iter=5, lr=0.01, reg=0.5):
    users = interaction_df['UserID'].unique()
    items = interaction_df['NewsID'].unique()
    user_index = {u: i for i, u in enumerate(users)}
    item_index = {i: j for j, i in enumerate(items)}
    num_users = len(users)
    num_items = len(items)
    total_dim = num_factors + category_dim + entity_dim

    P = np.random.normal(scale=1./total_dim, size=(num_users, total_dim))
    Q = np.random.normal(scale=1./num_factors, size=(num_items, num_factors))

    for _ in range(num_iter):
        for row in interaction_df.itertuples():
            u, i, r = user_index[row.UserID], item_index[row.NewsID], row.Label
            cat_vec = news_category_map[row.NewsID]
            entity_vec = news_entity_vecs[row.NewsID]
            Qi_extended = np.concatenate((Q[i], cat_vec, entity_vec))
            pred = np.dot(P[u], Qi_extended)
            err = r - pred
            P[u] += lr * (err * Qi_extended - reg * P[u])
            Q[i] += lr * (err * P[u][:num_factors] - reg * Q[i])
    return P, Q, user_index, item_index

# caculate matrics
def evaluate_sample(interaction_df, P, Q, user_index, item_index, news_category_map=None, news_entity_vecs=None, k=5):
    y_true_all = []
    y_score_all = []
    ndcg_values = []
    precision_list = []
    recall_list = []
    hitrate_count = 0

    for user in user_index.keys():
        u = user_index[user]
        user_data = interaction_df[interaction_df['UserID'] == user]
        true_labels = []
        pred_scores = []
        positive_news = set(user_data[user_data['Label'] == 1]['NewsID'])

        for row in user_data.itertuples():
            if row.NewsID not in item_index:
                continue
            i = item_index[row.NewsID]
            if news_category_map is None:
                Qi_ext = Q[i]
            elif news_entity_vecs is None:
                Qi_ext = np.concatenate((Q[i], news_category_map[row.NewsID]))
            else:
                Qi_ext = np.concatenate((Q[i], news_category_map[row.NewsID], news_entity_vecs[row.NewsID]))
            pred = np.dot(P[u], Qi_ext)
            true_labels.append(row.Label)
            pred_scores.append(pred)
            y_true_all.append(row.Label)
            y_score_all.append(pred)

        if len(true_labels) > 1:
            ndcg = ndcg_score([true_labels], [pred_scores], k=k)
            ndcg_values.append(ndcg)

            sorted_indices = np.argsort(pred_scores)[::-1][:k]
            recommended_top_k = [user_data.iloc[i].NewsID for i in sorted_indices]
            hit_count = len(set(recommended_top_k).intersection(positive_news))

            precision_list.append(hit_count / k)
            recall_list.append(hit_count / len(positive_news) if positive_news else 0)
            if hit_count > 0:
                hitrate_count += 1

    auc = roc_auc_score(y_true_all, y_score_all)
    mean_ndcg = np.mean(ndcg_values)
    mean_precision = np.mean(precision_list)
    mean_recall = np.mean(recall_list)
    hitrate = hitrate_count / len(user_index)

    return auc, mean_ndcg, mean_precision, mean_recall, hitrate

def evaluate_on_train_sample(interaction_df_train, P, Q, user_index, item_index, news_category_map=None, news_entity_vecs=None, k=5, sample_size=1000):
    np.random.seed(42)
    all_users = list(user_index.keys())
    sampled_users = np.random.choice(all_users, size=min(sample_size, len(all_users)), replace=False)

    y_true_all = []
    y_score_all = []
    ndcg_values = []
    precision_list = []
    recall_list = []
    hitrate_count = 0

    for user in sampled_users:
        u = user_index[user]
        user_data = interaction_df_train[interaction_df_train['UserID'] == user]
        true_labels = []
        pred_scores = []
        positive_news = set(user_data[user_data['Label'] == 1]['NewsID'])

        for row in user_data.itertuples():
            if row.NewsID not in item_index:
                continue
            i = item_index[row.NewsID]
            if news_category_map is None:
                Qi_ext = Q[i]
            elif news_entity_vecs is None:
                Qi_ext = np.concatenate((Q[i], news_category_map[row.NewsID]))
            else:
                Qi_ext = np.concatenate((Q[i], news_category_map[row.NewsID], news_entity_vecs[row.NewsID]))
            pred = np.dot(P[u], Qi_ext)
            true_labels.append(row.Label)
            pred_scores.append(pred)
            y_true_all.append(row.Label)
            y_score_all.append(pred)

        if len(true_labels) > 1:
            ndcg = ndcg_score([true_labels], [pred_scores], k=k)
            ndcg_values.append(ndcg)

            sorted_indices = np.argsort(pred_scores)[::-1][:k]
            recommended_top_k = [user_data.iloc[i].NewsID for i in sorted_indices]
            hit_count = len(set(recommended_top_k).intersection(positive_news))

            precision_list.append(hit_count / k)
            recall_list.append(hit_count / len(positive_news) if positive_news else 0)
            if hit_count > 0:
                hitrate_count += 1

    auc = roc_auc_score(y_true_all, y_score_all)
    mean_ndcg = np.mean(ndcg_values)
    mean_precision = np.mean(precision_list)
    mean_recall = np.mean(recall_list)
    hitrate = hitrate_count / len(sampled_users)

    return auc, mean_ndcg, mean_precision, mean_recall, hitrate

# main
# if __name__ == '__main__':
#     behaviors = load_behaviors_sample('MINDsmall_train/behaviors.tsv', sample_frac=0.01)
#     news_df = load_news('MINDsmall_train/news.tsv')
#     interaction_df = build_interaction_matrix(behaviors)

#     train_df = interaction_df.sample(frac=0.9, random_state=42)
#     test_df = interaction_df.drop(train_df.index)

#     news_category_map, category_dim = encode_news_category(news_df)
#     entity_embeddings = load_entity_embeddings('MINDsmall_train/entity_embedding.vec')
#     news_entity_vecs, entity_dim = compute_news_entity_vectors(news_df, entity_embeddings)

#     # 1. Basic SVD
#     P_svd, Q_svd, user_idx_svd, item_idx_svd = svd_basic(train_df)
#     auc_svd, ndcg_svd, precision_svd, recall_svd, hitrate_svd = evaluate_sample(test_df_sampled, P_svd, Q_svd, user_idx_svd, item_idx_svd)
#     print(f"Basic SVD -> AUC: {auc_svd:.4f}, nDCG@5: {ndcg_svd:.4f}, Precision@5: {precision_svd:.4f}, Recall@5: {recall_svd:.4f}, HitRate@5: {hitrate_svd:.4f}")

#     # 2. SVD + Category
#     P_cat, Q_cat, user_idx_cat, item_idx_cat, _ = svd_with_category(train_df, news_category_map, category_dim)
#     auc_cat, ndcg_cat, precision_cat, recall_cat, hitrate_cat = evaluate_sample(test_df_sampled, P_cat, Q_cat, user_idx_cat, item_idx_cat, news_category_map=news_category_map)
#     print(f"SVD + Category -> AUC: {auc_cat:.4f}, nDCG@5: {ndcg_cat:.4f}, Precision@5: {precision_cat:.4f}, Recall@5: {recall_cat:.4f}, HitRate@5: {hitrate_cat:.4f}")

#     # 3. SVD + Category + Entity
#     P_all, Q_all, user_idx_all, item_idx_all = svd_with_category_entity(train_df, news_category_map, news_entity_vecs, category_dim, entity_dim)
#     auc_all, ndcg_all, precision_all, recall_all, hitrate_all = evaluate_sample(test_df_sampled, P_all, Q_all, user_idx_all, item_idx_all, news_category_map=news_category_map, news_entity_vecs=news_entity_vecs)
#     print(f"SVD + Category + Entity -> AUC: {auc_all:.4f}, nDCG@5: {ndcg_all:.4f}, Precision@5: {precision_all:.4f}, Recall@5: {recall_all:.4f}, HitRate@5: {hitrate_all:.4f}")

if __name__ == '__main__':
    behaviors = load_behaviors_sample('MINDsmall_train/behaviors.tsv', sample_frac=0.1)
    news_df = load_news('MINDsmall_train/news.tsv')
    interaction_df = build_interaction_matrix(behaviors)

    print(news_df.head())

    train_df = interaction_df.sample(frac=0.8, random_state=42)
    test_df = interaction_df.drop(train_df.index)

    news_category_map, category_dim = encode_news_category(news_df)
    entity_embeddings = load_entity_embeddings('MINDsmall_train/entity_embedding.vec')
    news_entity_vecs, entity_dim = compute_news_entity_vectors(news_df, entity_embeddings)

    # 1. Basic SVD
    P_svd, Q_svd, user_idx_svd, item_idx_svd = svd_basic(train_df)
    auc_svd, ndcg_svd, precision_svd, recall_svd, hitrate_svd = evaluate_sample(test_df, P_svd, Q_svd, user_idx_svd, item_idx_svd)
    print(f"Test set Basic SVD -> AUC: {auc_svd:.4f}, nDCG@5: {ndcg_svd:.4f}, Precision@5: {precision_svd:.4f}, Recall@5: {recall_svd:.4f}, HitRate@5: {hitrate_svd:.4f}")

    # Train Sample Evaluation for Basic SVD
    auc_train_svd, ndcg_train_svd, precision_train_svd, recall_train_svd, hitrate_train_svd = evaluate_on_train_sample(
        train_df, P_svd, Q_svd, user_idx_svd, item_idx_svd)
    print(f"Train set Basic SVD -> AUC: {auc_train_svd:.4f}, nDCG@5: {ndcg_train_svd:.4f}, Precision@5: {precision_train_svd:.4f}, Recall@5: {recall_train_svd:.4f}, HitRate@5: {hitrate_train_svd:.4f}")

    # 2. SVD + Category
    P_cat, Q_cat, user_idx_cat, item_idx_cat, _ = svd_with_category(train_df, news_category_map, category_dim)
    auc_cat, ndcg_cat, precision_cat, recall_cat, hitrate_cat = evaluate_sample(test_df, P_cat, Q_cat, user_idx_cat, item_idx_cat, news_category_map=news_category_map)
    print(f"Test set SVD + Category -> AUC: {auc_cat:.4f}, nDCG@5: {ndcg_cat:.4f}, Precision@5: {precision_cat:.4f}, Recall@5: {recall_cat:.4f}, HitRate@5: {hitrate_cat:.4f}")

    # Train Sample Evaluation for SVD + Category
    auc_train_cat, ndcg_train_cat, precision_train_cat, recall_train_cat, hitrate_train_cat = evaluate_on_train_sample(
        train_df, P_cat, Q_cat, user_idx_cat, item_idx_cat, news_category_map=news_category_map)
    print(f"Train set SVD + Category -> AUC: {auc_train_cat:.4f}, nDCG@5: {ndcg_train_cat:.4f}, Precision@5: {precision_train_cat:.4f}, Recall@5: {recall_train_cat:.4f}, HitRate@5: {hitrate_train_cat:.4f}")

    # 3. SVD + Category + Entity
    P_all, Q_all, user_idx_all, item_idx_all = svd_with_category_entity(train_df, news_category_map, news_entity_vecs, category_dim, entity_dim)
    auc_all, ndcg_all, precision_all, recall_all, hitrate_all = evaluate_sample(test_df, P_all, Q_all, user_idx_all, item_idx_all, news_category_map=news_category_map, news_entity_vecs=news_entity_vecs)
    print(f"Test set SVD + Category + Entity -> AUC: {auc_all:.4f}, nDCG@5: {ndcg_all:.4f}, Precision@5: {precision_all:.4f}, Recall@5: {recall_all:.4f}, HitRate@5: {hitrate_all:.4f}")

    # Train Sample Evaluation for SVD + Category + Entity
    auc_train_all, ndcg_train_all, precision_train_all, recall_train_all, hitrate_train_all = evaluate_on_train_sample(
        train_df, P_all, Q_all, user_idx_all, item_idx_all, news_category_map=news_category_map, news_entity_vecs=news_entity_vecs)
    print(f"Train set SVD + Category + Entity -> AUC: {auc_train_all:.4f}, nDCG@5: {ndcg_train_all:.4f}, Precision@5: {precision_train_all:.4f}, Recall@5: {recall_train_all:.4f}, HitRate@5: {hitrate_train_all:.4f}")


   NewsID   Category      SubCategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                               Title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                             URL  \
0  https://assets.msn.com/l

# SVD-Based Recommendation Evaluation (Train vs Test)

---

## Evaluation Results

| Model                      | AUC (Train) | AUC (Test) | nDCG@5 (Train) | nDCG@5 (Test) | Precision@5 (Test) | Recall@5 (Test) | HitRate@5 (Test) |
|----------------------------|--------------|------------|----------------|---------------|---------------------|-----------------|------------------|
| Basic SVD                  | 0.6058       | 0.5007     | 0.2735         | 0.1368        | 0.0450              | 0.2007          | 0.1712           |
| SVD + Category             | 0.8718       | 0.5115     | 0.4670         | 0.1386        | 0.0482              | 0.2122          | 0.1806           |
| SVD + Category + Entity    | 0.9297       | 0.4961     | 0.5785         | 0.1382        | 0.0474              | 0.2098          | 0.1783           |

---

## Analysis

- Performance on the training set is significantly higher than on the test set, indicating clear overfitting.
- AUC on the test set is close to 0.5, meaning ranking quality is almost random on unseen data.
- nDCG@5 and Precision@5 are low, suggesting limited practical recommendation value.
- Adding Category and Entity features improves training performance but does not help test performance.

---

## Possible Causes

- Model complexity is too high due to large feature dimensions, leading to overfitting.
- Entity features are sparse or of low quality, failing to improve generalization.
- User-item interaction data is sparse, limiting SVD's ability to capture latent relationships.
- Training iterations or regularization parameters are not properly tuned.

---
