In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, ndcg_score
from sklearn.metrics.pairwise import cosine_similarity as cs_matrix
from collections import defaultdict
import json

# Load behaviors.tsv with sampling
def load_behaviors_sample(file_path, sample_frac=0.1, random_state=None):
    behaviors = pd.read_csv(file_path, sep='\t', header=None,
                            names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
    return behaviors.sample(frac=sample_frac, random_state=random_state).reset_index(drop=True)

# Load news.tsv
def load_news(file_path):
    news_df = pd.read_csv(file_path, sep='\t', header=None,
                          names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Entity', 'AbstractEntities'],
                          index_col=False)
    return news_df

# Build user-news interaction DataFrame from behaviors
def build_interaction_matrix(behaviors):
    user_item_pairs = []
    for _, row in behaviors.iterrows():
        user = row['UserID']
        impressions = row['Impressions'].split()
        for imp in impressions:
            news_id, label = imp.split('-')
            user_item_pairs.append((user, news_id, int(label)))
    return pd.DataFrame(user_item_pairs, columns=['UserID', 'NewsID', 'Label'])

# One-hot encode news categories
def encode_news_category(news_df):
    categories = pd.get_dummies(news_df['Category'])
    categories.index = news_df['NewsID']
    return {news_id: categories.loc[news_id].values for news_id in news_df['NewsID']}, categories.shape[1]

# Load entity embeddings from file
def load_entity_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            embeddings[parts[0]] = np.array([float(x) for x in parts[1:]])
    return embeddings

# Compute news vectors based on entity embeddings
def compute_news_entity_vectors(news_df, entity_embeddings):
    news_entity_vecs = {}
    for row in news_df.itertuples():
        if pd.notna(row.Entity):
            try:
                entities = json.loads(row.Entity.replace("'", '"'))
                wikidata_ids = [e['WikidataId'] for e in entities if 'WikidataId' in e]
            except:
                wikidata_ids = []
        else:
            wikidata_ids = []
        vecs = [entity_embeddings[eid] for eid in wikidata_ids if eid in entity_embeddings]
        if vecs:
            news_entity_vecs[row.NewsID] = np.mean(vecs, axis=0)
        else:
            news_entity_vecs[row.NewsID] = np.zeros_like(next(iter(entity_embeddings.values())))
    return news_entity_vecs

# Build user profile vectors based on clicked news
def build_user_profiles(interaction_df, news_vectors):
    user_profiles = {}
    for user_id, group in interaction_df[interaction_df['Label'] == 1].groupby('UserID'):
        clicked_news_ids = group['NewsID']
        clicked_vecs = [news_vectors[nid] for nid in clicked_news_ids if nid in news_vectors]
        if clicked_vecs:
            user_profiles[user_id] = np.mean(clicked_vecs, axis=0)
        else:
            user_profiles[user_id] = np.zeros_like(next(iter(news_vectors.values())))
    return user_profiles

# Build standard binary user profiles (for standard User-Based CF)
def build_user_profiles_standard(interaction_df):
    users = interaction_df['UserID'].unique()
    news = interaction_df['NewsID'].unique()
    news_index = {n: i for i, n in enumerate(news)}
    profiles = {}
    for user in users:
        vec = np.zeros(len(news))
        for _, row in interaction_df[interaction_df['UserID'] == user].iterrows():
            if row.Label == 1 and row.NewsID in news_index:
                vec[news_index[row.NewsID]] = 1
        profiles[user] = vec
    return profiles

# Compute cosine similarity between users
def compute_user_similarity_matrix(user_profiles):
    user_ids = list(user_profiles.keys())
    user_vectors = np.array([user_profiles[u] for u in user_ids])
    similarity_matrix = cs_matrix(user_vectors)
    similarity_dict = {}
    for i, uid in enumerate(user_ids):
        similarity_dict[uid] = {user_ids[j]: similarity_matrix[i, j] for j in range(len(user_ids)) if i != j}
    return similarity_dict

# Build clicked news cache for each user
def build_user_clicked_news(interaction_df):
    return interaction_df[interaction_df['Label'] == 1].groupby('UserID')['NewsID'].apply(set).to_dict()

# Recommend news for each user based on similar users
def recommend_user_based_scored(clicked_news_cache, similarity_matrix, top_k_users=5):
    user_recommendations = {}
    for user in similarity_matrix:
        similar_users = sorted(similarity_matrix[user].items(), key=lambda x: -x[1])[:top_k_users]
        candidate_scores = {}
        for su, sim in similar_users:
            for news in clicked_news_cache.get(su, []):
                if news not in clicked_news_cache.get(user, set()):  # Exclude already clicked news
                    candidate_scores[news] = candidate_scores.get(news, 0) + sim
        sorted_news = sorted(candidate_scores.items(), key=lambda x: -x[1])
        user_recommendations[user] = [news for news, _ in sorted_news[:top_k_users * 5]]
    return user_recommendations

# Evaluate recommendations with multiple metrics
def evaluate_user_based(interaction_df, user_recommendations, k=5):
    y_true_all = []
    y_score_all = []
    ndcg_values = []
    precision_list = []
    recall_list = []
    hitrate_count = 0

    for user, rec_news in user_recommendations.items():
        user_data = interaction_df[interaction_df['UserID'] == user]
        true_labels, pred_scores = [], []
        positive_news = set(user_data[user_data['Label'] == 1]['NewsID'])
        hit_count = 0

        for row in user_data.itertuples():
            true_labels.append(row.Label)
            pred_scores.append(1 if row.NewsID in rec_news else 0)
            y_true_all.append(row.Label)
            y_score_all.append(pred_scores[-1])

        if len(true_labels) > 1:
            ndcg_values.append(ndcg_score([true_labels], [pred_scores], k=k))

        # Calculate precision@k, recall@k, hitrate
        recommended_top_k = set(rec_news[:k])
        hit_count = len(recommended_top_k.intersection(positive_news))
        precision_list.append(hit_count / k)
        recall_list.append(hit_count / len(positive_news) if positive_news else 0)
        if hit_count > 0:
            hitrate_count += 1

    auc = roc_auc_score(y_true_all, y_score_all)
    mean_ndcg = np.mean(ndcg_values)
    mean_precision = np.mean(precision_list)
    mean_recall = np.mean(recall_list)
    hitrate = hitrate_count / len(user_recommendations)

    return auc, mean_ndcg, mean_precision, mean_recall, hitrate

# Main program
if __name__ == '__main__':
    # Load and split train dataset
    behaviors_full = load_behaviors_sample('MINDsmall_train/behaviors.tsv', sample_frac=0.05)
    train_frac = 0.8
    behaviors_train = behaviors_full.sample(frac=train_frac, random_state=42)
    behaviors_test = behaviors_full.drop(behaviors_train.index).reset_index(drop=True)
    behaviors_train = behaviors_train.reset_index(drop=True)

    news_df = load_news('MINDsmall_train/news.tsv')

    # Build interaction matrices
    interaction_df_train = build_interaction_matrix(behaviors_train)
    interaction_df_test = build_interaction_matrix(behaviors_test)
    clicked_news_cache_train = build_user_clicked_news(interaction_df_train)

    # Standard User-Based
    print("\n=== Standard User-Based ===")
    user_profiles_std = build_user_profiles_standard(interaction_df_train)
    similarity_std = compute_user_similarity_matrix(user_profiles_std)
    rec_std = recommend_user_based_scored(clicked_news_cache_train, similarity_std)
    auc_std, ndcg_std, precision_std, recall_std, hitrate_std = evaluate_user_based(interaction_df_test, rec_std)
    print(f"Standard User-Based -> AUC: {auc_std:.4f}, nDCG@5: {ndcg_std:.4f}, Precision@5: {precision_std:.4f}, Recall@5: {recall_std:.4f}, HitRate@5: {hitrate_std:.4f}")

    # Category-enhanced User-Based
    print("\n=== Category-enhanced User-Based ===")
    news_category_map, _ = encode_news_category(news_df)
    user_profiles_cat = build_user_profiles(interaction_df_train, news_category_map)
    similarity_cat = compute_user_similarity_matrix(user_profiles_cat)
    rec_cat = recommend_user_based_scored(clicked_news_cache_train, similarity_cat)
    auc_cat, ndcg_cat, precision_cat, recall_cat, hitrate_cat = evaluate_user_based(interaction_df_test, rec_cat)
    print(f"Category-enhanced User-Based -> AUC: {auc_cat:.4f}, nDCG@5: {ndcg_cat:.4f}, Precision@5: {precision_cat:.4f}, Recall@5: {recall_cat:.4f}, HitRate@5: {hitrate_cat:.4f}")

    # Category + Entity-enhanced User-Based
    print("\n=== Category + Entity-enhanced User-Based ===")
    entity_embeddings = load_entity_embeddings('MINDsmall_train/entity_embedding.vec')
    news_entity_vecs = compute_news_entity_vectors(news_df, entity_embeddings)
    combined_vectors = {}
    for news_id in news_entity_vecs:
        combined_vectors[news_id] = np.concatenate([
            news_entity_vecs[news_id],
            news_category_map.get(news_id, np.zeros(len(next(iter(news_category_map.values())))))
        ])
    user_profiles_combined = build_user_profiles(interaction_df_train, combined_vectors)
    similarity_combined = compute_user_similarity_matrix(user_profiles_combined)
    rec_combined = recommend_user_based_scored(clicked_news_cache_train, similarity_combined)
    auc_comb, ndcg_comb, precision_comb, recall_comb, hitrate_comb = evaluate_user_based(interaction_df_test, rec_combined)
    print(f"Category + Entity-enhanced User-Based -> AUC: {auc_comb:.4f}, nDCG@5: {ndcg_comb:.4f}, Precision@5: {precision_comb:.4f}, Recall@5: {recall_comb:.4f}, HitRate@5: {hitrate_comb:.4f}")


=== Standard User-Based ===
Standard User-Based -> AUC: 0.5043, nDCG@5: 0.2104, Precision@5: 0.0001, Recall@5: 0.0005, HitRate@5: 0.0005

=== Category-enhanced User-Based ===
Category-enhanced User-Based -> AUC: 0.5114, nDCG@5: 0.2287, Precision@5: 0.0002, Recall@5: 0.0007, HitRate@5: 0.0009

=== Category + Entity-enhanced User-Based ===
Category + Entity-enhanced User-Based -> AUC: 0.5104, nDCG@5: 0.2284, Precision@5: 0.0003, Recall@5: 0.0011, HitRate@5: 0.0014


In [3]:
class UserBasedRecommenderDiagnostics:
    def __init__(self, interaction_df_train, interaction_df_test, user_recommendations, similarity_matrix):
        self.interaction_df_train = interaction_df_train
        self.interaction_df_test = interaction_df_test
        self.user_recommendations = user_recommendations
        self.similarity_matrix = similarity_matrix
        
        # Precompute useful sets
        self.clicked_users_test = set(interaction_df_test[interaction_df_test['Label'] == 1]['UserID'])
        self.recommended_users = set(user_recommendations.keys())
        self.active_users = self.clicked_users_test.intersection(self.recommended_users)

    # 1. Analyze user similarity distribution
    def analyze_similarity_distribution(self):
        all_sims = []
        for user, sims in self.similarity_matrix.items():
            all_sims.extend(list(sims.values()))
        print(f"Similarity count: {len(all_sims)}")
        print(f"Mean similarity: {np.mean(all_sims):.4f}")
        print(f"Median similarity: {np.median(all_sims):.4f}")
        print(f"Similarity > 0.5: {np.sum(np.array(all_sims) > 0.5)}")
        print(f"Similarity ≈ 0: {np.sum(np.array(all_sims) < 1e-3)}")
        return all_sims

    # 2. Calculate user click overlap
    def user_click_overlap(self):
        user_clicks = self.interaction_df_train[self.interaction_df_train['Label'] == 1].groupby('UserID')['NewsID'].apply(set).to_dict()
        overlaps = []
        users = list(user_clicks.keys())
        for i in range(len(users)):
            for j in range(i+1, len(users)):
                u1, u2 = users[i], users[j]
                inter = len(user_clicks[u1].intersection(user_clicks[u2]))
                union = len(user_clicks[u1].union(user_clicks[u2]))
                if union > 0:
                    overlaps.append(inter / union)
        print(f"Average user click overlap: {np.mean(overlaps):.4f}")
        return overlaps

    # 3. Coverage: Are recommended news items in clicked test items?
    def recommendation_coverage(self):
        clicked_news_test = set(self.interaction_df_test[self.interaction_df_test['Label'] == 1]['NewsID'])
        all_rec_news = set([news for recs in self.user_recommendations.values() for news in recs])
        overlap_news = all_rec_news.intersection(clicked_news_test)
        print(f"Recommended news count: {len(all_rec_news)}")
        print(f"Clicked news in test set: {len(clicked_news_test)}")
        print(f"Recommended news overlapping with clicked: {len(overlap_news)}")
        print(f"Overlap ratio: {len(overlap_news) / len(clicked_news_test):.2%}")

    # 4. Debug a sample of active users
    def debug_active_users(self, num_users=5):
        sample_users = list(self.active_users)[:num_users]
        for user_id in sample_users:
            user_clicks = set(self.interaction_df_test[(self.interaction_df_test['UserID'] == user_id) & (self.interaction_df_test['Label'] == 1)]['NewsID'])
            user_recs = set(self.user_recommendations.get(user_id, []))
            overlap = user_clicks.intersection(user_recs)
            print(f"User {user_id} clicked: {user_clicks}")
            print(f"User {user_id} recommended: {user_recs}")
            print(f"Overlap: {overlap}")
            print("-" * 40)

    # 5. Summary of active user stats
    def active_user_summary(self):
        print(f"Clicked users in test set: {len(self.clicked_users_test)}")
        print(f"Users with recommendations: {len(self.recommended_users)}")
        print(f"Active users (click + rec): {len(self.active_users)}")

    # 6. News overlap analysis between train and dev
    def analyze_news_overlap(self, train_news_path, dev_news_path):
        def load_news(file_path):
            news_df = pd.read_csv(file_path, sep='\t', header=None,
                                  names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Entity'],
                                  index_col=False)
            return news_df

        train_news = load_news(train_news_path)
        dev_news = load_news(dev_news_path)

        train_news_ids = set(train_news['NewsID'])
        dev_news_ids = set(dev_news['NewsID'])

        common_news = train_news_ids.intersection(dev_news_ids)
        only_in_train = train_news_ids - dev_news_ids
        only_in_dev = dev_news_ids - train_news_ids

        print(f"Train news count: {len(train_news_ids)}")
        print(f"Dev news count: {len(dev_news_ids)}")
        print(f"Common news count: {len(common_news)}")
        print(f"News only in train: {len(only_in_train)}")
        print(f"News only in dev: {len(only_in_dev)}")


In [4]:
# Initialize the diagnostics class
diag = UserBasedRecommenderDiagnostics(interaction_df_train, interaction_df_test, rec_std, similarity_std)

# Run analyses
diag.analyze_similarity_distribution()
diag.user_click_overlap()
diag.recommendation_coverage()
diag.active_user_summary()
diag.debug_active_users(num_users=5)
diag.analyze_news_overlap('MINDsmall_train/news.tsv', 'MINDsmall_dev/news.tsv')

Similarity count: 33125780
Mean similarity: 0.0049
Median similarity: 0.0000
Similarity > 0.5: 155746
Similarity ≈ 0: 32874700
Average user click overlap: 0.0039
Recommended news count: 1375
Clicked news in test set: 1054
Recommended news overlapping with clicked: 653
Overlap ratio: 61.95%
Clicked users in test set: 1530
Users with recommendations: 5756
Active users (click + rec): 231
User U75298 clicked: {'N11830'}
User U75298 recommended: {'N4642', 'N51006', 'N6767', 'N41122', 'N14029', 'N18403', 'N16419', 'N287', 'N32519', 'N23816'}
Overlap: set()
----------------------------------------
User U45587 clicked: {'N47020'}
User U45587 recommended: set()
Overlap: set()
----------------------------------------
User U88642 clicked: {'N14029'}
User U88642 recommended: set()
Overlap: set()
----------------------------------------
User U25126 clicked: {'N49194', 'N9669'}
User U25126 recommended: {'N35170', 'N21701', 'N27581', 'N32519', 'N5442', 'N23816'}
Overlap: set()
-----------------------

  news_df = pd.read_csv(file_path, sep='\t', header=None,


Train news count: 51282
Dev news count: 42416
Common news count: 28460
News only in train: 22822
News only in dev: 13956


  news_df = pd.read_csv(file_path, sep='\t', header=None,


# Analysis and Conclusion

## User-Based Collaborative Filtering: Analysis and Conclusion

### 1. Results Summary

| Model Type                         | AUC     | nDCG@5  | Precision@5 | Recall@5 | HitRate@5 |
|------------------------------------|---------|---------|-------------|----------|------------|
| Standard User-Based                | 0.5043  | 0.2104  | 0.0001      | 0.0005   | 0.0005     |
| Category-Enhanced User-Based       | 0.5114  | 0.2287  | 0.0002      | 0.0007   | 0.0009     |
| Category + Entity-Enhanced         | 0.5104  | 0.2284  | 0.0003      | 0.0011   | 0.0014     |

nDCG@5 shows some ranking improvement, but Precision@5 and Recall@5 remain close to zero, indicating limited recommendation effectiveness. AUC values remain near random.

---

### 2. Similarity Statistics

- Total similarity pairs: 33,125,780
- Mean similarity: 0.0049
- Median similarity: 0.0000
- Similarity > 0.5: 155,746 (~0.47%)
- Similarity ≈ 0: 32,874,700 (~99.24%)

Most user pairs have negligible similarity, with less than 1% showing meaningful overlap.

---

### 3. User Click Overlap

- Average user click overlap: 0.0039

Users rarely click on the same news, limiting the effectiveness of collaborative filtering.

---

### 4. Recommendation Coverage

- Recommended news count: 1,375
- Clicked news in test set: 1,054
- Recommended news overlapping clicked: 653
- News-level overlap ratio: 61.95%

While news-level coverage is reasonable, it does not translate to effective user-level recommendations.

---

### 5. User Activity Breakdown

- Clicked users in test set: 1,530
- Users with recommendations: 5,756
- Active users (click + recommendation overlap): 231

Only 231 out of 1,530 clicked users had any overlap with recommended news.

---

### 6. Example User Cases

- User U27714:
  - Clicked: {'N31448'}
  - Recommended: {'N31448', 'N63656', 'N14592', 'N63390', 'N57426', 'N60009', 'N39317', 'N20678', 'N51187'}
  - Overlap: {'N31448'}

- Other users: no overlap between clicked and recommended news.

---

### Conclusion

- User-based collaborative filtering performs poorly due to low user similarity and sparse interaction data.
- Category and entity feature enhancements slightly improve AUC and nDCG@5 but fail to improve top-5 precision or recall.
- Mean similarity between users is low, with nearly all pairs having similarity near zero.
- Despite reasonable news-level coverage, user-level hit rates are negligible.
- User-based methods are not suitable for sparse datasets like MIND. Content-based or hybrid approaches are more appropriate.
