In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, ndcg_score
from numpy.linalg import norm
import json

# Load behaviors.tsv
def load_behaviors_sample(file_path, sample_frac=0.1, random_state=None):
    behaviors = pd.read_csv(file_path, sep='\t', header=None,
                            names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
    behaviors_sample = behaviors.sample(frac=sample_frac, random_state=random_state).reset_index(drop=True)
    return behaviors_sample

# Load news.tsv
def load_news(file_path):
    news_df = pd.read_csv(file_path, sep='\t', header=None,
                          names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Entity'],
                          index_col=False)
    return news_df

# build interaction matrix
def build_interaction_matrix(behaviors):
    user_item_pairs = []
    for _, row in behaviors.iterrows():
        user = row['UserID']
        impressions = row['Impressions'].split()
        for imp in impressions:
            news_id, label = imp.split('-')
            user_item_pairs.append((user, news_id, int(label)))
    interaction_df = pd.DataFrame(user_item_pairs, columns=['UserID', 'NewsID', 'Label'])
    return interaction_df

# Load entity embedding
def load_entity_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            key = parts[0]
            vec = np.array([float(x) for x in parts[1:]])
            embeddings[key] = vec
    return embeddings

# computer entity vector
def compute_news_entity_vectors(news_df, entity_embeddings):
    news_entity_vecs = {}
    for row in news_df.itertuples():
        if pd.notna(row.Entity):
            try:
                entity_info_list = json.loads(row.Entity.replace("'", '"')) 
                wikidata_ids = [entity['WikidataId'] for entity in entity_info_list if 'WikidataId' in entity]
            except Exception as e:
                wikidata_ids = []
        else:
            wikidata_ids = []

        vecs = [entity_embeddings[eid] for eid in wikidata_ids if eid in entity_embeddings]
        if vecs:
            mean_vec = np.mean(vecs, axis=0)
            news_entity_vecs[row.NewsID] = mean_vec
        else:
            news_entity_vecs[row.NewsID] = np.zeros_like(next(iter(entity_embeddings.values())))
    return news_entity_vecs, len(next(iter(entity_embeddings.values())))


# category One-Hot encoding
def encode_news_category(news_df):
    categories = pd.get_dummies(news_df['Category'])
    categories.index = news_df['NewsID']
    news_category_map = {news_id: categories.loc[news_id].values for news_id in news_df['NewsID']}
    return news_category_map, categories.shape[1]

# concatenate entity and category features
def combine_news_features(news_entity_vecs, news_category_map):
    combined_vectors = {}
    for news_id in news_entity_vecs:
        entity_vec = news_entity_vecs[news_id]
        category_vec = news_category_map.get(news_id, np.zeros(len(next(iter(news_category_map.values())))))
        combined_vectors[news_id] = np.concatenate([entity_vec, category_vec])
    return combined_vectors

# build user interest vectors
def build_user_profiles(interaction_df, news_vectors):
    user_profiles = {}
    for user_id, group in interaction_df[interaction_df['Label'] == 1].groupby('UserID'):
        clicked_news_ids = group['NewsID']
        clicked_vecs = [news_vectors[nid] for nid in clicked_news_ids if nid in news_vectors]
        if clicked_vecs:
            user_profiles[user_id] = np.mean(clicked_vecs, axis=0)
        else:
            user_profiles[user_id] = np.zeros_like(next(iter(news_vectors.values())))
    return user_profiles

# computer cosin similarity
def cosine_similarity(vec1, vec2):
    if norm(vec1) == 0 or norm(vec2) == 0:
        return 0.0
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

# Evaluate with additional metrics
def evaluate_content_based(interaction_df, user_profiles, news_vectors, k=5, sample_size=100):
    np.random.seed(42)
    all_users = list(user_profiles.keys())
    sampled_users = np.random.choice(all_users, size=min(sample_size, len(all_users)), replace=False)
    
    y_true_all = []
    y_score_all = []
    ndcg_values = []
    precision_list = []
    recall_list = []
    hitrate_count = 0

    for user in sampled_users:
        user_data = interaction_df[interaction_df['UserID'] == user]
        true_labels = []
        pred_scores = []
        user_vec = user_profiles[user]
        positive_news = set(user_data[user_data['Label'] == 1]['NewsID'])

        for row in user_data.itertuples():
            if row.NewsID not in news_vectors:
                continue
            news_vec = news_vectors[row.NewsID]
            pred = cosine_similarity(user_vec, news_vec)
            true_labels.append(row.Label)
            pred_scores.append(pred)
            y_true_all.append(row.Label)
            y_score_all.append(pred)

        if len(true_labels) > 1:
            ndcg = ndcg_score([true_labels], [pred_scores], k=k)
            ndcg_values.append(ndcg)

            # Precision, Recall, HitRate
            sorted_indices = np.argsort(pred_scores)[::-1][:k]
            recommended_top_k = [user_data.iloc[i].NewsID for i in sorted_indices]
            hit_count = len(set(recommended_top_k).intersection(positive_news))

            precision_list.append(hit_count / k)
            recall_list.append(hit_count / len(positive_news) if positive_news else 0)
            if hit_count > 0:
                hitrate_count += 1

    auc = roc_auc_score(y_true_all, y_score_all)
    mean_ndcg = np.mean(ndcg_values)
    mean_precision = np.mean(precision_list)
    mean_recall = np.mean(recall_list)
    hitrate = hitrate_count / len(sampled_users)

    return auc, mean_ndcg, mean_precision, mean_recall, hitrate

if __name__ == '__main__':
    # Load data
    behaviors = load_behaviors_sample('MINDsmall_train/behaviors.tsv', sample_frac=1)
    news_df = load_news('MINDsmall_train/news.tsv')
    interaction_df = build_interaction_matrix(behaviors)

    # Split train/test
    train_df = interaction_df.sample(frac=0.8, random_state=42)
    test_df = interaction_df.drop(train_df.index)

    # Entity embeddings
    entity_embeddings = load_entity_embeddings('MINDsmall_train/entity_embedding.vec')
    news_entity_vecs, entity_dim = compute_news_entity_vectors(news_df, entity_embeddings)

    # Category features
    news_category_map, category_dim = encode_news_category(news_df)

    # -------------------------------
    # 1. Category-only Features
    # -------------------------------
    print("\n===== Category-only Content-Based =====")
    category_only_profiles = build_user_profiles(train_df, news_category_map)
    auc_category, ndcg_category, precision_category, recall_category, hitrate_category = evaluate_content_based(
        test_df_sampled, category_only_profiles, news_category_map)

    print(f"Category Only -> AUC: {auc_category:.4f}, nDCG@5: {ndcg_category:.4f}, Precision@5: {precision_category:.4f}, Recall@5: {recall_category:.4f}, HitRate@5: {hitrate_category:.4f}")

    # -------------------------------
    # 2. Category + Entity Features
    # -------------------------------
    print("\n===== Category + Entity Content-Based =====")
    combined_vectors = combine_news_features(news_entity_vecs, news_category_map)
    combined_profiles = build_user_profiles(train_df, combined_vectors)
    auc_combined, ndcg_combined, precision_combined, recall_combined, hitrate_combined = evaluate_content_based(
        test_df_sampled, combined_profiles, combined_vectors)

    print(f"Category + Entity -> AUC: {auc_combined:.4f}, nDCG@5: {ndcg_combined:.4f}, Precision@5: {precision_combined:.4f}, Recall@5: {recall_combined:.4f}, HitRate@5: {hitrate_combined:.4f}")

    # Entity vector sparsity check
    non_zero_vec_count = sum(1 for vec in news_entity_vecs.values() if not np.all(vec == 0))
    print(f"\nNon-zero entity vectors: {non_zero_vec_count} / {len(news_entity_vecs)}")



  news_df = pd.read_csv(file_path, sep='\t', header=None,



===== Category-only Content-Based =====
Category Only -> AUC: 0.7679, nDCG@5: 0.0441, Precision@5: 0.0179, Recall@5: 0.0536, HitRate@5: 0.0300

===== Category + Entity Content-Based =====
Category + Entity -> AUC: 0.7781, nDCG@5: 0.0445, Precision@5: 0.0143, Recall@5: 0.0446, HitRate@5: 0.0300

Non-zero entity vectors: 36210 / 51282


# Analysis and Conclusion

---

## Evaluation Metrics

| Model                        | AUC    | nDCG@5 | Precision@5 | Recall@5 | HitRate@5 |
|------------------------------|--------|--------|--------------|-----------|------------|
| Category Only                | 0.7679 | 0.0441 | 0.0179       | 0.0536    | 0.0300     |
| Category + Entity            | 0.7781 | 0.0445 | 0.0143       | 0.0446    | 0.0300     |
| Entity Coverage              | -      | -      | -            | -         | 36210 / 51282 (70.6%) |

---

## Analysis

### AUC
- Both models show good overall ranking ability.
- Category + Entity slightly outperforms Category Only.

### nDCG@5
- Both models achieve similar nDCG@5 around 0.044.
- Indicates limited effectiveness in top-5 recommendation quality.

### Precision@5 and Recall@5
- Precision is low for both models, between 1.4% and 1.8%.
- Recall is slightly higher for Category Only.

### HitRate@5
- About 3% of users received at least one correct recommendation.
- HitRate is the same for both models.

### Entity Coverage
- 70.6% of news items have valid entity vectors.
- Entity vectors contribute to AUC improvement but have minimal effect on top-k metrics.

---

## Conclusions

1. Content-Based models significantly outperform User-Based models in ranking capability (AUC > 0.77 vs. 0.5).
2. Top-k recommendation effectiveness remains low.
3. Entity embeddings improve AUC but do not significantly enhance precision or recall.
