In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# ===========================
# Load Amazon Beauty .inter files
# ===========================
def load_inter_file(path):
    return pd.read_csv(
        path,
        sep="\t",  # or ' ' if space-separated
        dtype={"user_id": str, "item_id": str, "timestamp": str, "label": int},
        low_memory=False
    )

train_df = load_inter_file("amazon-beauty-train.inter")
val_df = load_inter_file("amazon-beauty-valid.inter")   # optional
test_df = load_inter_file("amazon-beauty-test.inter")

# Build last item dictionary for leave-one-out
user_last_item = train_df.groupby("user_id")["item_id"].last().to_dict()

In [3]:
print(train_df.head())
print(train_df.dtypes)

  user_id item_id    timestamp  label
0    2238     657  908755200.0      1
1    2254     661  912297600.0      1
2    2274     671  921542400.0      1
3    2355     687  928368000.0      1
4    2197     647  937267200.0      1
user_id      object
item_id      object
timestamp    object
label         int64
dtype: object


In [4]:
# ===========================
# Baseline Functions
# ===========================

# -------- Most Popular --------
def most_popular(train_df, K=10):
    """
    Count item frequencies (label=1) in train set and return top-K items.
    """
    item_counts = train_df[train_df["label"]==1]["item_id"].value_counts()
    return item_counts.index[:K].tolist()


def evaluate_most_popular(test_df, top_k_items):
    """
    Evaluate HitRate@K for Most Popular baseline.
    Only considers positive interactions (label=1).
    """
    test_pos = test_df[test_df["label"]==1]
    hits = 0
    for _, row in test_pos.iterrows():
        if row["item_id"] in top_k_items:
            hits += 1
    return hits / len(test_pos)

In [5]:
# ===========================
# Item-based KNN baseline with sparse matrix
# ===========================
# Map string IDs to integer indices
user2idx = {u: i for i, u in enumerate(train_df['user_id'].unique())}
item2idx = {i: j for j, i in enumerate(train_df['item_id'].unique())}
idx2item = {v: k for k, v in item2idx.items()}

rows = train_df['item_id'].map(item2idx)
cols = train_df['user_id'].map(user2idx)
data = train_df['label'].astype(float)

# Build sparse item-user matrix
sparse_matrix = csr_matrix((data, (rows, cols)), shape=(len(item2idx), len(user2idx)))

# Cosine similarity (sparse)
sim_matrix = cosine_similarity(sparse_matrix, dense_output=False)

# Recommendation function
def recommend_item_knn_sparse(sim_matrix, last_item, K=10):
    if last_item not in item2idx:
        return []
    idx = item2idx[last_item]
    sim_scores = np.array(sim_matrix[idx].toarray()).flatten()
    top_idx = np.argsort(-sim_scores)[1:K+1]  # skip itself
    return [idx2item[i] for i in top_idx]

# Evaluate HR@K
def evaluate_item_knn_sparse(sim_matrix, test_df, user_last_item, K=10):
    test_pos = test_df[test_df["label"]==1]
    hits = 0
    for _, row in test_pos.iterrows():
        user = row["user_id"]
        true_item = row["item_id"]
        last_item = user_last_item.get(user)
        if last_item is None:
            continue
        recs = recommend_item_knn_sparse(sim_matrix, last_item, K)
        if true_item in recs:
            hits += 1
    return hits / len(test_pos)

In [7]:
# ===========================
# Run baselines
# ===========================
# Most Popular
for K in [10, 20, 50]:
    top_items = most_popular(train_df, K=K)
    hr = evaluate_most_popular(test_df, top_items)
    print(f"Most Popular HR@{K}: {round(hr,4)}")

# Item-KNN
for K in [10, 20, 50]:
    hr = evaluate_item_knn_sparse(sim_matrix, test_df, user_last_item, K=K)
    print(f"Item-KNN HR@{K}: {round(hr,4)}")

# ===========================
# Print top recommendations for first 5 users
# ===========================
print("\nTop recommendations for first 5 users (Item-KNN, K=10):")
for user in list(user_last_item.keys())[:5]:
    last_item = user_last_item[user]
    recs = recommend_item_knn_sparse(sim_matrix, last_item, K=10)
    print(f"User {user}, last item {last_item}, recommended: {recs}")

# ===========================
# Save results
# ===========================
results = []
for K in [10, 20, 50]:
    mp_hr = evaluate_most_popular(test_df, most_popular(train_df, K=K))
    knn_hr = evaluate_item_knn_sparse(sim_matrix, test_df, user_last_item, K=K)
    results.append({"Model": f"Most Popular HR@{K}", "HR": mp_hr})
    results.append({"Model": f"Item-KNN HR@{K}", "HR": knn_hr})

pd.DataFrame(results).to_csv("baseline_results.csv", index=False)
print("\nBaseline results saved to baseline_results.csv")

Most Popular HR@10: 0.0077
Most Popular HR@20: 0.0132
Most Popular HR@50: 0.0241
Item-KNN HR@10: 0.0123
Item-KNN HR@20: 0.018
Item-KNN HR@50: 0.0294

Top recommendations for first 5 users (Item-KNN, K=10):
User 1, last item 81855, recommended: ['42447', '1', '139583', '169344', '186634', '22048', '235440', '149737', '150342', '150341']
User 10, last item 89347, recommended: ['171605', '62224', '123397', '192557', '68192', '65397', '37626', '231254', '84153', '61967']
User 100, last item 63, recommended: ['134541', '66920', '43777', '51432', '162531', '35537', '242240', '241069', '239975', '239860']
User 1000, last item 287, recommended: ['242240', '241069', '239975', '239860', '235450', '231477', '223801', '210788', '190804', '124416']
User 10000, last item 906, recommended: ['2562', '15820', '13258', '60767', '189071', '57441', '57442', '31798', '63466', '56004']

Baseline results saved to baseline_results.csv
