In [48]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
items = pd.read_csv("kaggle_data/items.csv")
interactions = pd.read_csv("kaggle_data/interactions_train.csv")
train_df = pd.read_csv("kaggle_data/train_data.csv")
test_df = pd.read_csv("kaggle_data/test_data.csv")

In [3]:
n_users = np.max(interactions.u) + 1
n_items = np.max(interactions.i) + 1
print(f'Number of users = {n_users},\nNumber of items = {n_items},\nNumber of interactions = {len(interactions)}')

Number of users = 7838,
Number of items = 15291,
Number of interactions = 87047


# 1. Pure CF

In [None]:
def create_data_matrix(data, n_users, n_items):
    """
    This function returns a numpy matrix with shape (n_users, n_items).
    Each entry is a binary value indicating positive interaction.
    """
    data_matrix = np.zeros((n_users, n_items))
    for row in data.itertuples():
        user_id = getattr(row, "u")
        item_id = getattr(row, "i")
        rating = getattr(row, "rating")
        data_matrix[user_id, item_id] = rating
    return data_matrix


def create_data_matrix_cf(data, n_users, n_items):
    """
    This function returns a numpy matrix with shape (n_users, n_items).
    Each entry is a binary value indicating positive interaction.
    """
    data_matrix = np.zeros((n_users, n_items))
    for row in data.itertuples():
        user_id = getattr(row, "u")
        item_id = getattr(row, "i")
        data_matrix[user_id, item_id] = 1
    return data_matrix


def item_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on item-item similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The item-item similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    pred = similarity.dot(interactions.T) / (
        similarity.sum(axis=1)[:, np.newaxis] + epsilon
    )
    return pred.T


def user_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on user-user similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The user-user similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    pred = similarity.dot(interactions) / (
        np.abs(similarity).sum(axis=1)[:, np.newaxis] + epsilon
    )
    return pred


def add_rating(df):
    interactions = df.copy()
    interactions["rating"] = (interactions["t"] - interactions["t"].min()) / (
        interactions["t"].max() - interactions["t"].min()
    )
    interactions_time = interactions.groupby(["u", "i"], as_index=False).agg(
        t_max=("rating", "max"), t_count=("rating", "count")
    )
    interactions_time["rating"] = (
        np.exp(2 * interactions_time["t_max"]) * interactions_time["t_count"]
    )
    interactions_time["rating"] = np.log1p(interactions_time["rating"]) * 100
    return interactions_time


def print_map_10(test_df, prediction, top_k=10):
    ground_truth = test_df.groupby("u")["i"].apply(list).to_dict()

    user_ids = np.array(range(n_users))
    item_ids = np.array(range(n_items))

    top10_preds = {}

    for idx, user in enumerate(user_ids):
        scores = prediction[idx]
        top_indices = np.argsort(scores)[::-1][:top_k]
        top_items = item_ids[top_indices]
        top10_preds[user] = top_items.tolist()

    def average_precision_at_k(predicted, actual, k=10):
        if not actual:
            return 0.0
        score = 0.0
        num_hits = 0
        for i, p in enumerate(predicted[:k]):
            if p in actual:
                num_hits += 1
                score += num_hits / (i + 1)
        return score / min(len(actual), k)

    ap_scores = [
        average_precision_at_k(top10_preds[user], ground_truth[user], k=10)
        for user in user_ids
        if user in top10_preds
    ]

    map10 = np.mean(ap_scores)
    print(f"MAP@10: {map10:.4f}")
    return map10


def print_map_10_from_csv(test_df: pd.DataFrame, submission_df: pd.DataFrame) -> float:
    ground_truth = test_df.groupby("u")["i"].apply(list).to_dict()

    predictions = submission_df.set_index("user_id")["recommendation"].apply(
        lambda x: list(map(int, x.strip().split()[:10]))
    ).to_dict()

    def average_precision_at_k(predicted, actual, k=10):
        if not actual:
            return 0.0
        score = 0.0
        num_hits = 0
        for i, p in enumerate(predicted[:k]):
            if p in actual:
                num_hits += 1
                score += num_hits / (i + 1)
        return score / min(len(actual), k)

    ap_scores = []
    for user, actual_items in ground_truth.items():
        if user in predictions:
            predicted_items = predictions[user]
            ap = average_precision_at_k(predicted_items, actual_items, k=10)
            ap_scores.append(ap)

    map_10 = np.mean(ap_scores)
    print(f"MAP@10: {map_10:.4f}")
    return map_10


In [None]:
def precision_recall_at_k(prediction, ground_truth, k=10):
    """
    Calculates Precision@K and Recall@K for top-K recommendations.
    Parameters:
        prediction (numpy array): The predicted interaction matrix with scores.
        ground_truth (numpy array): The ground truth interaction matrix (binary).
        k (int): Number of top recommendations to consider.
    Returns:
        precision_at_k (float): The average precision@K over all users.
        recall_at_k (float): The average recall@K over all users.
    """
    num_users = prediction.shape[0]
    precision_at_k, recall_at_k = 0, 0

    for user in range(num_users):
        top_k_items = np.argsort(prediction[user, :])[-k:]

        relevant_items_in_top_k = np.isin(top_k_items, np.where(ground_truth[user, :] > 0)[0]).sum()

        total_relevant_items = ground_truth[user, :].sum()

        precision_at_k += relevant_items_in_top_k / k
        recall_at_k += relevant_items_in_top_k / total_relevant_items if total_relevant_items > 0 else 0

    precision_at_k /= num_users
    recall_at_k /= num_users

    return precision_at_k, recall_at_k

In [None]:
def precision_recall_at_k_from_csv(prediction, ground_truth):
    k = 10
    num_users = prediction.shape[0]
    precision_at_k, recall_at_k = 0, 0

    for user in range(num_users):
        top_k_items = prediction.loc[prediction.user_id == user, "recommendation"].iloc[0].split(' ')
        top_k_items = [int(x) for x in top_k_items]

        relevant_items_in_top_k = np.isin(top_k_items, np.where(ground_truth[user, :] > 0)[0]).sum()

        total_relevant_items = ground_truth[user, :].sum()

        precision_at_k += relevant_items_in_top_k / k
        recall_at_k += relevant_items_in_top_k / total_relevant_items if total_relevant_items > 0 else 0

    precision_at_k /= num_users
    recall_at_k /= num_users

    return precision_at_k, recall_at_k

## 1.1 User-user CF

In [7]:
train_matrix_cf = create_data_matrix_cf(train_df, n_users, n_items)
user_similarity_cf = cosine_similarity(train_matrix_cf)
train_prediction_cf = user_based_predict(train_matrix_cf, user_similarity_cf)

In [19]:
test_matrix = create_data_matrix_cf(test_df, n_users, n_items)

In [9]:
precision_recall_at_k(train_prediction_cf, test_matrix)

(0.056532278642513596, 0.2906567293761024)

In [10]:
print_map_10(test_df, train_prediction_cf)

MAP@10: 0.1576


0.15764753477303142

## 1.2 Item-item CF

In [11]:
item_similarity_cf = cosine_similarity(train_matrix_cf.T)
train_item_prediction = item_based_predict(train_matrix_cf, item_similarity_cf)

In [12]:
precision_recall_at_k(train_item_prediction, test_matrix)

(0.05561367695841055, 0.26399361388179715)

In [13]:
print_map_10(test_df, train_item_prediction)

MAP@10: 0.1443


0.14425194703163602

# 2. CF with rating

In [50]:
train_df_rating = add_rating(train_df)

In [51]:
train_matrix = create_data_matrix(train_df_rating, n_users, n_items)
user_similarity = cosine_similarity(train_matrix)
train_prediction = user_based_predict(train_matrix, user_similarity)

In [16]:
precision_recall_at_k(train_prediction, test_matrix)

(0.0611635621332006, 0.2954780242458611)

In [52]:
print_map_10(test_df, train_prediction)

MAP@10: 0.1673


0.16725996993332032

# 3. Puring embedding

In [7]:
def create_embedding_df(file_name):
    results_df = pd.DataFrame(columns=["i", "custom_id", "vector"])
    with open(file_name, "r") as file:
        for line in tqdm(file):
            # Parsing the JSON string into a dict and appending to the list of results
            json_object = json.loads(line.strip())
            custom_id = json_object["custom_id"]
            i = custom_id.split("-")[1]
            vector = json_object["response"]["body"]["data"][0]["embedding"]
            vector = np.array(vector)
            # Add the new row to the DataFrame
            results_df = pd.concat(
                [
                    results_df,
                    pd.DataFrame(
                        {"i": [i], "custom_id": [custom_id], "vector": [vector]}
                    ),
                ],
                ignore_index=True,
            )

    return results_df

In [8]:
semantic_tags = create_embedding_df("./items_embedding/emb_semantic_tags_output.jsonl")
subgenre = create_embedding_df("./items_embedding/emb_subgenre_output.jsonl")
summary = create_embedding_df("./items_embedding/emb_summary_output.jsonl")
target_audience = create_embedding_df("./items_embedding/emb_target_audience_output.jsonl")
themes = create_embedding_df("./items_embedding/emb_themes_output.jsonl")
tone_mood = create_embedding_df("./items_embedding/emb_tone_mood_output.jsonl")

0it [00:00, ?it/s]

15291it [00:06, 2341.32it/s]
15291it [00:06, 2274.68it/s]
15291it [00:06, 2290.09it/s]
15291it [00:06, 2315.12it/s]
15291it [00:06, 2334.95it/s]
15291it [00:06, 2291.47it/s]


In [9]:
def check_index(df):
    if (df.index == df.i.astype(int)).all():
        return True
    else:
        return False

In [10]:
assert len(semantic_tags) == len(subgenre) == len(summary) == len(target_audience) == len(themes) == len(tone_mood) == len(items), "DataFrames have different lengths"
assert check_index(semantic_tags), "Index of semantic_tags is not correct"
assert check_index(subgenre), "Index of subgenre is not correct"
assert check_index(summary), "Index of summary is not correct"
assert check_index(target_audience), "Index of target_audience is not correct"
assert check_index(themes), "Index of themes is not correct"
assert check_index(tone_mood), "Index of tone_mood is not correct"

In [None]:
def get_book_similarity_matrix(vector_df):
    book_matrix = np.vstack(vector_df.vector.to_list())
    book_similarity = cosine_similarity(book_matrix)
    return book_similarity


def fast_recommend_to_df(interactions_df, similarity_matrix, top_n=10):
    user_ids = interactions_df['u'].unique()
    num_users = len(user_ids)
    num_items = similarity_matrix.shape[0]

    user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
    index_to_user_id = {idx: uid for uid, idx in user_id_to_index.items()}

    interactions_df = interactions_df.sort_values(["u", "t_max"]).reset_index(drop=True)
    interactions_df["pct_rank"] = interactions_df.groupby("u")["t_max"].rank(pct=True, method='dense')

    interaction_matrix = np.zeros((num_users, num_items))
    for _, row in interactions_df.iterrows():
        u_idx = user_id_to_index[row['u']]
        i_idx = int(row['i'])
        if row['pct_rank'] < 0.7:
            interaction_matrix[u_idx, i_idx] = 0
        else:
            interaction_matrix[u_idx, i_idx] = np.exp(2 * row["t_max"])

    score_matrix = interaction_matrix @ similarity_matrix

    records = []
    for u_idx in range(num_users):
        top_items = np.argpartition(score_matrix[u_idx], -top_n)[-top_n:]
        top_items_sorted = top_items[np.argsort(score_matrix[u_idx][top_items])[::-1]]
        records.append({
            "user_id": index_to_user_id[u_idx],
            "recommendation": " ".join([str(x) for x in top_items_sorted])
        })

    recommendations_df = pd.DataFrame(records)
    return recommendations_df, score_matrix


def row_min_max_normalize(score_matrix):
    min_vals = np.min(score_matrix, axis=1, keepdims=True)
    max_vals = np.max(score_matrix, axis=1, keepdims=True)

    denom = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
    normalized = (score_matrix - min_vals) / denom
    return normalized

In [12]:
semantic_book_similarity = get_book_similarity_matrix(semantic_tags)
tone_book_similarity = get_book_similarity_matrix(tone_mood)
subgenre_book_similarity = get_book_similarity_matrix(subgenre)
summary_book_similarity = get_book_similarity_matrix(summary)
target_audience_book_similarity = get_book_similarity_matrix(target_audience)
themes_book_similarity = get_book_similarity_matrix(themes)

In [16]:
semantic_recommend_df, semantic_score_matrix = fast_recommend_to_df(train_df_rating, semantic_book_similarity)
tone_recommend_df, tone_score_matrix = fast_recommend_to_df(train_df_rating, tone_book_similarity)
subgenre_recommend_df, subgenre_score_matrix = fast_recommend_to_df(train_df_rating, subgenre_book_similarity)
summary_recommend_df, summary_score_matrix = fast_recommend_to_df(train_df_rating, summary_book_similarity)
target_audience_recommend_df, target_audience_score_matrix = fast_recommend_to_df(train_df_rating, target_audience_book_similarity)
themes_recommend_df, themes_score_matrix = fast_recommend_to_df(train_df_rating, themes_book_similarity)

In [17]:
score_cf_normalized = row_min_max_normalize(train_prediction)
score_semantic_normalized = row_min_max_normalize(semantic_score_matrix)
score_tone_normalized = row_min_max_normalize(tone_score_matrix)
score_subgenre_normalized = row_min_max_normalize(subgenre_score_matrix)
score_summary_normalized = row_min_max_normalize(summary_score_matrix)
score_target_audience_normalized = row_min_max_normalize(target_audience_score_matrix)
score_themes_normalized = row_min_max_normalize(themes_score_matrix)

In [20]:
precision_recall_at_k(semantic_score_matrix, test_matrix)

(0.03914263842817062, 0.2289522321277847)

In [21]:
print_map_10(test_df, semantic_score_matrix)

MAP@10: 0.1320


0.1319927161122262

# 4. Hybrid

In [22]:
score_hybrid = 0.92 * np.log1p(score_cf_normalized) + \
      0.2 * score_semantic_normalized + \
      0.0 * score_tone_normalized + \
      0.0 * np.log1p(score_subgenre_normalized) + \
      0.8 * np.log1p(score_summary_normalized) + \
      0.0 * np.log1p(score_target_audience_normalized) + \
      0.01 * np.log1p(score_themes_normalized)

In [24]:
precision_recall_at_k(score_hybrid, test_matrix)

(0.06358764991069532, 0.31800260329478647)

In [23]:
print_map_10(test_df, score_hybrid)

MAP@10: 0.1772


0.17716294016330694

# 5. Sequence

In [25]:
import torch
import torch.nn as nn
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

MAX_INPUT_LEN = 10
PREDICT_LEN = 5
BATCH_SIZE = 64
EPOCHS = 1000
EMB_DIM = 64
HIDDEN_DIM = 128
VOCAB_SIZE = 15298

Using device: mps


In [26]:
class TransformerPredictor(nn.Module):
    def __init__(
        self, vocab_size, emb_dim, hidden_dim, output_len, max_len=MAX_INPUT_LEN
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=4, dim_feedforward=hidden_dim
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.output_layer = nn.Sequential(
            nn.Linear(emb_dim, output_len), nn.ReLU()  # 保证 offset ≥ 0
        )

    def forward(self, x):
        pad_mask = x == 0
        seq_len = x.size(1)
        emb = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        emb = emb.permute(1, 0, 2)
        encoded = self.encoder(emb, src_key_padding_mask=pad_mask)
        pooled = encoded[-1]
        return self.output_layer(pooled)

In [None]:
checkpoint = torch.load("transformer_checkpoint.pt", map_location=device)
model = TransformerPredictor(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    output_len=PREDICT_LEN
)
model.load_state_dict(checkpoint)
model.to(device)
model.eval()

In [28]:
train_user_history = train_df.groupby('u')['i'].apply(list).apply(lambda x: x[-10:] if len(x) >= 10 else x)
train_user_history

u
0                 [9, 10, 11, 13, 12, 14, 15, 16, 17, 18]
1                [31, 32, 33, 34, 35, 36, 36, 36, 37, 37]
2                [81, 81, 81, 81, 83, 84, 81, 81, 85, 86]
3       [150, 151, 152, 153, 154, 155, 156, 157, 158, ...
4       [195, 196, 197, 198, 199, 200, 201, 202, 203, ...
                              ...                        
7833                                          [975, 7322]
7834               [15276, 13891, 7128, 7128, 7128, 7128]
7835                                         [4820, 3055]
7836                                        [14550, 3471]
7837                                           [2191, 88]
Name: i, Length: 7838, dtype: object

In [29]:
def get_pred(history):
    history_tensor = torch.tensor([history], dtype=torch.long).to(device)
    with torch.no_grad():
        offset_pred = model(history_tensor)
        base = history_tensor[0, -1]
        pred_item_ids = (offset_pred + base).round()
    return [int(x) for x in pred_item_ids.cpu().tolist()[0]]

In [30]:
results_train = pd.DataFrame(columns=["user_id", "recommendation"])
results_train["user_id"] = np.arange(n_users)

In [33]:
for idx, row in tqdm(results_train.iterrows()):
    current_user = row["user_id"]
    if current_user < 1200:
        pred = get_pred(train_user_history.iloc[current_user])[:3]
        from_scores = np.argsort(score_hybrid[current_user, :])[-7:][::-1].tolist()
        final = pred + from_scores
    else:
        final = np.argsort(score_hybrid[current_user, :])[-10:][::-1].tolist()
    results_train.at[idx, "recommendation"] = " ".join(str(x) for x in final)

7838it [00:08, 977.62it/s] 


In [47]:
precision_recall_at_k_from_csv(results_train, test_matrix)

(0.07036233733095643, 0.34686592860705867)

In [34]:
print_map_10_from_csv(test_df, results_train)

MAP@10: 0.1958


0.19580166326025936