In [1]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
items = pd.read_csv("kaggle_data/items.csv")
interactions = pd.read_csv("kaggle_data/interactions_train.csv")
train_df = pd.read_csv("kaggle_data/train_data.csv")
test_df = pd.read_csv("kaggle_data/test_data.csv")

In [3]:
n_users = np.max(interactions.u) + 1
n_items = np.max(interactions.i) + 1
print(f'Number of users = {n_users},\nNumber of items = {n_items},\nNumber of interactions = {len(interactions)}')

Number of users = 7838,
Number of items = 15291,
Number of interactions = 87047


# 1. Read the embeddings

In [4]:
def create_embedding_df(file_name):
    results_df = pd.DataFrame(columns=["i", "custom_id", "vector"])
    with open(file_name, "r") as file:
        for line in tqdm(file):
            # Parsing the JSON string into a dict and appending to the list of results
            json_object = json.loads(line.strip())
            custom_id = json_object["custom_id"]
            i = custom_id.split("-")[1]
            vector = json_object["response"]["body"]["data"][0]["embedding"]
            vector = np.array(vector)
            # Add the new row to the DataFrame
            results_df = pd.concat(
                [
                    results_df,
                    pd.DataFrame(
                        {"i": [i], "custom_id": [custom_id], "vector": [vector]}
                    ),
                ],
                ignore_index=True,
            )

    return results_df

In [5]:
semantic_tags = create_embedding_df("./items_embedding/emb_semantic_tags_output.jsonl")
subgenre = create_embedding_df("./items_embedding/emb_subgenre_output.jsonl")
summary = create_embedding_df("./items_embedding/emb_summary_output.jsonl")
target_audience = create_embedding_df("./items_embedding/emb_target_audience_output.jsonl")
themes = create_embedding_df("./items_embedding/emb_themes_output.jsonl")
tone_mood = create_embedding_df("./items_embedding/emb_tone_mood_output.jsonl")

0it [00:00, ?it/s]

15291it [00:06, 2422.50it/s]
15291it [00:06, 2405.99it/s]
15291it [00:06, 2366.21it/s]
15291it [00:06, 2345.41it/s]
15291it [00:06, 2319.78it/s]
15291it [00:06, 2321.44it/s]


In [8]:
def check_index(df):
    if (df.index == df.i.astype(int)).all():
        return True
    else:
        return False

In [9]:
assert len(semantic_tags) == len(subgenre) == len(summary) == len(target_audience) == len(themes) == len(tone_mood) == len(items), "DataFrames have different lengths"
assert check_index(semantic_tags), "Index of semantic_tags is not correct"
assert check_index(subgenre), "Index of subgenre is not correct"
assert check_index(summary), "Index of summary is not correct"
assert check_index(target_audience), "Index of target_audience is not correct"
assert check_index(themes), "Index of themes is not correct"
assert check_index(tone_mood), "Index of tone_mood is not correct"

# 2. CF

In [None]:
def create_data_matrix(data, n_users, n_items):
    """
    This function returns a numpy matrix with shape (n_users, n_items).
    Each entry is a binary value indicating positive interaction.
    """
    data_matrix = np.zeros((n_users, n_items))
    for row in data.itertuples():
        user_id = getattr(row, "u")
        item_id = getattr(row, "i")
        rating = getattr(row, "rating")
        data_matrix[user_id, item_id] = rating
    return data_matrix


def item_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on item-item similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The item-item similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    pred = similarity.dot(interactions.T) / (
        similarity.sum(axis=1)[:, np.newaxis] + epsilon
    )
    return pred.T


def user_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on user-user similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The user-user similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    pred = similarity.dot(interactions) / (
        np.abs(similarity).sum(axis=1)[:, np.newaxis] + epsilon
    )
    return pred


def add_rating(df):
    interactions = df.copy()
    interactions["rating"] = (interactions["t"] - interactions["t"].min()) / (
        interactions["t"].max() - interactions["t"].min()
    )
    interactions_time = interactions.groupby(["u", "i"], as_index=False).agg(
        t_max=("rating", "max"), t_count=("rating", "count")
    )
    interactions_time["rating"] = (
        np.exp(2 * interactions_time["t_max"]) * interactions_time["t_count"]
    )
    interactions_time["rating"] = np.log1p(interactions_time["rating"]) * 100
    return interactions_time


def print_map_10(test_df, prediction, top_k=10):
    ground_truth = test_df.groupby("u")["i"].apply(list).to_dict()

    user_ids = np.array(range(n_users))
    item_ids = np.array(range(n_items))

    top10_preds = {}

    for idx, user in enumerate(user_ids):
        scores = prediction[idx]
        top_indices = np.argsort(scores)[::-1][:top_k]
        top_items = item_ids[top_indices]
        top10_preds[user] = top_items.tolist()

    def average_precision_at_k(predicted, actual, k=10):
        if not actual:
            return 0.0
        score = 0.0
        num_hits = 0
        for i, p in enumerate(predicted[:k]):
            if p in actual:
                num_hits += 1
                score += num_hits / (i + 1)
        return score / min(len(actual), k)

    ap_scores = [
        average_precision_at_k(top10_preds[user], ground_truth[user], k=10)
        for user in user_ids
        if user in top10_preds
    ]

    map10 = np.mean(ap_scores)
    print(f"MAP@10: {map10:.4f}")
    return map10


def print_map_10_from_csv(test_df: pd.DataFrame, submission_df: pd.DataFrame) -> float:
    ground_truth = test_df.groupby("u")["i"].apply(list).to_dict()

    predictions = submission_df.set_index("user_id")["recommendation"].apply(
        lambda x: list(map(int, x.strip().split()[:10]))
    ).to_dict()

    def average_precision_at_k(predicted, actual, k=10):
        if not actual:
            return 0.0
        score = 0.0
        num_hits = 0
        for i, p in enumerate(predicted[:k]):
            if p in actual:
                num_hits += 1
                score += num_hits / (i + 1)
        return score / min(len(actual), k)

    ap_scores = []
    for user, actual_items in ground_truth.items():
        if user in predictions:
            predicted_items = predictions[user]
            ap = average_precision_at_k(predicted_items, actual_items, k=10)
            ap_scores.append(ap)

    map_10 = np.mean(ap_scores)
    print(f"MAP@10: {map_10:.4f}")
    return map_10


In [11]:
train_df_rating = add_rating(train_df)

In [12]:
train_df_rating

Unnamed: 0,u,i,t_max,t_count,rating
0,0,0,0.133800,1,83.587130
1,0,1,0.144325,1,84.785132
2,0,2,0.144646,1,84.821868
3,0,3,0.196436,2,137.686485
4,0,4,0.247164,1,97.055006
...,...,...,...,...,...
49684,7835,4820,0.992263,1,211.331195
49685,7836,3471,0.995048,1,211.820981
49686,7836,14550,0.995043,1,211.820046
49687,7837,88,0.996661,1,212.104870


In [13]:
train_matrix = create_data_matrix(train_df_rating, n_users, n_items)
user_similarity = cosine_similarity(train_matrix)
train_prediction = user_based_predict(train_matrix, user_similarity)

In [14]:
print_map_10(test_df, train_prediction)

MAP@10: 0.1681


0.16811814107736978

In [15]:
item_similarity = cosine_similarity(train_matrix.T)
train_item_prediction = item_based_predict(train_matrix, item_similarity)

In [16]:
print_map_10(test_df, train_item_prediction)

MAP@10: 0.1472


0.14724122267890646

# 3. Add embeddings

In [None]:
def get_book_similarity_matrix(vector_df):
    book_matrix = np.vstack(vector_df.vector.to_list())
    book_similarity = cosine_similarity(book_matrix)
    return book_similarity


def fast_recommend_to_df(interactions_df, similarity_matrix, top_n=10):
    user_ids = interactions_df['u'].unique()
    num_users = len(user_ids)
    num_items = similarity_matrix.shape[0]

    user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
    index_to_user_id = {idx: uid for uid, idx in user_id_to_index.items()}

    interactions_df = interactions_df.sort_values(["u", "t_max"]).reset_index(drop=True)
    interactions_df["pct_rank"] = interactions_df.groupby("u")["t_max"].rank(pct=True, method='dense')

    interaction_matrix = np.zeros((num_users, num_items))
    for _, row in interactions_df.iterrows():
        u_idx = user_id_to_index[row['u']]
        i_idx = int(row['i'])
        if row['pct_rank'] < 0.7:
            interaction_matrix[u_idx, i_idx] = 0
        else:
            interaction_matrix[u_idx, i_idx] = np.exp(2 * row["t_max"])

    score_matrix = interaction_matrix @ similarity_matrix

    records = []
    for u_idx in range(num_users):
        top_items = np.argpartition(score_matrix[u_idx], -top_n)[-top_n:]
        top_items_sorted = top_items[np.argsort(score_matrix[u_idx][top_items])[::-1]]
        records.append({
            "user_id": index_to_user_id[u_idx],
            "recommendation": " ".join([str(x) for x in top_items_sorted])
        })

    recommendations_df = pd.DataFrame(records)
    return recommendations_df, score_matrix


def row_min_max_normalize(score_matrix):
    min_vals = np.min(score_matrix, axis=1, keepdims=True)
    max_vals = np.max(score_matrix, axis=1, keepdims=True)

    denom = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
    normalized = (score_matrix - min_vals) / denom
    return normalized

In [18]:
semantic_book_similarity = get_book_similarity_matrix(semantic_tags)
tone_book_similarity = get_book_similarity_matrix(tone_mood)
subgenre_book_similarity = get_book_similarity_matrix(subgenre)
summary_book_similarity = get_book_similarity_matrix(summary)
target_audience_book_similarity = get_book_similarity_matrix(target_audience)
themes_book_similarity = get_book_similarity_matrix(themes)

In [19]:
semantic_recommend_df, semantic_score_matrix = fast_recommend_to_df(train_df_rating, semantic_book_similarity)
tone_recommend_df, tone_score_matrix = fast_recommend_to_df(train_df_rating, tone_book_similarity)
subgenre_recommend_df, subgenre_score_matrix = fast_recommend_to_df(train_df_rating, subgenre_book_similarity)
summary_recommend_df, summary_score_matrix = fast_recommend_to_df(train_df_rating, summary_book_similarity)
target_audience_recommend_df, target_audience_score_matrix = fast_recommend_to_df(train_df_rating, target_audience_book_similarity)
themes_recommend_df, themes_score_matrix = fast_recommend_to_df(train_df_rating, themes_book_similarity)

In [20]:
score_cf_normalized = row_min_max_normalize(train_prediction)
score_semantic_normalized = row_min_max_normalize(semantic_score_matrix)
score_tone_normalized = row_min_max_normalize(tone_score_matrix)
score_subgenre_normalized = row_min_max_normalize(subgenre_score_matrix)
score_summary_normalized = row_min_max_normalize(summary_score_matrix)
score_target_audience_normalized = row_min_max_normalize(target_audience_score_matrix)
score_themes_normalized = row_min_max_normalize(themes_score_matrix)

In [29]:
score_hybrid = 0.92 * np.log1p(score_cf_normalized) + \
      0.2 * score_semantic_normalized + \
      0.0 * score_tone_normalized + \
      0.0 * np.log1p(score_subgenre_normalized) + \
      0.8 * np.log1p(score_summary_normalized) + \
      0.0 * np.log1p(score_target_audience_normalized) + \
      0.01 * np.log1p(score_themes_normalized)

In [30]:
print_map_10(test_df, score_hybrid)

MAP@10: 0.1772


0.17716294016330694

In [396]:
results_score_hybrid = pd.DataFrame(columns=["user_id", "recommendation"])
results_score_hybrid["user_id"] = np.arange(n_users)
results_score_hybrid["recommendation"] = [
    " ".join(map(str, np.argsort(score_hybrid[i, :])[-10:][::-1]))
    for i in range(n_users)
]

In [None]:
print_map_10_from_csv(test_df, results_score_hybrid)

### Use full data

In [31]:
full_data_with_rating = add_rating(interactions)

In [32]:
full_matrix = create_data_matrix(full_data_with_rating, n_users, n_items)
user_similarity = cosine_similarity(full_matrix)
full_prediction = user_based_predict(full_matrix, user_similarity)

In [33]:
_, semantic_score_matrix_full = fast_recommend_to_df(full_data_with_rating, semantic_book_similarity)
# _, tone_score_matrix_full = fast_recommend_to_df(full_data_with_rating, tone_book_similarity)
# _, subgenre_score_matrix_full = fast_recommend_to_df(full_data_with_rating, subgenre_book_similarity)
_, summary_score_matrix_full = fast_recommend_to_df(full_data_with_rating, summary_book_similarity)
# _, target_audience_score_matrix_full = fast_recommend_to_df(full_data_with_rating, target_audience_book_similarity)
_, themes_score_matrix_full = fast_recommend_to_df(full_data_with_rating, themes_book_similarity)

In [34]:
score_cf_normalized_full = row_min_max_normalize(full_prediction)
score_semantic_normalized_full = row_min_max_normalize(semantic_score_matrix_full)
# score_tone_normalized_full = row_min_max_normalize(tone_score_matrix_full)
# score_subgenre_normalized_full = row_min_max_normalize(subgenre_score_matrix_full)
score_summary_normalized_full = row_min_max_normalize(summary_score_matrix_full)
# score_target_audience_normalized_full = row_min_max_normalize(target_audience_score_matrix_full)
score_themes_normalized_full = row_min_max_normalize(themes_score_matrix_full)

In [35]:
# score_hybrid_full = 0.92 * np.log(score_cf_normalized_full + 1) + \
#       0.2 * score_semantic_normalized_full + \
#       0.746 * np.log(score_summary_normalized_full + 1)

score_hybrid_full = 0.92 * np.log1p(score_cf_normalized_full) + \
      0.2 * score_semantic_normalized_full + \
      0.8 * np.log1p(score_summary_normalized_full) + \
      0.01 * np.log1p(score_themes_normalized_full)

In [36]:
np.save("score_cf_normalized.npy", score_cf_normalized_full)
np.save("score_semantic_normalized.npy", score_semantic_normalized_full)
np.save("score_hybrid_full.npy", score_hybrid_full)

In [277]:
results = pd.DataFrame(columns=["user_id", "recommendation"])
results["user_id"] = np.arange(n_users)
results["recommendation"] = [
    " ".join(map(str, np.argsort(score_hybrid_full[i, :])[-10:][::-1]))
    for i in range(n_users)
]
results.to_csv("test_sem_sum_cf_the_92208001.csv", index=False)

# 4. Add sequence

In [37]:
import torch
import torch.nn as nn
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

MAX_INPUT_LEN = 10
PREDICT_LEN = 5
BATCH_SIZE = 64
EPOCHS = 1000
EMB_DIM = 64
HIDDEN_DIM = 128
VOCAB_SIZE = 15298

Using device: mps


In [38]:
class TransformerPredictor(nn.Module):
    def __init__(
        self, vocab_size, emb_dim, hidden_dim, output_len, max_len=MAX_INPUT_LEN
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=4, dim_feedforward=hidden_dim
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.output_layer = nn.Sequential(
            nn.Linear(emb_dim, output_len), nn.ReLU()  # 保证 offset ≥ 0
        )

    def forward(self, x):
        pad_mask = x == 0
        seq_len = x.size(1)
        emb = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        emb = emb.permute(1, 0, 2)
        encoded = self.encoder(emb, src_key_padding_mask=pad_mask)
        pooled = encoded[-1]
        return self.output_layer(pooled)

In [None]:
checkpoint = torch.load("transformer_checkpoint.pt", map_location=device)
model = TransformerPredictor(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    output_len=PREDICT_LEN
)
model.load_state_dict(checkpoint)
model.to(device)
model.eval()

In [40]:
train_user_history = train_df.groupby('u')['i'].apply(list).apply(lambda x: x[-10:] if len(x) >= 10 else x)
train_user_history

u
0                 [9, 10, 11, 13, 12, 14, 15, 16, 17, 18]
1                [31, 32, 33, 34, 35, 36, 36, 36, 37, 37]
2                [81, 81, 81, 81, 83, 84, 81, 81, 85, 86]
3       [150, 151, 152, 153, 154, 155, 156, 157, 158, ...
4       [195, 196, 197, 198, 199, 200, 201, 202, 203, ...
                              ...                        
7833                                          [975, 7322]
7834               [15276, 13891, 7128, 7128, 7128, 7128]
7835                                         [4820, 3055]
7836                                        [14550, 3471]
7837                                           [2191, 88]
Name: i, Length: 7838, dtype: object

In [41]:
def get_pred(history):
    history_tensor = torch.tensor([history], dtype=torch.long).to(device)
    with torch.no_grad():
        offset_pred = model(history_tensor)
        base = history_tensor[0, -1]
        pred_item_ids = (offset_pred + base).round()
    return [int(x) for x in pred_item_ids.cpu().tolist()[0]]

In [42]:
get_pred([9, 10, 11, 13, 12, 14, 15, 16, 17, 18])

[19, 20, 21, 22, 23]

In [301]:
results_train = pd.DataFrame(columns=["user_id", "recommendation"])
results_train["user_id"] = np.arange(n_users)

In [None]:
for idx, row in tqdm(results_train.iterrows()):
    current_user = row["user_id"]
    if current_user < 1200:
        pred = get_pred(train_user_history.iloc[current_user])[:3]
        from_scores = np.argsort(score_hybrid[current_user, :])[-7:][::-1].tolist()
        final = pred + from_scores
    else:
        final = np.argsort(score_hybrid[current_user, :])[-10:][::-1].tolist()
    results_train.at[idx, "recommendation"] = " ".join(str(x) for x in final)

7838it [00:21, 368.67it/s]


In [303]:
print_map_10_from_csv(test_df, results_train)

MAP@10: 0.1488


0.14883691439656957

In [295]:
results_train

Unnamed: 0,user_id,recommendation
0,0,19 20 13 12 15 17 16 14 11 18
1,1,38 39 37 36 29 34 33 35 38 5314
2,2,87 88 53 58 81 52 76 77 82 54
3,3,160 161 132 155 156 152 153 149 158 159
4,4,205 206 202 203 201 204 200 197 199 193
...,...,...
7833,7833,7322 975 5956 7318 3833 7306 806 4887 619 9493
7834,7834,7128 13891 15276 13952 7114 3057 4429 114 7123...
7835,7835,3055 13952 13414 2102 9054 13388 11058 3053 30...
7836,7836,3471 14550 3816 6998 611 14557 5787 1289 3960 ...


In [43]:
full_user_history = full_data_with_rating.groupby('u')['i'].apply(list).apply(lambda x: x[-10:] if len(x) >= 10 else x)
full_user_history

u
0                [15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
1                [30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
2                [85, 86, 87, 88, 89, 90, 91, 92, 93, 94]
3       [163, 164, 165, 166, 167, 168, 169, 170, 171, ...
4       [198, 199, 200, 201, 202, 203, 204, 205, 206, ...
                              ...                        
7833                                    [975, 7322, 7760]
7834                           [1367, 7128, 13891, 15276]
7835                                   [3055, 4820, 6791]
7836                                        [3471, 14550]
7837                                     [88, 2191, 2209]
Name: i, Length: 7838, dtype: object

In [44]:
results_full = pd.DataFrame(columns=["user_id", "recommendation"])
results_full["user_id"] = np.arange(n_users)

In [None]:
for idx, row in tqdm(results_full.iterrows()):
    current_user = row["user_id"]
    if current_user < 1200:
        pred = get_pred(full_user_history.iloc[current_user])[:2]
        from_scores = np.argsort(score_hybrid_full[current_user, :])[-8:][::-1].tolist()
        final = pred + from_scores
    else:
        final = np.argsort(score_hybrid_full[current_user, :])[-10:][::-1].tolist()
    results_full.at[idx, "recommendation"] = " ".join(str(x) for x in final)

7838it [00:02, 2746.67it/s] 


In [300]:
results_full.to_csv("test_pred_3.csv", index=False)