# Learning-to-Rank Content-Based Recommender

In [6]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
from xgboost import XGBRanker

## Feature Extraction

### Loading Data

In [7]:
%%bash
# Check if KuaiRec.zip already exists
if [ ! -f KuaiRec.zip ]; then
    wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE&export=download&confirm=t&uuid=b2002093-cc6e-4bd5-be47-9603f0b33470
' -O KuaiRec.zip
    unzip KuaiRec.zip -d data_final_project
fi

In [8]:
interactions_raw = pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")
user_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/user_features.csv")
item_daily_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_daily_features.csv")
item_categories = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_categories.csv")
big_matrix = pd.read_csv("data_final_project/KuaiRec 2.0/data/big_matrix.csv")

### Creation of the "Combined" Column

In [9]:
interactions = interactions_raw.copy()
interactions["is_like"] = interactions["watch_ratio"].apply(lambda x: 1 if x >= 2 else 0)
interactions = interactions.drop(columns=["play_duration", "video_duration", "time", "date", "timestamp"])

interactions.head()

Unnamed: 0,user_id,video_id,watch_ratio,is_like
0,14,148,0.722103,0
1,14,183,1.907377,0
2,14,3649,2.063311,1
3,14,5262,0.566388,0
4,14,8234,0.418364,0


In [10]:
item_daily_features_raw = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_daily_features.csv")

cols = set(item_daily_features_raw.columns)
cols_to_keep = set(["video_id", "author_id", "music_id", "comment_cnt", "like_cnt", "share_cnt", "show_cnt"])
cols_to_drop = cols - set(cols_to_keep)

video_features = item_daily_features_raw.copy()
video_features = video_features.drop(columns=cols_to_drop)

video_features = video_features.groupby(by=["video_id"]).first().reset_index()

video_features.head()

Unnamed: 0,video_id,author_id,music_id,show_cnt,like_cnt,comment_cnt,share_cnt
0,0,3309,3350323409,14665,573,11,2
1,1,4978,1812462382,17829,1748,14,16
2,2,939,0,43615,244,3,1
3,3,5889,0,1309,132,1,7
4,4,4284,3442844592,103,1,0,0


In [11]:
item_categories_raw = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_categories.csv")
video_tags = item_categories_raw.copy()

In [12]:
videos = video_features.merge(video_tags, on="video_id", how="left")
videos = videos.rename(columns={"author_id": "creator_id", "feat": "tag_list"})

videos.head()

Unnamed: 0,video_id,creator_id,music_id,show_cnt,like_cnt,comment_cnt,share_cnt,tag_list
0,0,3309,3350323409,14665,573,11,2,[8]
1,1,4978,1812462382,17829,1748,14,16,"[27, 9]"
2,2,939,0,43615,244,3,1,[9]
3,3,5889,0,1309,132,1,7,[26]
4,4,4284,3442844592,103,1,0,0,[5]


In [13]:
# Combine tag_list, music_id, creator_id as features
videos['tag_list'] = videos['tag_list'].fillna('').apply(lambda x: x.replace(',', ' '))
videos['tag_list'] = videos['tag_list'].apply(lambda x: x.replace('[', ''))
videos['tag_list'] = videos['tag_list'].apply(lambda x: x.replace(']', ''))
#videos['combined'] = videos['music_id'].astype(str) + ' ' + videos['creator_id'].astype(str) + ' ' + videos['tag_list'] + ' ' + videos["comment_cnt"].astype(str) + ' ' + videos["share_cnt"].astype(str) + ' ' + videos["like_cnt"].astype(str)
videos['combined'] = ''
for col in cols_to_keep - {'video_id', 'author_id', 'feat'}:
    videos['combined'] =  videos['combined'] + ' ' + videos[col].astype(str)
videos['combined'] = videos['combined'] + videos['creator_id'].astype(str) + ' ' + videos['tag_list']

#videos['combined'] = videos['tag_list']

# Vectorize using TF-IDF
tfidf = TfidfVectorizer(max_features=100)
video_features = tfidf.fit_transform(videos['combined'])

# Build mapping from video_id to vector
from scipy.sparse import csr_matrix
import numpy as np

video_id_to_idx = {vid: i for i, vid in enumerate(videos['video_id'])}
video_feature_dict = {
    vid: video_features[video_id_to_idx[vid]] for vid in videos['video_id']
}


In [14]:
videos.head()

Unnamed: 0,video_id,creator_id,music_id,show_cnt,like_cnt,comment_cnt,share_cnt,tag_list,combined
0,0,3309,3350323409,14665,573,11,2,8,14665 3350323409 11 573 23309 8
1,1,4978,1812462382,17829,1748,14,16,27 9,17829 1812462382 14 1748 164978 27 9
2,2,939,0,43615,244,3,1,9,43615 0 3 244 1939 9
3,3,5889,0,1309,132,1,7,26,1309 0 1 132 75889 26
4,4,4284,3442844592,103,1,0,0,5,103 3442844592 0 1 04284 5


## Prepare LTR Dataset

In [15]:

# For this example, use binary label: is_like
interactions = interactions[interactions['video_id'].isin(video_id_to_idx)]
interactions = interactions[['user_id', 'video_id', 'is_like']]

# Generate (user, video) features
user_video_pairs = []
X = []
y = []

for user_id, user_df in tqdm(interactions.groupby('user_id'), desc="Building features"):
    for _, row in user_df.iterrows():
        video_id = row['video_id']
        if video_id in video_feature_dict:
            X.append(video_feature_dict[video_id].toarray()[0])  # convert from sparse to dense
            y.append(row['is_like'])  # label: 1 if liked, else 0
            user_video_pairs.append(user_id)

X = np.array(X)
y = np.array(y)

Building features: 100%|██████████| 1411/1411 [00:41<00:00, 33.93it/s]


## Group Structure for Learning-to-Rank

In [16]:
user_counts = Counter(user_video_pairs)
group = [user_counts[u] for u in sorted(user_counts.keys(), key=lambda x: user_video_pairs.index(x))]

# Train XGBoost Ranker

In [17]:
ltr_model = XGBRanker(
    objective='rank:ndcg',
    eval_metric='ndcg',
    booster='gbtree',
    eta=0.1,
    max_depth=5,
    n_estimators=100,
    random_state=42
)

ltr_model.fit(X, y, group=group)

## Make Recommendations

In [18]:
def recommend_for_user(user_id, candidate_video_ids, top_k=10):
    features = []
    vids = []

    for vid in candidate_video_ids:
        if vid in video_feature_dict:
            features.append(video_feature_dict[vid].toarray()[0])
            vids.append(vid)

    preds = ltr_model.predict(np.array(features))
    top_indices = np.argsort(preds)[-top_k:][::-1]
    return [vids[i] for i in top_indices]

## Evaluation

In [19]:
def evaluate_topk_metrics(y_true, top_k_preds, k=5):
    top_k = top_k_preds[:k]
    relevant = set(y_true)
    hits = [1 if item in relevant else 0 for item in top_k]

    precision = sum(hits) / k
    recall = sum(hits) / len(relevant) if relevant else 0.0
    dcg = sum(hit / np.log2(i + 2) for i, hit in enumerate(hits))
    ideal_hits = [1] * min(len(relevant), k)
    idcg = sum(1 / np.log2(i + 2) for i in range(len(ideal_hits)))
    ndcg = dcg / idcg if idcg != 0 else 0.0

    # MAP@k: mean average precision
    ap_sum = 0.0
    hit_count = 0
    for i, hit in enumerate(hits):
        if hit:
            hit_count += 1
            ap_sum += hit_count / (i + 1)
    map_k = ap_sum / min(len(relevant), k) if relevant else 0.0

    print(f"Precision@{k}: {precision}")
    print(f"Recall@{k}: {recall}")
    print(f"NDCG@{k}: {ndcg}")
    print(f"MAP@{k}: {map_k}")

In [20]:
seen_videos = interactions[interactions['user_id'] == user_id]['video_id'].tolist()

In [21]:
k = 10
user_id = list(user_counts.keys())[0]

top_recommendations = recommend_for_user(user_id, seen_videos, k)
print(f"Top Recommendations for user {user_id}:", top_recommendations)

y_true = interactions[(interactions['user_id'] == user_id) & (interactions["is_like"] == 1)]["video_id"].tolist()
evaluate_topk_metrics(y_true, top_recommendations, k)

Top Recommendations for user 14: [4040, 4123, 723, 8524, 166, 9157, 7594, 9261, 7737, 619]
Precision@10: 0.7
Recall@10: 0.03910614525139665
NDCG@10: 0.7503359712972109
MAP@10: 0.5833333333333333
