In [1]:
import os, math, numpy as np, pandas as pd
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

## Parameters

In [2]:
DATA_DIR = "../data"
Z_THRESHOLD = 0.0
RATING_MIN = 0.5
RATING_MAX = 5.0
TEST_SIZE = 0.10
VAL_SIZE = 0.10
SEED = 42
SAMPLE_USER = 72313
TOP_N = 10
SHOW_ROWS = 50
N_FEATURES_FS = 300

## Load data

In [3]:
ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")

genome_scores = pd.read_csv(f"{DATA_DIR}/genome-scores.csv")
genome_tags = pd.read_csv(f"{DATA_DIR}/genome-tags.csv")


In [4]:
df = pd.merge(ratings, movies, on='movieId')
display(df.head())
print(f"Dataset: {len(df):,} ratings | {df['userId'].nunique():,} users | {df['movieId'].nunique():,} movies")

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


Dataset: 25,000,095 ratings | 162,541 users | 59,047 movies


## Per‑user z‑score (remove bias)

Mỗi người dùng có thang chấm điểm khác nhau. Ví dụ, một người có thể chấm 4/5 là "rất thích", trong khi người khác chấm 4/5 là "bình thường". Chuẩn hóa z-score giúp so sánh đánh giá giữa các người dùng một cách công bằng.

In [5]:
user_stats = df.groupby("userId")["rating"].agg(["mean", "std"]).rename(columns={"mean": "mu", "std": "sigma"})
user_stats

Unnamed: 0_level_0,mu,sigma
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.814286,1.004235
2,3.630435,1.457728
3,3.697409,0.599854
4,3.378099,1.116927
5,3.752475,0.931729
...,...,...
162537,4.039604,0.958340
162538,3.415584,1.216452
162539,4.510638,0.718463
162540,3.829545,1.200781


**Z-score:** `z = (x - mu) / sigma`, trong đó `x` là đánh giá, `mu` là trung bình đánh giá của người dùng, `sigma` là độ lệch chuẩn.

- Z-score biểu thị đánh giá lệch bao nhiêu so với trung bình của người dùng, chuẩn hóa về đơn vị độ lệch chuẩn.

- Nếu `sigma=0` (người dùng chỉ chấm một giá trị duy nhất), phép chia sẽ gây lỗi (chia cho 0). Thay bằng 1e-6 để tránh lỗi và giữ z-score hợp lý.

In [6]:
df = df.join(user_stats, on="userId")
df["rating_z"] = (df["rating"] - df["mu"]) / df["sigma"].replace(0, 1e-6)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,mu,sigma,rating_z
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,3.814286,1.004235,1.180714
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,3.814286,1.004235,-0.31296
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,3.814286,1.004235,1.180714
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War,3.814286,1.004235,1.180714
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance,3.814286,1.004235,-0.31296


In [7]:
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp,mu,sigma,rating_z
count,25000100.0,25000100.0,25000100.0,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0,3.533854,0.9193482,-3.0581639999999995e-19
std,46791.72,39198.86,1.060744,226875800.0,0.4784993,0.238886,0.9964306
min,1.0,1.0,0.5,789652000.0,0.5,0.0,-28.24892
25%,40510.0,1196.0,3.0,1011747000.0,3.25,0.7540528,-0.6097788
50%,80914.0,2947.0,3.5,1198868000.0,3.552746,0.9008775,0.1105046
75%,121557.0,8623.0,4.0,1447205000.0,3.848468,1.063137,0.7170283
max,162541.0,209171.0,5.0,1574328000.0,5.0,2.308451,34.59683


## Encode

Các `userId` và `movieId` có thể không liên tục (ví dụ: 1, 3, 7). Mã hóa thành chỉ số liên tục (0, 1, 2, ...) giúp dễ xử lý trong ma trận và tiết kiệm bộ nhớ.

In [8]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_encoder.fit(df['userId'])
movie_encoder.fit(movies['movieId'])

df['user_idx'] = user_encoder.transform(df['userId'])
df['movie_idx'] = movie_encoder.transform(df['movieId'])

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,mu,sigma,rating_z,user_idx,movie_idx
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,3.814286,1.004235,1.180714,0,292
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,3.814286,1.004235,-0.31296,0,302
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,3.814286,1.004235,1.180714,0,303
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War,3.814286,1.004235,1.180714,0,654
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance,3.814286,1.004235,-0.31296,0,878


In [9]:
n_users = df["user_idx"].nunique()
n_items = df["movie_idx"].nunique()
print(f"Dataset: {len(df):,} ratings | {n_users:,} users | {n_items:,} movies")

Dataset: 25,000,095 ratings | 162,541 users | 59,047 movies


- Train: Dùng để xây dựng hồ sơ người dùng (user profile).

- Validation: Đánh giá hiệu suất mô hình trong quá trình phát triển, điều chỉnh tham số (như `Z_THRESHOLD`).

- Test: Đánh giá cuối cùng để báo cáo hiệu suất thực tế.

`stratify` đảm bảo rằng tỉ lệ người dùng trong các tập train, validation, test tương tự nhau. Điều này quan trọng vì mỗi người dùng có số lượng đánh giá khác nhau, và ta muốn mô hình được huấn luyện trên dữ liệu đại diện.

In [10]:
def train_val_test_plit(
        data_frame,                 # Bộ dữ liệu cần chia train/set/val được lưu dưới dạng DataFrame
        test_size=TEST_SIZE,              # Tỉ lệ tập test so với cả bộ
        val_size=VAL_SIZE,               # Tỉ lệ tập val so với cả bộ
        col="user_idx",    
        seed=SEED            
    ):
    train_val, test = train_test_split(
        data_frame, 
        test_size=test_size, 
        random_state=seed,
        stratify=data_frame[col]  
    )
    train, val = train_test_split(
        train_val, 
        test_size=val_size, 
        random_state=seed,
        stratify=train_val[col]
    ) 
    return train.reset_index(drop=True), val.reset_index(drop=True), test.reset_index(drop=True)

train_df, val_df, test_df = train_val_test_plit(df)
print("Split sizes - ", {k: len(v) for k, v in zip(["train", "val", "test"], [train_df, val_df, test_df])})

Split sizes -  {'train': 20250076, 'val': 2250009, 'test': 2500010}


## Build genome CSR matrix

Mỗi phim được biểu diễn bằng một vector các điểm `relevance` tương ứng với các tag. Ta cần tạo một ma trận **phim × tag**, trong đó mỗi ô là điểm `relevance`.

In [11]:
merged_genome = pd.merge(genome_scores, genome_tags, on='tagId', how='left')  
merged_genome['movie_idx'] = movie_encoder.transform(merged_genome['movieId'])

merged_genome

Unnamed: 0,movieId,tagId,relevance,tag,movie_idx
0,1,1,0.02875,007,0
1,1,2,0.02375,007 (series),0
2,1,3,0.06250,18th century,0
3,1,4,0.07575,1920s,0
4,1,5,0.14075,1930s,0
...,...,...,...,...,...
15584443,206499,1124,0.11000,writing,61660
15584444,206499,1125,0.04850,wuxia,61660
15584445,206499,1126,0.01325,wwii,61660
15584446,206499,1127,0.14025,zombie,61660


In [12]:
genome_matrix = merged_genome.pivot(index='movieId', columns='tag', values='relevance').fillna(0)
genome_matrix

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.02875,0.02375,0.06250,0.07575,0.14075,0.14675,0.06350,0.20375,0.20200,0.03075,...,0.04050,0.01425,0.03050,0.03500,0.14125,0.05775,0.03900,0.02975,0.08475,0.02200
2,0.04125,0.04050,0.06275,0.08275,0.09100,0.06125,0.06925,0.09600,0.07650,0.05250,...,0.05250,0.01575,0.01250,0.02000,0.12225,0.03275,0.02100,0.01100,0.10525,0.01975
3,0.04675,0.05550,0.02925,0.08700,0.04750,0.04775,0.04600,0.14275,0.02850,0.03875,...,0.06275,0.01950,0.02225,0.02300,0.12200,0.03475,0.01700,0.01800,0.09100,0.01775
4,0.03425,0.03800,0.04050,0.03100,0.06500,0.03575,0.02900,0.08650,0.03200,0.03150,...,0.05325,0.02800,0.01675,0.03875,0.18200,0.07050,0.01625,0.01425,0.08850,0.01500
5,0.04300,0.05325,0.03800,0.04100,0.05400,0.06725,0.02775,0.07650,0.02150,0.02975,...,0.05350,0.02050,0.01425,0.02550,0.19225,0.02675,0.01625,0.01300,0.08700,0.01600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205072,0.02050,0.01775,0.11400,0.03650,0.31225,0.03675,0.10700,0.37925,0.01725,0.36350,...,0.18675,0.03650,0.03025,0.12900,0.13975,0.42425,0.03400,0.02350,0.41725,0.09100
205076,0.03825,0.03150,0.03200,0.05325,0.20850,0.07050,0.06625,0.27825,0.00950,0.02750,...,0.27775,0.03225,0.04675,0.03175,0.23025,0.06300,0.04175,0.04125,0.07275,0.02350
205383,0.04100,0.04025,0.02750,0.07850,0.19750,0.17825,0.17125,0.30475,0.16825,0.04850,...,0.16525,0.03250,0.02400,0.03575,0.20400,0.08525,0.04600,0.02900,0.11725,0.03925
205425,0.04525,0.04125,0.04250,0.07425,0.11550,0.10500,0.08275,0.13575,0.16125,0.05875,...,0.25075,0.04550,0.01425,0.03925,0.21700,0.06000,0.07250,0.01500,0.11050,0.02850


In [13]:
# movieId2row = dict(zip(genome_matrix.index.values, np.arange(genome_matrix.shape[0], dtype=np.int32)))
# row2movieId = genome_matrix.index.values

## Genres Vectorization (TF-IDF)

In [14]:
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
genres_tfidf = tfidf.fit_transform(movies['genres'])
genres_df = pd.DataFrame(genres_tfidf.toarray(), columns=[f'genre:{g}' for g in tfidf.get_feature_names_out()],
                                                    index=movies['movieId'])

In [15]:
describe_matrix = pd.concat([genome_matrix, genres_df], axis=1).fillna(0)
describe_matrix

Unnamed: 0_level_0,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,genre:film-noir,genre:horror,genre:imax,genre:musical,genre:mystery,genre:romance,genre:sci-fi,genre:thriller,genre:war,genre:western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.02875,0.02375,0.06250,0.07575,0.14075,0.14675,0.06350,0.20375,0.2020,0.03075,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.04125,0.04050,0.06275,0.08275,0.09100,0.06125,0.06925,0.09600,0.0765,0.05250,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.04675,0.05550,0.02925,0.08700,0.04750,0.04775,0.04600,0.14275,0.0285,0.03875,...,0.0,0.0,0.0,0.0,0.0,0.801149,0.0,0.0,0.0,0.0
4,0.03425,0.03800,0.04050,0.03100,0.06500,0.03575,0.02900,0.08650,0.0320,0.03150,...,0.0,0.0,0.0,0.0,0.0,0.719344,0.0,0.0,0.0,0.0
5,0.04300,0.05325,0.03800,0.04100,0.05400,0.06725,0.02775,0.07650,0.0215,0.02975,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209157,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
209159,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
209163,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
209169,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [24]:
std_scaler = StandardScaler(with_mean=False)
describe_matrix_scaled = std_scaler.fit_transform(describe_matrix) 
print(describe_matrix_scaled)

[[0.7411119  0.68198834 1.0122178  ... 0.         0.         0.        ]
 [1.06333447 1.16296959 1.01626667 ... 0.         0.         0.        ]
 [1.2051124  1.59369907 0.47371793 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


## Feature Selection

In [25]:
# Tính rating trung bình cho mỗi movieId
movie_avg_rating = ratings.groupby('movieId')['rating'].mean()

# Lọc các movieId hợp lệ có trong genome data
genome_df_filtered = merged_genome[merged_genome['movieId'].isin(movie_avg_rating.index)].copy()

# Xoá trùng movieId nếu cần (giữ lại 1 dòng duy nhất)
genome_df_filtered = genome_df_filtered.drop_duplicates(subset='movieId')

# Chỉ giữ lại cột movieId
genome_df_filtered = genome_df_filtered[['movieId']].copy().reset_index(drop=True)

# Thêm cột trung bình rating
genome_df_filtered['avg_rating'] = genome_df_filtered['movieId'].map(movie_avg_rating)

In [26]:
genome_df_filtered.head()

Unnamed: 0,movieId,avg_rating
0,1,3.893708
1,2,3.251527
2,3,3.142028
3,4,2.853547
4,5,3.058434


In [27]:
def rating_to_class(rating):
    if rating <= 2.5:
        return 0  # rating thấp
    elif rating <= 4.0:
        return 1  # rating trung bình
    else:
        return 2  # rating cao

rating_class = movie_avg_rating.apply(rating_to_class)

In [28]:
y_sup = rating_class.reindex(describe_matrix.index).fillna(1).astype(int).values

if sparse.issparse(describe_matrix_scaled):
    X_dense = describe_matrix_scaled.toarray()
    X_chi = MinMaxScaler().fit_transform(describe_matrix_scaled)  
else:
    X_dense = describe_matrix_scaled        
    X_chi = MinMaxScaler().fit_transform(describe_matrix_scaled)  


# tạo 4 ma trận đặc trưng giám sát
feature_matrices = {}

# 1. Chi-squared
sel_chi = SelectKBest(chi2, k=N_FEATURES_FS).fit(X_chi, y_sup)
feature_matrices['chi2'] = sel_chi.transform(describe_matrix_scaled)

# 2. Information Gain (Entropy)
tree_e = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_dense, y_sup)
idx_e = np.argsort(tree_e.feature_importances_)[-N_FEATURES_FS:]
feature_matrices['entropy'] = X_dense[:, idx_e]

# 3. Gini Index
tree_g = DecisionTreeClassifier(criterion='gini', random_state=0).fit(X_dense, y_sup)
idx_g = np.argsort(tree_g.feature_importances_)[-N_FEATURES_FS:]
feature_matrices['gini'] = X_dense[:, idx_g]

# 4. Normalized Deviation
std_col = np.std(X_dense, axis=0)
idx_std = np.argsort(std_col)[-N_FEATURES_FS:]
feature_matrices['deviation'] = X_dense[:, idx_std]

## Build user profiles

**Làm thế nào để biểu diễn sở thích của người dùng?**

- Hồ sơ người dùng là một vector tổng hợp các tag genome của những phim họ thích (dựa trên `rating_z ≥ Z_THRESHOLD`). Vector này được tính bằng trung bình có trọng số của các vector phim.

In [30]:
def build_user_profiles(ratings_df, rating_col="rating_z", threshold=Z_THRESHOLD, eps=1e-8):
    profiles = {}
    good = ratings_df.loc[ratings_df[rating_col] >= threshold, ["userId", "movieId", rating_col]]
    
    for uid, grp in tqdm(good.groupby("userId"), desc="profiles", unit="user"):
        rows = [movieId2row[m] for m in grp.movieId if m in movieId2row]
        if not rows:
            continue

        w = grp[rating_col].values[:, None]
        
        if w.shape[0] == len(rows):
            w_sum = w.sum()

            if w_sum < eps:
                prof_dense = genome_csr[rows].mean(axis=0)
            else:
                prof_dense = (genome_csr[rows].multiply(w)).sum(axis=0) / w_sum

            prof_dense = np.nan_to_num(np.asarray(prof_dense).ravel())
            profiles[uid] = sparse.csr_matrix(prof_dense)

    return profiles

In [31]:
def content_score(uid, mid):
    if uid not in profiles or mid not in movieId2row:
        return 0.0
    vec = profiles[uid]
    if vec.nnz == 0 or np.isnan(vec.data).any():
        return 0.0
    return float(cosine_similarity(vec, genome_csr[movieId2row[mid]])[0, 0])

def scale_to_rating(sim, a=0, b=1, c=RATING_MIN, d=RATING_MAX):
    return (sim - a) / (b - a) * (d - c) + c

## Evaluation

In [32]:
def evaluate(df_subset):
    y_true = df_subset["rating"].values
    y_pred = [scale_to_rating(content_score(u, m)) for u, m in zip(df_subset["userId"], df_subset["movieId"])]
    return math.sqrt(mean_squared_error(y_true, y_pred)), mean_absolute_error(y_true, y_pred)

def _test_matrix(mat):
    global genome_csr, movieId2row, row2movieId, profiles
    genome_csr = normalize(sparse.csr_matrix(mat), axis=1, copy=False)
    movieId2row = {mid: i for i, mid in enumerate(describe_matrix.index.values)}
    row2movieId = describe_matrix.index.values
    profiles = build_user_profiles(train_df)
    return evaluate(test_df)

In [33]:
results = {}
print("\n=== Đánh giá 4 phương pháp chọn đặc trưng ===")
for name, mat in feature_matrices.items():
    rmse, mae = _test_matrix(mat)
    results[name] = (rmse, mae)
    print(f"{name:<9}: RMSE = {rmse:.4f} | MAE = {mae:.4f}")


=== Đánh giá 4 phương pháp chọn đặc trưng ===


profiles:   0%|          | 0/162533 [00:00<?, ?user/s]

chi2     : RMSE = 1.2901 | MAE = 0.9820


profiles:   0%|          | 0/162533 [00:00<?, ?user/s]

entropy  : RMSE = 1.0747 | MAE = 0.8326


profiles:   0%|          | 0/162533 [00:00<?, ?user/s]

gini     : RMSE = 1.0690 | MAE = 0.8250


profiles:   0%|          | 0/162533 [00:00<?, ?user/s]

deviation: RMSE = 1.0797 | MAE = 0.8327


In [34]:
best_method = min(results, key=lambda k: results[k][0])
print(f"\nPhương pháp tốt nhất: {best_method}  (RMSE = {results[best_method][0]:.4f})")


Phương pháp tốt nhất: gini  (RMSE = 1.0690)


In [35]:
genome_csr = normalize(sparse.csr_matrix(feature_matrices[best_method]), axis=1, copy=False)

movieId2row = {mid: i for i, mid in enumerate(describe_matrix.index.values)}
row2movieId = describe_matrix.index.values

profiles = build_user_profiles(train_df) 

profiles:   0%|          | 0/162533 [00:00<?, ?user/s]

In [None]:
# val_rmse, val_mae = evaluate(val_df)
# print(f"Val   RMSE={val_rmse:.4f} | MAE={val_mae:.4f}")
# test_rmse, test_mae = evaluate(test_df)
# print(f"Test  RMSE={test_rmse:.4f} | MAE={test_mae:.4f}")

## Recommendation

In [36]:
def print_actual_pred(df_, n=SHOW_ROWS):
    print(f"Actual vs Predicted (first {n} rows of supplied set):")
    for _, row in df_.head(n).iterrows():
        pred = scale_to_rating(content_score(row.userId, row.movieId))
        print(f"uid={row.userId:6.0f} | mid={row.movieId:6.0f} | actual={row.rating:3.1f} | pred={pred:3.2f}")

print_actual_pred(test_df, SHOW_ROWS)

Actual vs Predicted (first 50 rows of supplied set):
uid= 32250 | mid= 27773 | actual=4.5 | pred=4.41
uid= 20576 | mid=  1356 | actual=4.0 | pred=3.35
uid= 11930 | mid=    22 | actual=4.0 | pred=4.00
uid=137863 | mid=   208 | actual=2.5 | pred=3.32
uid= 49403 | mid=  3511 | actual=1.0 | pred=3.70
uid= 80484 | mid=  2915 | actual=3.5 | pred=3.20
uid= 67024 | mid= 88744 | actual=4.0 | pred=3.35
uid=160203 | mid=   595 | actual=3.0 | pred=3.36
uid= 62149 | mid=  5816 | actual=3.5 | pred=3.07
uid=113743 | mid=  4105 | actual=4.0 | pred=3.47
uid= 92892 | mid= 81845 | actual=5.0 | pred=4.14
uid= 54644 | mid=   527 | actual=3.0 | pred=3.96
uid= 50956 | mid=   589 | actual=4.0 | pred=3.23
uid= 56924 | mid= 44195 | actual=4.0 | pred=3.86
uid=161714 | mid=  2745 | actual=3.5 | pred=2.79
uid=156015 | mid=  1293 | actual=3.0 | pred=3.19
uid= 38158 | mid=  2762 | actual=5.0 | pred=3.91
uid=120282 | mid= 64030 | actual=4.5 | pred=3.30
uid= 66720 | mid=   736 | actual=3.0 | pred=3.31
uid=155025 | mid

In [37]:
def recommend_content(uid, N=TOP_N):
    if uid not in profiles or not profiles[uid].nnz:
        return []
    seen = set(train_df.loc[train_df.userId == uid, "movieId"])
    sims = cosine_similarity(profiles[uid], genome_csr).ravel()
    for m in seen:
        if m in movieId2row:
            sims[movieId2row[m]] = -1.0
    idx = np.argpartition(sims, -N)[-N:]
    idx = idx[np.argsort(sims[idx])[::-1]]
    return [(int(row2movieId[i]), float(sims[i])) for i in idx]

def user_genre_preference(uid, top_k=10):
    fav = train_df[(train_df.userId == uid) & (train_df["rating_z"] >= Z_THRESHOLD)]
    counts = {}
    for mid in fav.movieId:
        for g in movies.loc[movies.movieId == mid, "genres"].iat[0].split("|"):
            counts[g] = counts.get(g, 0) + 1
    return dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:top_k])

def show_recs(uid=SAMPLE_USER, N=TOP_N):
    print(f"\nTop-{N} recommendations for user {uid}")
    for mid, sim in recommend_content(uid, N):
        mv = movies[movies.movieId == mid].iloc[0]
        print(f"mid={mid:6d} | sim={sim:.3f} | rating≈{scale_to_rating(sim):.2f} | {mv.title} | {mv.genres}")
    print("\nUser's favourite genres:")
    for g, c in user_genre_preference(uid).items():
        print(f"{g}: {c} movies")

In [38]:
if __name__ == "__main__":
    show_recs(SAMPLE_USER, TOP_N)


Top-10 recommendations for user 72313
mid=184257 | sim=0.890 | rating≈4.50 | Making a Murderer (2015) | Crime|Documentary
mid=192803 | sim=0.874 | rating≈4.43 | Bohemian Rhapsody (2018) | Drama
mid=193950 | sim=0.873 | rating≈4.43 | Free Solo (2018) | Documentary
mid=183897 | sim=0.873 | rating≈4.43 | Isle of Dogs (2018) | Animation|Comedy
mid=179135 | sim=0.871 | rating≈4.42 | Blue Planet II (2017) | Documentary
mid=201588 | sim=0.869 | rating≈4.41 | Toy Story 4 (2019) | Adventure|Animation|Children|Comedy
mid=205383 | sim=0.868 | rating≈4.41 | El Camino: A Breaking Bad Movie (2019) | Crime|Drama|Thriller
mid=196891 | sim=0.868 | rating≈4.40 | The Lego Movie 2: The Second Part (2019) | Action|Adventure|Animation|Children|Comedy
mid=204698 | sim=0.867 | rating≈4.40 | Joker (2019) | Crime|Drama|Thriller
mid=192385 | sim=0.861 | rating≈4.38 | A Star Is Born (2018) | Drama|Romance

User's favourite genres:
Action: 2 movies
Comedy: 2 movies
Crime: 2 movies
Drama: 2 movies
Documentary: 1 m