# **Content-Based**

#### **Sử dụng 2 phương pháp:**

- **Cosine similarity:** Tính độ tương đồng giữa hồ sơ người dùng (dựa trên các phim họ thích) và đặc trưng của phim (genome tags và thể loại) để gợi ý phim tương tự.

- **Ridge Regression:** Học một mô hình tuyến tính cho mỗi người dùng để dự đoán điểm đánh giá dựa trên đặc trưng phim, từ đó gợi ý phim có điểm dự đoán cao.

### Import thư viện

In [1]:
import os
import math
import time
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from scipy import sparse

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize

### Save/Load 

In [2]:
# Directory holding the pickled caches written by the save_* helpers in this notebook.
PROFILE_CACHE_DIR = "./cache_profiles"
os.makedirs(PROFILE_CACHE_DIR, exist_ok=True)  # exist_ok: re-running the cell is harmless

def save_profiles(profiles_dict, method_name):
    """Pickle ``profiles_dict`` to ``profiles_<method_name>.pkl`` in PROFILE_CACHE_DIR.

    An existing file with the same name is overwritten.
    """
    target = os.path.join(PROFILE_CACHE_DIR, f"profiles_{method_name}.pkl")
    with open(target, "wb") as handle:
        pickle.dump(profiles_dict, handle)

def load_profiles(method_name):
    """Return the profiles pickled under ``method_name``, or None if no cache file exists.

    The file is read with ``pickle.load``, so it must come from a trusted source.
    """
    target = os.path.join(PROFILE_CACHE_DIR, f"profiles_{method_name}.pkl")
    if not os.path.exists(target):
        return None
    with open(target, "rb") as handle:
        return pickle.load(handle)

In [3]:
# Pickle file used by save_genome_csr / load_genome_csr; lives in the profile cache directory.
GENOME_CACHE_PATH = os.path.join(PROFILE_CACHE_DIR, "genome_csr_best.pkl") 

def save_genome_csr(csr):
    """Pickle ``csr`` to GENOME_CACHE_PATH, overwriting any previous file."""
    with open(GENOME_CACHE_PATH, "wb") as handle:
        pickle.dump(csr, handle)

def load_genome_csr():
    """Return the object pickled at GENOME_CACHE_PATH, or None if the file is absent.

    The file is read with ``pickle.load``, so it must come from a trusted source.
    """
    if not os.path.exists(GENOME_CACHE_PATH):
        return None
    with open(GENOME_CACHE_PATH, "rb") as handle:
        return pickle.load(handle)

### 2. Parameters

In [4]:
DATA_DIR = "../data"      # root folder for the input CSVs and the cached splits
Z_THRESHOLD = 0.0         # default minimum rating_z for a movie to enter a user's cosine profile
RATING_MIN = 0.5          # lower bound used when rescaling / clipping predictions
RATING_MAX = 5.0          # upper bound used when rescaling / clipping predictions
TEST_SIZE = 0.10          # fraction of all ratings held out for test
VAL_SIZE = 0.10           # fraction of the remaining (non-test) ratings held out for validation
SEED = 42                 # random_state for the splits and the decision trees
SAMPLE_USER = 72313       # not referenced in the visible cells — presumably a demo user; verify
TOP_N = 10                # not referenced in the visible cells — presumably recommendation list size; verify
SHOW_ROWS = 50            # not referenced in the visible cells — presumably a display limit; verify
N_FEATURES_FS = 300       # upper bound on the number of features kept by each selection method

### 3. Load & Prepare data

In [None]:
# Load the raw CSV tables. Later cells use `ratings`, `movies`, `genome_scores`
# and `genome_tags`, so this cell must run on a fresh kernel.
ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")

genome_scores = pd.read_csv(f"{DATA_DIR}/genome-scores.csv")
genome_tags = pd.read_csv(f"{DATA_DIR}/genome-tags.csv")

In [None]:
# Attach title/genres to every rating; the default inner join keeps only ratings whose movieId exists in `movies`.
df = pd.merge(ratings, movies, on='movieId')
print(f"Original dataset: {len(df):,} ratings | {df['userId'].nunique():,} users | {df['movieId'].nunique():,} movies")
display(df.head())

# The bare string below is an author note, not executed logic.
'''
total movies 
rated movie - 59k
in trong tập tag in movie
'''

Original dataset: 25,000,095 ratings | 162,541 users | 59,047 movies


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [None]:
# Encode raw userId values as integer indices. The split cell stratifies on
# `user_idx` and re-applies `user_encoder` to cached splits, so both must exist.
user_encoder = LabelEncoder()
user_encoder.fit(df['userId'])
df['user_idx'] = user_encoder.transform(df['userId'])

display(df.head())

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_idx
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,0
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,0
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War,0
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance,0


### 4. Split data

In [8]:
# Folder and file names used to cache the train/val/test splits as CSV.
split_dir = os.path.join(DATA_DIR, "splits")
os.makedirs(split_dir, exist_ok=True)

train_path = os.path.join(split_dir, "train.csv")
val_path = os.path.join(split_dir, "val.csv")
test_path = os.path.join(split_dir, "test.csv")

In [9]:
# Reuse cached splits only when all three CSVs exist; otherwise split `df` and cache the result.
if os.path.exists(train_path) and os.path.exists(val_path) and os.path.exists(test_path):
    print("Loading cached train/val/test splits...")
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)
    
    # Recompute user_idx from userId; `user_encoder` must already be fitted at this point.
    train_df['user_idx'] = user_encoder.transform(train_df['userId'])
    val_df['user_idx'] = user_encoder.transform(val_df['userId'])
    test_df['user_idx'] = user_encoder.transform(test_df['userId'])
else:
    print("Splitting train/val/test...")
    def train_val_test_split_func(data_frame, test_size=TEST_SIZE, val_size=VAL_SIZE, col="user_idx", seed=SEED):
        """Split ``data_frame`` into (train, val, test), stratifying both splits on ``col``.

        ``test_size`` is a fraction of the whole frame; ``val_size`` is a fraction
        of what remains after the test rows are removed. The returned frames have
        fresh 0..n-1 indices.
        """
        train_val, test = train_test_split(
            data_frame, 
            test_size=test_size, 
            random_state=seed, 
            stratify=data_frame[col]
        )
        train, val = train_test_split(
            train_val, 
            test_size=val_size, 
            random_state=seed, 
            stratify=train_val[col]
        )
        return train.reset_index(drop=True), val.reset_index(drop=True), test.reset_index(drop=True)

    train_df, val_df, test_df = train_val_test_split_func(df)
    
    # Cache the splits so later runs take the loading branch above.
    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    test_df.to_csv(test_path, index=False)

print("Split sizes:", {k: len(v) for k, v in zip(["train", "val", "test"], [train_df, val_df, test_df])})


Splitting train/val/test...
Split sizes: {'train': 20250076, 'val': 2250009, 'test': 2500010}


### 5. Per‑user z‑score (remove bias)

Mỗi người dùng có thang chấm điểm khác nhau. Ví dụ, một người có thể chấm 4/5 là "rất thích", trong khi người khác chấm 4/5 là "bình thường". Chuẩn hóa z-score giúp so sánh đánh giá giữa các người dùng một cách công bằng.

In [10]:
# Per-user rating mean (mu) and standard deviation (sigma), computed on the training split only.
train_user_stats = train_df.groupby("userId")["rating"].agg(["mean", "std"]).rename(columns={"mean": "mu", "std": "sigma"})
train_user_stats

Unnamed: 0_level_0,mu,sigma
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.833333,1.049376
2,3.674497,1.450683
3,3.703390,0.606621
4,3.334184,1.096290
5,3.756098,0.950126
...,...,...
162537,4.036585,0.948620
162538,3.332000,1.257147
162539,4.473684,0.761820
162540,3.866197,1.221702


**Z-score:** `z = (x - mu) / sigma`, trong đó `x` là đánh giá, `mu` là trung bình đánh giá của người dùng, `sigma` là độ lệch chuẩn.

- Z-score biểu thị đánh giá lệch bao nhiêu so với trung bình của người dùng, chuẩn hóa về đơn vị độ lệch chuẩn.

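- Nếu `sigma=0` (người dùng chỉ chấm một giá trị duy nhất), phép chia cho 0 sẽ tạo ra `inf`/`NaN` thay vì một z-score hợp lệ. Vì vậy `sigma=0` được thay bằng `1e-6`; trên tập train tử số của các người dùng này cũng bằng 0 nên z-score của họ bằng 0.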

In [11]:
def add_z_scores(df, user_stats, global_mean=None, global_std=None):
    """Return a copy of ``df`` with per-user ``mu``, ``sigma`` and ``rating_z`` columns.

    Args:
        df: ratings frame with ``userId`` and ``rating`` columns.
        user_stats: frame keyed by ``userId`` with ``mu`` and ``sigma`` columns.
        global_mean: fallback ``mu`` for users absent from ``user_stats``.
            Defaults to the mean rating of the module-level ``train_df``.
        global_std: fallback ``sigma`` for users with a missing ``sigma``.
            Defaults to the rating std of the module-level ``train_df``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    if global_mean is None:
        global_mean = train_df["rating"].mean()
    if global_std is None:
        global_std = train_df["rating"].std()

    # Drop columns left by a previous call; otherwise the merge would create
    # mu_x/mu_y suffixes and the lookups below would raise KeyError.
    stale = [c for c in ("mu", "sigma", "rating_z") if c in df.columns]
    df_with_stats = df.drop(columns=stale).merge(user_stats, on="userId", how="left")

    df_with_stats["mu"] = df_with_stats["mu"].fillna(global_mean)
    df_with_stats["sigma"] = df_with_stats["sigma"].fillna(global_std)

    # sigma == 0 (user gave a single distinct rating) would divide by zero.
    safe_sigma = df_with_stats["sigma"].replace(0, 1e-6)
    df_with_stats["rating_z"] = (df_with_stats["rating"] - df_with_stats["mu"]) / safe_sigma

    return df_with_stats

# All three splits are standardized with the statistics computed on the training split.
train_df = add_z_scores(train_df, train_user_stats)
val_df = add_z_scores(val_df, train_user_stats)
test_df = add_z_scores(test_df, train_user_stats)

### 6. Build genome CSR matrix

In [12]:
# Attach the tag text to every (movieId, tagId, relevance) row.
merged_genome = pd.merge(genome_scores, genome_tags, on='tagId', how='left')
merged_genome

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.02875,007
1,1,2,0.02375,007 (series)
2,1,3,0.06250,18th century
3,1,4,0.07575,1920s
4,1,5,0.14075,1930s
...,...,...,...,...
15584443,206499,1124,0.11000,writing
15584444,206499,1125,0.04850,wuxia
15584445,206499,1126,0.01325,wwii
15584446,206499,1127,0.14025,zombie


In [13]:
# Long -> wide: one row per movieId, one column per tag, values = relevance (missing pairs become 0).
genome_matrix = merged_genome.pivot(index='movieId', columns='tag', values='relevance').fillna(0)
genome_matrix

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.02875,0.02375,0.06250,0.07575,0.14075,0.14675,0.06350,0.20375,0.20200,0.03075,...,0.04050,0.01425,0.03050,0.03500,0.14125,0.05775,0.03900,0.02975,0.08475,0.02200
2,0.04125,0.04050,0.06275,0.08275,0.09100,0.06125,0.06925,0.09600,0.07650,0.05250,...,0.05250,0.01575,0.01250,0.02000,0.12225,0.03275,0.02100,0.01100,0.10525,0.01975
3,0.04675,0.05550,0.02925,0.08700,0.04750,0.04775,0.04600,0.14275,0.02850,0.03875,...,0.06275,0.01950,0.02225,0.02300,0.12200,0.03475,0.01700,0.01800,0.09100,0.01775
4,0.03425,0.03800,0.04050,0.03100,0.06500,0.03575,0.02900,0.08650,0.03200,0.03150,...,0.05325,0.02800,0.01675,0.03875,0.18200,0.07050,0.01625,0.01425,0.08850,0.01500
5,0.04300,0.05325,0.03800,0.04100,0.05400,0.06725,0.02775,0.07650,0.02150,0.02975,...,0.05350,0.02050,0.01425,0.02550,0.19225,0.02675,0.01625,0.01300,0.08700,0.01600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205072,0.02050,0.01775,0.11400,0.03650,0.31225,0.03675,0.10700,0.37925,0.01725,0.36350,...,0.18675,0.03650,0.03025,0.12900,0.13975,0.42425,0.03400,0.02350,0.41725,0.09100
205076,0.03825,0.03150,0.03200,0.05325,0.20850,0.07050,0.06625,0.27825,0.00950,0.02750,...,0.27775,0.03225,0.04675,0.03175,0.23025,0.06300,0.04175,0.04125,0.07275,0.02350
205383,0.04100,0.04025,0.02750,0.07850,0.19750,0.17825,0.17125,0.30475,0.16825,0.04850,...,0.16525,0.03250,0.02400,0.03575,0.20400,0.08525,0.04600,0.02900,0.11725,0.03925
205425,0.04525,0.04125,0.04250,0.07425,0.11550,0.10500,0.08275,0.13575,0.16125,0.05875,...,0.25075,0.04550,0.01425,0.03925,0.21700,0.06000,0.07250,0.01500,0.11050,0.02850


### Genres Vectorization (TF-IDF)

In [14]:
# token_pattern r'[^|]+' makes every pipe-separated genre a single token.
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
genres_tfidf = tfidf.fit_transform(movies['genres'])
# Dense frame indexed by movieId; columns are prefixed with 'genre:'.
genres_df = pd.DataFrame(
    genres_tfidf.toarray(), 
    columns=[f'genre:{g}' for g in tfidf.get_feature_names_out()],
    index=movies['movieId']
)

In [15]:
# Sorted union of movieIds that have genome scores and/or genre features.
all_movies = pd.Index(sorted(set(genome_matrix.index).union(genres_df.index)))
all_movies

Index([     1,      2,      3,      4,      5,      6,      7,      8,      9,
           10,
       ...
       209145, 209147, 209151, 209153, 209155, 209157, 209159, 209163, 209169,
       209171],
      dtype='int64', length=62423)

In [16]:
# Movies without genome scores: append all-zero rows
missing_genome = all_movies.difference(genome_matrix.index)
if len(missing_genome):
    zero_genome = pd.DataFrame(0, index=missing_genome, columns=genome_matrix.columns)
    genome_matrix = pd.concat([genome_matrix, zero_genome])

# Movies without genre features: append all-zero rows
missing_genre = all_movies.difference(genres_df.index)
if len(missing_genre):
    zero_genre = pd.DataFrame(0, index=missing_genre, columns=genres_df.columns)
    genres_df = pd.concat([genres_df, zero_genre])

In [17]:
# Put both matrices in the same row order (that of all_movies)
genome_matrix = genome_matrix.loc[all_movies]
genres_df = genres_df.loc[all_movies]

In [18]:
# Concatenate the two matrices column-wise -> describe_matrix
describe_matrix = pd.concat([genome_matrix, genres_df], axis=1).fillna(0)
print("Describe matrix:", describe_matrix.shape)    

Describe matrix: (62423, 1148)


In [19]:
# Display the combined genome + genre feature matrix.
describe_matrix

Unnamed: 0,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,genre:film-noir,genre:horror,genre:imax,genre:musical,genre:mystery,genre:romance,genre:sci-fi,genre:thriller,genre:war,genre:western
1,0.02875,0.02375,0.06250,0.07575,0.14075,0.14675,0.06350,0.20375,0.2020,0.03075,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.04125,0.04050,0.06275,0.08275,0.09100,0.06125,0.06925,0.09600,0.0765,0.05250,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.04675,0.05550,0.02925,0.08700,0.04750,0.04775,0.04600,0.14275,0.0285,0.03875,...,0.0,0.0,0.0,0.0,0.0,0.801149,0.0,0.0,0.0,0.0
4,0.03425,0.03800,0.04050,0.03100,0.06500,0.03575,0.02900,0.08650,0.0320,0.03150,...,0.0,0.0,0.0,0.0,0.0,0.719344,0.0,0.0,0.0,0.0
5,0.04300,0.05325,0.03800,0.04100,0.05400,0.06725,0.02775,0.07650,0.0215,0.02975,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209157,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
209159,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
209163,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
209169,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [28]:
# Display the movieId index of the combined feature matrix.
describe_matrix.index

Index([     1,      2,      3,      4,      5,      6,      7,      8,      9,
           10,
       ...
       209145, 209147, 209151, 209153, 209155, 209157, 209159, 209163, 209169,
       209171],
      dtype='int64', length=62423)

In [20]:
# with_mean=False: columns are scaled but not centered, so zero entries stay zero.
scaler = StandardScaler(with_mean=False)
describe_matrix_scaled = scaler.fit_transform(describe_matrix) 
print(describe_matrix_scaled)

[[0.7411119  0.68198834 1.0122178  ... 0.         0.         0.        ]
 [1.06333447 1.16296959 1.01626667 ... 0.         0.         0.        ]
 [1.2051124  1.59369907 0.47371793 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


### Feature Selection

In [21]:
# Mean training rating per movie; used as the target for supervised feature selection.
movie_avg_train = train_df.groupby("movieId")["rating"].mean()

def rating_to_class(rating):
    """Bucket a rating into 0 (low, <= 2.5), 1 (medium, <= 4.0) or 2 (high)."""
    # Upper bounds are checked in ascending order; the position is the class label.
    for label, upper_bound in enumerate((2.5, 4.0)):
        if rating <= upper_bound:
            return label
    return 2

# Movies rated in the training split that also have a row in describe_matrix.
train_movies_fs = [m for m in movie_avg_train.index if m in describe_matrix.index]

In [22]:
# Rows of the scaled matrix for the training movies, and their rating class (0/1/2).
X_fs = describe_matrix_scaled[[describe_matrix.index.get_loc(m) for m in train_movies_fs]]
y_fs = movie_avg_train.loc[train_movies_fs].apply(rating_to_class).values
k = min(N_FEATURES_FS, X_fs.shape[1])

# Build four reduced feature matrices, one per selection criterion
feature_matrices = {}

# 1. Chi-squared
sel_chi = SelectKBest(chi2, k=k).fit(X_fs, y_fs)
feature_matrices["chi2"] = sel_chi.transform(describe_matrix_scaled)

# The remaining methods work on a dense array.
if sparse.issparse(X_fs):        
    X_fs_dense = X_fs.toarray()
else:
    X_fs_dense = X_fs

# 2. Information Gain (Entropy): keep the k columns with the highest tree importance
tree_e = DecisionTreeClassifier(criterion="entropy", random_state=SEED).fit(X_fs_dense, y_fs)
idx_e = np.argsort(tree_e.feature_importances_)[-k:]
feature_matrices["entropy"] = describe_matrix_scaled[:, idx_e]

# 3. Gini Index: same procedure with the gini criterion
tree_g = DecisionTreeClassifier(criterion="gini", random_state=SEED).fit(X_fs_dense, y_fs)
idx_g = np.argsort(tree_g.feature_importances_)[-k:]
feature_matrices["gini"] = describe_matrix_scaled[:, idx_g]

# 4. Normalized Deviation: keep the k columns with the largest standard deviation
# NOTE(review): the columns were already variance-scaled by StandardScaler, so this ranking may carry little signal — confirm.
std_col = np.std(X_fs_dense, axis=0)
idx_std = np.argsort(std_col)[-k:]
feature_matrices["deviation"] = describe_matrix_scaled[:, idx_std]

# **Cosine Similarity**

**Làm thế nào để biểu diễn sở thích của người dùng?**

- Hồ sơ người dùng là một vector tổng hợp các đặc trưng đã chọn (genome tags và thể loại) của những phim họ thích (dựa trên `rating_z ≥ Z_THRESHOLD`). Vector này được tính bằng trung bình có trọng số của các vector phim, với trọng số là `rating_z`.

In [23]:
# movieId -> row position in describe_matrix, and the inverse lookup array (row position -> movieId).
movieId2row = {mid: i for i, mid in enumerate(describe_matrix.index.values)}
row2movieId = describe_matrix.index.values

In [24]:
def build_user_profiles_sim(ratings_df, rating_col="rating_z", threshold=Z_THRESHOLD, eps=1e-8):
    """Build one sparse profile vector per user from the movies they rated highly.

    A user's profile is the ``rating_col``-weighted average of the feature rows
    (taken from the module-level ``genome_csr``) of their movies with
    ``rating_col >= threshold``. If the weights sum to less than ``eps`` the
    plain mean of the rows is used instead.

    Args:
        ratings_df: frame with ``userId``, ``movieId`` and ``rating_col`` columns.
        rating_col: column used both for filtering and as the weight.
        threshold: minimum ``rating_col`` value for a movie to be included.
        eps: weight sums below this fall back to an unweighted mean.

    Returns:
        dict mapping userId to a 1 x n_features ``scipy.sparse.csr_matrix``.
        Users with no qualifying movie that has a feature row are omitted.
    """
    profiles = {}
    good = ratings_df.loc[ratings_df[rating_col] >= threshold, ["userId", "movieId", rating_col]]

    # Drop movies without a feature row *before* grouping, so the weights and
    # the selected rows always have the same length. Previously a single
    # unknown movie caused the whole user to be skipped silently.
    good = good[good["movieId"].isin(list(movieId2row))]

    for uid, grp in tqdm(good.groupby("userId"), desc="profiles", unit="user"):
        rows = [movieId2row[m] for m in grp["movieId"].values]
        w = grp[rating_col].values[:, None]  # column vector, broadcast across features
        w_sum = w.sum()
        feats = genome_csr[rows]

        if w_sum < eps:
            prof_dense = feats.mean(axis=0)
        else:
            prof_dense = feats.multiply(w).sum(axis=0) / w_sum

        prof_dense = np.nan_to_num(np.asarray(prof_dense).ravel())
        profiles[uid] = sparse.csr_matrix(prof_dense)

    return profiles

def eval_cosine(df_eval, profiles):
    """Return (RMSE, MAE) of cosine-based rating predictions on ``df_eval``.

    Rows whose movieId has no entry in ``movieId2row`` are ignored. For a user
    with a profile, the cosine similarity between the profile and the movie's
    row of the module-level ``genome_csr`` is mapped linearly onto
    [RATING_MIN, RATING_MAX]; users without a profile are predicted RATING_MIN.
    """
    scored = df_eval[df_eval.movieId.isin(movieId2row)]
    rating_span = RATING_MAX - RATING_MIN

    truths, preds = [], []
    triples = zip(scored.userId.values, scored.movieId.values, scored.rating.values)
    for user, movie, rating in tqdm(triples, total=len(scored), desc="eval_cosine", unit="rec"):
        truths.append(rating)
        if user not in profiles:
            preds.append(RATING_MIN)
            continue
        similarity = cosine_similarity(profiles[user], genome_csr[movieId2row[movie]])[0, 0]
        preds.append(similarity * rating_span + RATING_MIN)

    rmse = math.sqrt(mean_squared_error(truths, preds))
    mae = mean_absolute_error(truths, preds)
    return rmse, mae

def test_cosine_method(mat, method_name):
    """Evaluate the cosine recommender on feature matrix ``mat``.

    Rebinds the module-level ``genome_csr`` (row-normalized ``mat``) and
    ``profiles``. Profiles are loaded from the cache keyed by ``method_name``
    when available, otherwise built from ``train_df`` and cached.

    Returns:
        (val RMSE, val MAE, test RMSE, test MAE).
    """
    global genome_csr, profiles
    row_normalized = normalize(sparse.csr_matrix(mat), axis=1, copy=False)
    genome_csr = row_normalized

    profiles = load_profiles(method_name)
    if profiles is None:
        profiles = build_user_profiles_sim(train_df)
        save_profiles(profiles, method_name)

    val_scores = eval_cosine(val_df, profiles)
    test_scores = eval_cosine(test_df, profiles)
    return val_scores[0], val_scores[1], test_scores[0], test_scores[1]

# **Ridge Regression**

In [None]:
def get_items_rated_by_user(ratings_df, user_id):
    """Return (movieIds, ratings) as two arrays for the rows of ``user_id``.

    Both arrays are in the row order of ``ratings_df`` and are empty when the
    user has no rows.
    """
    owned = ratings_df.loc[ratings_df['userId'] == user_id, ['movieId', 'rating']]
    return owned['movieId'].values, owned['rating'].values

def build_user_profiles_ridge(ratings_df, descriptions_df, min_ratings_for_grid=20, verbose=True):
    """Fit one Ridge regressor per user mapping movie features to that user's ratings.

    Args:
        ratings_df: frame with ``userId``, ``movieId`` and ``rating`` columns.
        descriptions_df: feature frame indexed by movieId.
        min_ratings_for_grid: users with at least this many usable ratings get
            ``alpha`` tuned by 5-fold grid search; others use ``alpha=1.0``.
        verbose: print progress and timing.

    Returns:
        dict mapping userId to a fitted Ridge estimator. Users with no rated
        movie present in ``descriptions_df`` are omitted.
    """
    start = time.time()
    profiles = {}

    if verbose:
        print(f"Training {len(ratings_df['userId'].unique())} users...")

    param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

    # Filter movies and ratings together so X and y stay row-aligned. The old
    # code took the first len(valid_movies) ratings, pairing features with the
    # wrong targets whenever a movie was filtered out.
    known = ratings_df[ratings_df['movieId'].isin(descriptions_df.index)]

    # One grouped pass instead of a full-table scan per user.
    per_user = known.groupby('userId', sort=False)
    iterator = tqdm(per_user, total=per_user.ngroups) if verbose else per_user

    for uid, grp in iterator:
        X = descriptions_df.loc[grp['movieId'].values].values
        y = grp['rating'].values

        model = None
        if len(y) >= min_ratings_for_grid:
            try:
                grid = GridSearchCV(
                    Ridge(),
                    param_grid,
                    scoring='neg_mean_squared_error',
                    cv=min(5, len(y)),
                    n_jobs=-1
                )
                grid.fit(X, y)
                model = grid.best_estimator_
            except Exception:
                # Grid search failed for this user; use the fixed-alpha model.
                # `Exception` (not a bare except) lets Ctrl-C stop the loop.
                model = None

        if model is None:
            model = Ridge(alpha=1.0)
            model.fit(X, y)
        profiles[uid] = model

    if verbose:
        print(f'Training time: {(time.time() - start):.4f}s')
    return profiles

def predict_rating_ridge(user_id, movie_ids, profiles, descriptions_df, normalize_rating=True):
    """Predict ``user_id``'s ratings for the movies that have features.

    Args:
        user_id: key into ``profiles``.
        movie_ids: iterable of candidate movieIds.
        profiles: dict mapping userId to a fitted estimator with ``predict``.
        descriptions_df: feature frame indexed by movieId.
        normalize_rating: clip predictions to [RATING_MIN, RATING_MAX].

    Returns:
        ``(predictions, valid_movie_ids)``: the predictions and the movieIds
        they belong to, in input order. Both are empty when the user has no
        profile or none of the movies has features. A tuple is returned in
        every case (early exits used to return a bare array).
    """
    if user_id not in profiles:
        return np.array([]), []

    valid_movie_ids = [m for m in movie_ids if m in descriptions_df.index]
    if not valid_movie_ids:
        return np.array([]), []

    X = descriptions_df.loc[valid_movie_ids].values
    pred_rating = profiles[user_id].predict(X)

    if normalize_rating:
        pred_rating = np.clip(pred_rating, RATING_MIN, RATING_MAX)

    return pred_rating, valid_movie_ids



def eval_ridge(ratings_df, profiles, descriptions, user_id=None):
    """Return (RMSE, MAE) of the per-user Ridge models on ``ratings_df``.

    Args:
        ratings_df: frame with ``userId``, ``movieId`` and ``rating`` columns.
        profiles: dict mapping userId to a fitted estimator.
        descriptions: feature frame indexed by movieId.
        user_id: evaluate only this user when given, otherwise all users.

    Raises:
        ValueError: if no rating could be scored (no profiles or no features).

    Users without a profile and movies without features are skipped; true
    ratings are matched to predictions through the movieIds that
    ``predict_rating_ridge`` reports as scored.
    """
    if user_id is None:
        eval_df = ratings_df
    else:
        eval_df = ratings_df[ratings_df['userId'] == user_id]

    # One grouped pass instead of a full-table scan per user.
    groups = eval_df.groupby('userId', sort=False)
    iterator = tqdm(groups, total=groups.ngroups) if user_id is None else groups

    y_true, y_pred = [], []
    for uid, grp in iterator:
        movie_ids = grp['movieId'].values
        result = predict_rating_ridge(uid, movie_ids, profiles, descriptions)

        # Accept both return shapes: (predictions, scored_ids) or a bare array.
        if isinstance(result, tuple):
            preds, scored_ids = result
        else:
            preds, scored_ids = result, movie_ids
        if len(preds) == 0:
            continue

        scored_mask = np.isin(movie_ids, scored_ids)
        y_true.extend(grp['rating'].values[scored_mask])
        y_pred.extend(preds)

    if not y_true:
        raise ValueError("eval_ridge: no ratings could be scored")

    return math.sqrt(mean_squared_error(y_true, y_pred)), mean_absolute_error(y_true, y_pred)


In [27]:
# Evaluate both recommenders on every feature-selection variant.
results = {}
for name, mat in feature_matrices.items():
    print(f"Đánh giá feature matrix: {name}")

    # Row-normalize once; both models below use the same features.
    # build_user_profiles_sim / eval_cosine read this module-level name.
    genome_csr = normalize(sparse.csr_matrix(mat), axis=1, copy=False)

    # Cosine
    profiles_cosine = build_user_profiles_sim(train_df)
    rmse_val_cosine, mae_val_cosine = eval_cosine(val_df, profiles_cosine)
    rmse_test_cosine, mae_test_cosine = eval_cosine(test_df, profiles_cosine)

    # Ridge
    # Densify before building the frame: passing a scipy sparse matrix straight
    # to pd.DataFrame does not produce a numeric movie x feature table.
    desc_df = pd.DataFrame(
        data=genome_csr.toarray(),
        index=describe_matrix.index
    )

    profiles_ridge = build_user_profiles_ridge(train_df, desc_df)
    rmse_val_ridge, mae_val_ridge = eval_ridge(val_df, profiles_ridge, desc_df)
    rmse_test_ridge, mae_test_ridge = eval_ridge(test_df, profiles_ridge, desc_df)

    # Tuples are (val RMSE, val MAE, test RMSE, test MAE).
    results[name] = {
        'cosine': (rmse_val_cosine, mae_val_cosine, rmse_test_cosine, mae_test_cosine),
        'ridge' : (rmse_val_ridge, mae_val_ridge, rmse_test_ridge, mae_test_ridge)
    }

    print(f"Cosine - Val RMSE {rmse_val_cosine:.4f} | Test RMSE {rmse_test_cosine:.4f}")
    print(f"Ridge - Val RMSE {rmse_val_ridge:.4f} | Test RMSE {rmse_test_ridge:.4f}")

Đánh giá feature matrix: chi2


profiles:   0%|          | 0/162541 [00:00<?, ?user/s]

eval_cosine:   0%|          | 0/2250009 [00:00<?, ?rec/s]

eval_cosine:   0%|          | 0/2500010 [00:00<?, ?rec/s]

Bắt đầu huấn luyện 162541 người dùng...


  0%|          | 0/162541 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Pick, per model, the feature matrix with the lowest validation RMSE (element 0 of each result tuple).
best_method_cosine = min(results, key=lambda n: results[n]['cosine'][0])   
best_method_ridge  = min(results, key=lambda n: results[n]['ridge'][0])    

print(f"Best Cosine feature matrix: {best_method_cosine}")
print(f"Best Ridge  feature matrix: {best_method_ridge}")

In [None]:
# Reuse the winning Ridge feature matrix, converting to CSR only if it is not already sparse.
# NOTE(review): the comparison loop fed Ridge row-normalized features; this matrix is not passed through normalize() — confirm that is intended.
mat_sparse_best = (feature_matrices[best_method_ridge]
                   if sparse.issparse(feature_matrices[best_method_ridge])
                   else sparse.csr_matrix(feature_matrices[best_method_ridge]))  

# Sparse-backed DataFrame indexed by movieId.
desc_df_sparse_best = pd.DataFrame.sparse.from_spmatrix(
    mat_sparse_best, index=describe_matrix.index
)

print(desc_df_sparse_best)

In [None]:
# Row-normalize the winning cosine feature matrix and cache it on disk.
genome_csr_cosine = normalize(sparse.csr_matrix(feature_matrices[best_method_cosine]), axis=1, copy=False)
print(genome_csr_cosine)
save_genome_csr(genome_csr_cosine)     

In [None]:
# build_user_profiles_sim and eval_cosine read the module-level `genome_csr`.
# After the comparison loop it still holds the matrix of the last method tried,
# so rebind it to the best cosine matrix before building or evaluating profiles.
genome_csr = normalize(sparse.csr_matrix(feature_matrices[best_method_cosine]), axis=1, copy=False)

# Load cached profiles for the best method, or build and cache them.
profiles_cosine_best = load_profiles(f"{best_method_cosine}_cosine")
if profiles_cosine_best is None:
    profiles_cosine_best = build_user_profiles_sim(train_df)
    save_profiles(profiles_cosine_best, f"{best_method_cosine}_cosine")

rmse_val_c_best, mae_val_c_best = eval_cosine(val_df, profiles_cosine_best)
rmse_test_c_best, mae_test_c_best = eval_cosine(test_df, profiles_cosine_best)
print(f"Cosine ({best_method_cosine})  Val RMSE={rmse_val_c_best:.4f} | Test RMSE={rmse_test_c_best:.4f}")

In [None]:
# Load cached Ridge profiles for the best method, or train and cache them.
profiles_ridge_best = load_profiles(f"{best_method_ridge}_ridge")
if profiles_ridge_best is None:
    profiles_ridge_best = build_user_profiles_ridge(train_df, desc_df_sparse_best)
    save_profiles(profiles_ridge_best, f"{best_method_ridge}_ridge")

# Report validation and test error of the final Ridge models.
rmse_val_r_best, mae_val_r_best = eval_ridge(val_df, profiles_ridge_best, desc_df_sparse_best)
rmse_test_r_best, mae_test_r_best = eval_ridge(test_df, profiles_ridge_best, desc_df_sparse_best)
print(f"Ridge  ({best_method_ridge}) Val RMSE={rmse_val_r_best:.4f} | Test RMSE={rmse_test_r_best:.4f}")