In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [2]:
RESOURCE_PATH = os.path.join('../','resources')
FILM_PATH = '../resources/cleaned_data/cleaned_data.csv'
RATINGS_PATH = os.path.join(RESOURCE_PATH, 'data', 'train_val_test')

rating_files = ['ratings_test.csv','ratings_train.csv','ratings_val.csv']
ratings_dataframes = []
for file in rating_files:
    file_path = os.path.join(RATINGS_PATH, file)
    try:
        df = pd.read_csv(file_path, delimiter=',', header=None, names=['user_id','film_id', 'rating'])
        ratings_dataframes.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")

ratings_data = pd.concat(ratings_dataframes, ignore_index=True)
data = pd.read_csv(FILM_PATH)


In [3]:
print(ratings_data.columns)
print(data.columns)

Index(['user_id', 'film_id', 'rating'], dtype='object')
Index(['fid', 'name', 'description', 'ratingCount', 'ratingValue',
       'contentRating', 'genre', 'keywords', 'duration', 'datePublished',
       'actor', 'director', 'image'],
      dtype='object')


In [4]:
columns_to_keep = ['fid', 'contentRating', 
                   'genre', 'keywords', 'duration', 'actor', 'director']

film_data = data[columns_to_keep]
print(film_data.info())
film_data.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9814 entries, 0 to 9813
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fid            9814 non-null   int64  
 1   contentRating  9814 non-null   object 
 2   genre          9814 non-null   object 
 3   keywords       9814 non-null   object 
 4   duration       9814 non-null   float64
 5   actor          9814 non-null   object 
 6   director       9814 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 536.8+ KB
None


Unnamed: 0,fid,contentRating,genre,keywords,duration,actor,director
0,468569,PG,"Action,Crime,Drama","psychopath,superhero,moral dilemma,clown,crimi...",9120.0,"Christian Bale,Heath Ledger,Aaron Eckhart",Christopher Nolan
1,1375666,PG,"Action,Adventure,Sci-Fi","dream,ambiguous ending,subconscious,mindbender...",8880.0,"Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot ...",Christopher Nolan


In [5]:
# Tính số lượng đánh giá của mỗi người dùng
user_rating_counts = ratings_data['user_id'].value_counts()

# Chia người dùng thành các nhóm theo số lượng phim đã đánh giá
users_1_50 = user_rating_counts[(user_rating_counts >= 1) & (user_rating_counts <= 50)].index.to_numpy()
users_51_100 = user_rating_counts[(user_rating_counts >= 51) & (user_rating_counts <= 100)].index.to_numpy()
users_101_200 = user_rating_counts[(user_rating_counts >= 101) & (user_rating_counts <= 200)].index.to_numpy()
users_201_500 = user_rating_counts[(user_rating_counts >= 201) & (user_rating_counts <= 500)].index.to_numpy()
users_501_1000 = user_rating_counts[(user_rating_counts >= 501) & (user_rating_counts <= 1000)].index.to_numpy()
users_1001_2000 = user_rating_counts[(user_rating_counts >= 1001) & (user_rating_counts <= 2000)].index.to_numpy()
users_2001_12000 = user_rating_counts[(user_rating_counts >= 2001) & (user_rating_counts <= 12000)].index.to_numpy()

# Train-test split cho từng nhóm
users_1_50_train, users_1_50_test = train_test_split(users_1_50, test_size=0.2, random_state=24)
users_51_100_train, users_51_100_test = train_test_split(users_51_100, test_size=0.2, random_state=24)
users_101_200_train, users_101_200_test = train_test_split(users_101_200, test_size=0.2, random_state=24)
users_201_500_train, users_201_500_test = train_test_split(users_201_500, test_size=0.2, random_state=24)
users_501_1000_train, users_501_1000_test = train_test_split(users_501_1000, test_size=0.2, random_state=24)
users_1001_2000_train, users_1001_2000_test = train_test_split(users_1001_2000, test_size=0.2, random_state=24)
users_2001_12000_train, users_2001_12000_test = train_test_split(users_2001_12000, test_size=0.2, random_state=24)


In [6]:
# tfidf_description = TfidfVectorizer(stop_words='english')
# tfidf_genre = TfidfVectorizer(stop_words='english')
# tfidf_keywords = TfidfVectorizer(stop_words='english')
# tfidf_actor = TfidfVectorizer(stop_words='english')
# tfidf_director = TfidfVectorizer(stop_words='english')

# # Vectorizer
# description_matrix = tfidf_description.fit_transform(film_data['description'])
# genre_matrix = tfidf_genre.fit_transform(film_data['genre'])
# keywords_matrix = tfidf_keywords.fit_transform(film_data['keywords'])
# actor_matrix = tfidf_actor.fit_transform(film_data['actor'])
# director_matrix = tfidf_director.fit_transform(film_data['director'])

# # One hot encoder
# onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
# contentRating_matrix = onehot_encoder.fit_transform(film_data[['contentRating']])

# scaler = StandardScaler()
# duration_matrix = scaler.fit_transform(film_data[['duration']])

# items_combined = hstack([
#     description_matrix, genre_matrix, keywords_matrix, 
#     actor_matrix, director_matrix, contentRating_matrix, duration_matrix
# ])





In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack

film_data['combined_text'] = film_data['genre']+ " " + film_data['director'] #" " + film_data['actor'] # #+ " " + film_data['keywords']# 
tfidf = TfidfVectorizer(stop_words='english')
combined_text_matrix = tfidf.fit_transform(film_data['combined_text'])

# # One hot encoder cho contentRating
# onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
# contentRating_matrix = onehot_encoder.fit_transform(film_data[['contentRating']])

# scaler = StandardScaler()
# duration_matrix = scaler.fit_transform(film_data[['duration']])

# items_combined = hstack([
#     combined_text_matrix, contentRating_matrix, duration_matrix
# ])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  film_data['combined_text'] = film_data['genre']+ " " + film_data['director'] #" " + film_data['actor'] # #+ " " + film_data['keywords']#


In [25]:
# Normalize all columns
scaler = StandardScaler(with_mean=False)
items_scaled = scaler.fit_transform(combined_text_matrix)
print(f"Items shape: {items_scaled.shape}")

Items shape: (9814, 4382)


In [26]:
items_sparse = csr_matrix(items_scaled)
print(items_sparse.shape)
svd = TruncatedSVD(n_components=8, random_state=42) 
items_reduced = svd.fit_transform(items_sparse)
print(f"Reduced items shape: {items_reduced.shape}")

(9814, 4382)
Reduced items shape: (9814, 8)


In [27]:
# Data scaled for Ridge since it has a built-in svd solver
items_combined_dense = items_sparse.toarray()
data_scaled = pd.DataFrame(items_combined_dense).assign(fid=film_data['fid'])
data_reduced = pd.DataFrame(items_reduced).assign(fid=film_data['fid'])


In [28]:
print(f"Reduced items shape: {data_reduced.shape}")

Reduced items shape: (9814, 9)


In [29]:
# Get feature_vector, real rating for each user
def get_items_rated_by_user(film_data, filt_ratings, user_id):
    movie_ids = filt_ratings[filt_ratings['user_id'] == user_id]['film_id'].values
    feature_vector = film_data[film_data['fid'].isin(movie_ids)].drop(columns='fid')
    scores = filt_ratings[filt_ratings['user_id'] == user_id]['rating'].values
    return feature_vector, scores

In [30]:
def predict_known_ratings_for_user(film_data, user_id, model):
    X, y = get_items_rated_by_user(film_data, ratings_data, user_id)
    model.fit(X, y)
    return np.clip(model.predict(X), 1, 10)
def get_not_seen_movies_from_user(film_data, user_id):
    user_ratings = ratings_data[ratings_data['user_id'] == user_id]
    seen_movies = film_data['fid'].isin(user_ratings['fid'])
    return film_data[~seen_movies].drop(columns='fid')
def predict_unknown_ratings_for_user(film_data, user_id, model):
    X, y = get_items_rated_by_user(film_data, ratings_data, user_id)
    X_test = get_not_seen_movies_from_user(user_id)
    model.fit(X, y)
    return np.clip(model.predict(X_test), 1, 10)

In [31]:
# demo of hyperparameters for tuning
ridge_param_grid = {
  'alpha': [40, 50, 60, 70, 80, 90],
}

knn_param_grid = {
  'n_neighbors': [30],
}


rf_param_grid = {
  'n_estimators': [300],
}
lgbm_param_grid = {
    'num_leaves': [2],
    'min_data_in_leaf': [10],
    'boosting_type': ['gbdt']
}

In [32]:
from tqdm import tqdm
def customized_grid_train(users, estimator, param_grid, param_grid_size, film_data):
    final_rmse = np.zeros(param_grid_size)
    final_mae = np.zeros(param_grid_size)
    grid_model = GridSearchCV(estimator=estimator, param_grid=param_grid,
                                scoring=('neg_root_mean_squared_error', 'neg_mean_absolute_error'), refit=False,
                                cv=KFold(5, shuffle=True), n_jobs=-1)
    
    for user in tqdm(users, desc="Processing users", unit="user"):
        X, y = get_items_rated_by_user(film_data, ratings_data, user)
        grid_model.fit(X, y)
        # mse = grid_model.cv_results_['mean_test_neg_mean_squared_error']
        # rmse = np.sqrt(-mse)
        rmse = grid_model.cv_results_['mean_test_neg_root_mean_squared_error']
        mae = grid_model.cv_results_['mean_test_neg_mean_absolute_error']
        final_rmse += rmse
        final_mae += mae
    best_params = grid_model.best_params_
    param_df = pd.DataFrame(grid_model.cv_results_['params'])
    param_df['rmse'] = final_rmse / len(users)
    param_df['mae'] = final_mae / len(users)
    print(f"Best parameters: {best_params}")
    
    return param_df, best_params

In [33]:
# demo of a parameter dataframe for tuning
param_df, best_params = customized_grid_train(users_1_50, Ridge(), ridge_param_grid, 1, data_scaled)
param_df.to_csv('../output/ridge_1_50_param.csv')

Processing users: 100%|██████████| 417/417 [01:20<00:00,  5.16user/s]


In [None]:
def train_model(model, users,film_data, best_params):
    final_rmse = 0
    final_mae = 0
    for user in tqdm(users, desc="Eval users", unit="user"):
        X, y = get_items_rated_by_user(film_data, ratings_data, user)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

In [34]:
# get mean rmse and mae of a model on a dataset
def evaluate_model(model, users, film_data):
    final_rmse = 0
    final_mae = 0
    for user in tqdm(users, desc="Eval users", unit="user"):
        X, y = get_items_rated_by_user(film_data, ratings_data, user)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        final_rmse += root_mean_squared_error(y_test, y_pred)
        final_mae += mean_absolute_error(y_test, y_pred)
        
    return final_rmse/len(users), final_mae/len(users)

In [35]:
# demo of evaluating model after finding optimal hyperparameter
train_rmse, train_mae = evaluate_model(RandomForestRegressor(max_depth=20, n_jobs=-1),users_101_200_train, data_reduced)
print("done training")
test_rmse, test_mae = evaluate_model(RandomForestRegressor(max_depth=20, n_jobs=-1) ,users_101_200_test, data_reduced)
print(f'Train MAE: {train_mae}')
print(f'Train RMSE: {train_rmse}')
print(f'Test MAE: {test_mae}')
print(f'Test RMSE: {test_rmse}')

Eval users: 100%|██████████| 1202/1202 [04:45<00:00,  4.22user/s]


done training


Eval users: 100%|██████████| 301/301 [01:14<00:00,  4.03user/s]

Train MAE: 1.6966043095476206
Train RMSE: 2.1229881434771296
Test MAE: 1.7095795556282152
Test RMSE: 2.1508818003297057



