# PROJECT 2 : RECOMMENDER SYSTEM (COLLABORATIVE FILTERING - SURPRISE)

## Import libraries

In [None]:
# Install the `surprise` package, which provides the algorithms imported further down.
!pip install surprise



In [None]:
# Mount Google Drive under /content/gdrive so the dataset path read below is reachable.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
# Pin numpy to 1.23.5.
# NOTE(review): presumably required for binary compatibility with surprise; because this
# runs after surprise was installed, the runtime may need a restart before the imports
# below pick up the pinned version — confirm.
!pip install numpy==1.23.5



In [None]:
from surprise import *
import pandas as pd
import numpy as np
from surprise.model_selection.validation import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns
from surprise.model_selection import GridSearchCV, train_test_split

## EDA

In [None]:
# Read the raw, tab-separated rating export from the mounted Drive folder and preview it.
RATINGS_CSV = "/content/gdrive/MyDrive/DL07_K302_NguyenNhatToTran_NguyenVuMaiPhuong/PROJECT 2/cung_cap_HV/Products_ThoiTrangNam_rating_raw.csv"
data = pd.read_csv(RATINGS_CSV, sep='\t')
data.head()

Unnamed: 0,product_id,user_id,user,rating
0,190,1,karmakyun2nd,5
1,190,2,tranquangvinh_vv,5
2,190,3,nguyenquoctoan2005,5
3,190,4,nguyenthuyhavi,5
4,190,5,luonganh5595,5


In [None]:
data.shape

(1024482, 4)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024482 entries, 0 to 1024481
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   product_id  1024482 non-null  int64 
 1   user_id     1024482 non-null  int64 
 2   user        1024482 non-null  object
 3   rating      1024482 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 31.3+ MB


In [None]:
data.isnull().sum()

Unnamed: 0,0
product_id,0
user_id,0
user,0
rating,0


In [None]:
data.duplicated().any()

True

In [None]:
# Remove fully duplicated rows; rebinding keeps the cell free of in-place mutation.
data = data.drop_duplicates()

In [None]:
data.shape

(999815, 4)

In [None]:
# Headline counts: number of rating rows, distinct products and distinct users.
n_ratings = data.shape[0]
n_products = data['product_id'].nunique(dropna=False)
n_users = data['user_id'].nunique(dropna=False)
print(f'Có {n_ratings} đánh giá được thực hiện, với {n_products} sản phẩm và {n_users} người dùng')

Có 999815 đánh giá được thực hiện, với 31267 sản phẩm và 650636 người dùng


## BUILD MODELS

In [None]:
# Wrap the (user, item, rating) columns in a Surprise Dataset.
# The 1-5 scale is Surprise's documented default for Reader, spelled out here for readers.
reader = Reader(rating_scale=(1, 5))
rating_columns = ['user_id', 'product_id', 'rating']
df = Dataset.load_from_df(data[rating_columns], reader)

In [None]:
# Disabled benchmark sweep over every candidate algorithm, kept for reference only:
# the cells below run the same 5-fold cross-validation one algorithm at a time.
# NOTE(review): the KNN variants are evaluated later on a filtered subset instead of `df`;
# presumably the full dataset is too large for them — confirm before re-enabling.
# algorithms = [KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore(), SVD(), NMF(), CoClustering(), BaselineOnly()]

# results_list = []

# for algo in algorithms:
#     results = cross_validate(algo, df, measures=['RMSE', 'MAE'], cv=5, verbose=False)
#     results_list.append({'Algorithm': algo.__class__.__name__,
#                          'RMSE testset': results['test_rmse'].mean(),
#                          'MAE testset': results['test_mae'].mean(),
#                          'Fit time': np.mean(results['fit_time']),
#                          'Test time': np.mean(results['test_time'])})

# The sort key must match the dict key used above ('RMSE testset'); 'RMSE' alone is not a column.
# results_df = pd.DataFrame(results_list).sort_values(by='RMSE testset', ascending=True)

# print(results_df)


In [None]:
# SVD with default hyper-parameters: 5-fold cross-validation on the full rating set.
results_svd = cross_validate(
    algo=SVD(),
    data=df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_svd)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8884  0.8871  0.8909  0.8874  0.8862  0.8880  0.0016  
MAE (testset)     0.5678  0.5670  0.5697  0.5678  0.5664  0.5677  0.0011  
Fit time          30.84   30.46   28.59   29.08   29.18   29.63   0.87    
Test time         2.30    2.74    2.26    2.00    1.39    2.14    0.44    
{'test_rmse': array([0.88843462, 0.88710454, 0.89086134, 0.8873821 , 0.88616494]), 'test_mae': array([0.56776623, 0.56704453, 0.56970018, 0.56776617, 0.56635403]), 'fit_time': (30.844258069992065, 30.460536003112793, 28.587213277816772, 29.078890562057495, 29.181021213531494), 'test_time': (2.303952932357788, 2.7411274909973145, 2.2573015689849854, 1.9976961612701416, 1.3873417377471924)}


In [None]:
# BaselineOnly with default options: 5-fold cross-validation on the full rating set.
results_blo = cross_validate(
    algo=BaselineOnly(),
    data=df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_blo)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8766  0.8809  0.8833  0.8837  0.8809  0.8811  0.0025  
MAE (testset)     0.5754  0.5771  0.5789  0.5794  0.5776  0.5777  0.0014  
Fit time          9.88    10.02   10.48   12.68   10.30   10.67   1.03    
Test time         1.71    1.00    1.58    1.65    0.99    1.39    0.32    
{'test_rmse': array([0.87658833, 0.88094714, 0.88328873, 0.88371379, 0.88092665]), 'test_mae': array([0.57537335, 0.57707979, 0.5789398 , 0.57936981, 0.57761198]), 'fit_time': (9.882397413253784, 10.01610255241394, 10.484216690063477, 12.684213876724243, 10.303423881530762), 'test_time': (1.71378755569458, 0.9990239143371582, 1.5849668979644775, 1.6496505737304688, 0.99310302734375)}


In [None]:
# NMF with default hyper-parameters: 5-fold cross-validation on the full rating set.
results_nmf = cross_validate(
    algo=NMF(),
    data=df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_nmf)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0423  1.0396  1.0388  1.0395  1.0374  1.0395  0.0016  
MAE (testset)     0.7687  0.7662  0.7677  0.7678  0.7661  0.7673  0.0010  
Fit time          86.49   82.96   84.54   84.14   84.63   84.55   1.14    
Test time         2.28    2.67    2.18    1.48    2.18    2.16    0.39    
{'test_rmse': array([1.04227852, 1.03957726, 1.03880899, 1.03953397, 1.03737326]), 'test_mae': array([0.76872945, 0.76621118, 0.76769024, 0.76783656, 0.76607786]), 'fit_time': (86.48768067359924, 82.95695614814758, 84.53690123558044, 84.13898086547852, 84.63054466247559), 'test_time': (2.2837116718292236, 2.670491933822632, 2.182537317276001, 1.4758739471435547, 2.1796412467956543)}


In [None]:
# CoClustering with default hyper-parameters: 5-fold cross-validation on the full rating set.
results_coclus = cross_validate(
    algo=CoClustering(),
    data=df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_coclus)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9925  0.9824  1.0212  0.9883  0.9825  0.9934  0.0144  
MAE (testset)     0.6597  0.6553  0.6642  0.6616  0.6529  0.6587  0.0041  
Fit time          85.77   90.22   86.28   86.13   87.05   87.09   1.62    
Test time         2.52    1.77    1.11    1.07    1.11    1.51    0.57    
{'test_rmse': array([0.99246611, 0.98239174, 1.02123832, 0.98829898, 0.98247952]), 'test_mae': array([0.6597335 , 0.65529869, 0.66419717, 0.66157629, 0.65292015]), 'fit_time': (85.76608729362488, 90.21934604644775, 86.27616024017334, 86.12587261199951, 87.04739284515381), 'test_time': (2.5182299613952637, 1.7668602466583252, 1.1124916076660156, 1.0668468475341797, 1.1086502075195312)}


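- BaselineOnly cho RMSE trung bình thấp nhất (0.8811, so với 0.8880 của SVD, 0.9934 của CoClustering và 1.0395 của NMF) nên là thuật toán tốt nhất theo RMSE; lưu ý rằng SVD có MAE thấp hơn (0.5677 so với 0.5777).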

### Lọc dữ liệu cho các thuật toán KNN

In [None]:
# Products that received more than 5 ratings
products_voted = data['product_id'].value_counts()
filtered_products = products_voted.loc[products_voted.gt(5)].index

# Users who submitted more than 5 ratings
users_voted = data['user_id'].value_counts()
filtered_users = users_voted.loc[users_voted.gt(5)].index

# Keep only the rows whose product AND user both pass the activity threshold
keep_product = data['product_id'].isin(filtered_products)
keep_user = data['user_id'].isin(filtered_users)
filtered_df = data[keep_product & keep_user]

filtered_df.head()

Unnamed: 0,product_id,user_id,user,rating
7,190,8,x*****5,5
27,190,28,mxduy23,5
30,190,30,z*****n,5
31,190,31,q*****2,5
32,190,32,r*****0,5


In [None]:
filtered_df.shape

(207464, 4)

In [None]:
# Build the Surprise Dataset used by the KNN algorithms.
# The rating rows are re-derived from `data` with the same product/user filters instead of
# being read from `filtered_df`: this cell rebinds `filtered_df` to a Surprise Dataset, so
# reading the old DataFrame from that same name made the cell fail when run a second time.
reader = Reader()
knn_ratings = data.loc[
    data['product_id'].isin(filtered_products) & data['user_id'].isin(filtered_users),
    ['user_id', 'product_id', 'rating'],
]
# The name `filtered_df` is kept because the KNN cross-validation cells below read it.
filtered_df = Dataset.load_from_df(knn_ratings, reader)

In [None]:
# KNNBasic with default settings: 5-fold cross-validation on the filtered rating set.
results_knnbs = cross_validate(
    algo=KNNBasic(),
    data=filtered_df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_knnbs)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0100  1.0174  1.0158  1.0209  1.0195  1.0167  0.0038  
MAE (testset)     0.6166  0.6173  0.6177  0.6226  0.6224  0.6193  0.0026  
Fit time          0.75    0.76    0.86    1.01    0.76    0.83    0.10    
Test time         1.66    1.45    2.17    1.64    1.68    1.72    0.24    
{'test_rmse': array([1.01004045, 1.01735882, 1.01578507, 1.02087993, 1.01950603]), 'test_mae': array([0.61655673, 0.61725772, 0.61767064, 0.62260128, 0.62242745]), 'fit_time': (0.7535066604614258, 0.75

In [None]:
# KNNBaseline with default settings: 5-fold cross-validation on the filtered rating set.
results_knnbl = cross_validate(
    algo=KNNBaseline(),
    data=filtered_df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_knnbl)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0122  1.0098  1.0105  1.0088  1.0157  1.0114  0.0024  
MAE (testset)     0.6261  0.6251  0.6221  0.6227  0.6289  0.6250  0.0025  
Fit time          2.37    1.98    2.26    2.36    2.05    2.20    0.16    
Test time         2.47    2.01    2.07    2.91    1.80    2.25    0.39    
{'test_rmse': array([1.01220665, 1.00976935

In [None]:
# KNNWithMeans with default settings: 5-fold cross-validation on the filtered rating set.
results_knnmean = cross_validate(
    algo=KNNWithMeans(),
    data=filtered_df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_knnmean)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0030  1.0121  1.0128  1.0082  1.0157  1.0104  0.0044  
MAE (testset)     0.6245  0.6305  0.6285  0.6275  0.6332  0.6288  0.0029  
Fit time          0.83    0.90    1.15    1.05    1.37    1.06    0.19    
Test time         1.53    2.15    2.15    1.81    1.63    1.85    0.26    
{'test_rmse': array([1.00296305, 1.01208772, 1.01284936, 1.00819986, 1.0157146 ]), 'test_mae': array([0.62445252, 0.63053275, 0.62847704, 0.62753534, 0.63315535]), 'fit_time': (0.8315427303314209, 

In [None]:
# KNNWithZScore with default settings: 5-fold cross-validation on the filtered rating set.
results_knnzscore = cross_validate(
    algo=KNNWithZScore(),
    data=filtered_df,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print(results_knnzscore)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0103  1.0172  1.0104  1.0149  1.0125  1.0131  0.0026  
MAE (testset)     0.6225  0.6263  0.6225  0.6273  0.6276  0.6252  0.0023  
Fit time          1.39    1.40    1.15    1.33    1.58    1.37    0.14    
Test time         3.04    1.89    1.71    2.08    2.13    2.17    0.46    
{'test_rmse': array([1.0102929 , 1.01716292, 1.01044339, 1.01492426, 1.01253797]), 'test_mae': array([0.62248734, 0.6263233 , 0.62245989, 0.62725668, 0.6276201 ]), 'fit_time': (1.3896558284759521,

### Tuning parameters

#### BaselineOnly

In [None]:
# Grid search over the ALS regularisation strengths of BaselineOnly (4 x 4 combinations),
# scored by RMSE and MAE with 5-fold cross-validation on the full rating set.
reg_candidates = [5, 10, 15, 20]
grid_params = {
    'bsl_options': {
        'method': ['als'],
        'reg_u': list(reg_candidates),
        'reg_i': list(reg_candidates),
    }
}

model_grid = GridSearchCV(
    algo_class=BaselineOnly,
    param_grid=grid_params,
    measures=['rmse', 'mae'],
    cv=5,
)
model_grid.fit(df)

In [None]:
# Report the best mean scores found by the grid search and the parameters that minimise RMSE.
best_scores = model_grid.best_score
rmse_best_params = model_grid.best_params['rmse']
print('RMSE score:', best_scores['rmse'])
print('MAE score:', best_scores['mae'])
print('Parameters:', rmse_best_params)

RMSE score: 0.8751469829756996
MAE score: 0.5598821876670089
Parameters: {'bsl_options': {'method': 'als', 'reg_u': 5, 'reg_i': 5}}


#### SVD

In [None]:
# Grid search over SVD epochs and the global regularisation term (3 x 2 combinations),
# scored by RMSE and MAE with 5-fold cross-validation on the full rating set.
# Note: this rebinds `grid_params` and `model_grid`, replacing the BaselineOnly search objects.
grid_params = {
    'n_epochs': [20, 25, 30],
    'reg_all': [0.02, 0.1],
}

model_grid = GridSearchCV(
    algo_class=SVD,
    param_grid=grid_params,
    measures=['rmse', 'mae'],
    cv=5,
)
model_grid.fit(df)

In [None]:
# Report the best mean scores found by the SVD grid search and the parameters that minimise RMSE.
best_scores = model_grid.best_score
rmse_best_params = model_grid.best_params['rmse']
print('RMSE score:', best_scores['rmse'])
print('MAE score:', best_scores['mae'])
print('Parameters:', rmse_best_params)

- Kết quả thấp hơn BaselineOnly 1 ít

- BaselineOnly là thuật toán cho ra kết quả tốt nhất

### Fit model

In [None]:
# Train the final BaselineOnly model on every available rating, using the
# regularisation values reported as best (by RMSE) by the BaselineOnly grid search.
best_bsl_options = {'method': 'als', 'reg_u': 5, 'reg_i': 5}
trainset = df.build_full_trainset()
algo = BaselineOnly(bsl_options=best_bsl_options)
algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7c23955e4c90>

### Recommend for specific user

In [None]:
# Ratings of 4 or more given by the example user, indexed by product.
user_id = 132
liked_by_user = (data['user_id'] == user_id) & (data['rating'] >= 4)
df_select = data.loc[liked_by_user].set_index('product_id')
df_select

Unnamed: 0_level_0,user_id,user,rating
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
195,132,n*****0,5
1920,132,n*****0,5
1921,132,n*****0,5
1934,132,n*****0,5
1945,132,n*****0,5
...,...,...,...
171011,132,n*****0,5
171045,132,n*****0,5
171049,132,n*****0,5
171056,132,n*****0,5


In [None]:
df_select.shape

(1124, 3)

In [None]:
# Candidate products to score for the selected user.
# .copy() detaches the frame from `data`, so adding a column to it later does not
# raise pandas' SettingWithCopyWarning.
df_score = data[["product_id"]].copy()
df_score

Unnamed: 0,product_id
0,190
1,190
2,190
3,190
4,190
...,...
1024477,171107
1024478,171107
1024479,171107
1024480,171107


In [None]:
# Predict the selected user's rating for every candidate product and rank the products.
# - Deduplicate first: the estimate depends only on (user_id, product_id), so scoring each
#   product once gives the same ranking as scoring every rating row, at a fraction of the cost.
# - `assign` returns a new frame, which avoids the SettingWithCopyWarning raised when a column
#   is set on a slice of `data`.
# NOTE(review): products the user has already rated are not excluded here — confirm whether
# they should be removed before presenting recommendations.
candidate_products = df_score.drop_duplicates(subset='product_id')
df_score = candidate_products.assign(
    EstimateScore=candidate_products['product_id'].map(
        lambda product_id: algo.predict(user_id, product_id).est
    )
).sort_values(by='EstimateScore', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algo.predict(user_id, x).est)


In [None]:
# Collapse repeated (product_id, EstimateScore) rows so each product appears once,
# then preview the top of the ranking.
df_score = df_score.drop_duplicates()
df_score.head()

Unnamed: 0,product_id,EstimateScore
325060,231234,5.0
514372,251028,5.0
917549,171663,5.0
514550,251034,5.0
1011247,17893,5.0


In [None]:
# Products whose estimated rating for the selected user is at least 4.
df_score.loc[df_score['EstimateScore'] >= 4]

Unnamed: 0,product_id,EstimateScore
325060,231234,5.000000
514372,251028,5.000000
917549,171663,5.000000
514550,251034,5.000000
1011247,17893,5.000000
...,...,...
580231,1142,4.002452
623020,11230,4.001698
784940,1451,4.000913
492389,24419,4.000345
