In [78]:
import pandas as pd

from surprise import Reader
from surprise import Dataset
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import accuracy
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [67]:
def data_preprocessing(data):
    """Reshape the raw review table into a (userId, movieId, rating) frame.

    Steps, in order:
      1. keep only the 'title', 'score' and 'NAME2' columns;
      2. truncate every 'NAME2' value to its first three characters;
      3. drop rows that repeat an earlier (title, truncated NAME2) pair,
         keeping the first occurrence;
      4. rename title/score/NAME2 to movieId/rating/userId and reorder the
         columns to userId, movieId, rating.

    Start/end banners, the resulting shape and a two-row sample are printed
    to stdout.

    Parameters
    ----------
    data : DataFrame or any input accepted by ``pd.DataFrame``
        Raw records containing 'title', 'score' and 'NAME2'.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ['userId', 'movieId', 'rating'].
    """
    print("======= data preprocessing start ========")

    # Data preprocessing
    df = pd.DataFrame(data, columns=['title', 'score', 'NAME2'])
    # Vectorised slice: a missing name stays missing instead of raising
    # TypeError the way a per-row ``x[:3]`` does on NaN.
    df['NAME2'] = df['NAME2'].str[:3]
    df = df.drop_duplicates(['title', 'NAME2'], keep='first')[['title', 'score', 'NAME2']]
    df.columns = ["movieId", "rating", "userId"]
    df = df[['userId', 'movieId', 'rating']]

    print("data shape : {}".format(df.shape))
    print("data sample")
    print(df.head(2))

    # The closing banner previously repeated "start".
    print("======= data preprocessing end ========")
    return df

In [68]:
# Read the raw CSV (CP949-encoded), then convert it into the
# userId / movieId / rating frame via data_preprocessing.
data = pd.read_csv(filepath_or_buffer='111.csv', encoding='cp949')
df = data_preprocessing(data=data)

data shape : (9201, 3)
data sample
  userId   movieId  rating
0    pig  미운 오리 새끼      10
1    edw       부산행       8


In [69]:
# Declare the rating scale as 1-10 and wrap the frame as a Surprise dataset.
# Note: `data` is rebound here from the raw CSV frame to the Surprise Dataset.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df.loc[:, ['userId', 'movieId', 'rating']], reader=reader)

In [70]:
# One instance of every candidate algorithm, each built with no arguments.
algorithms = [
    make_algorithm()
    for make_algorithm in (
        NormalPredictor,
        KNNBasic,
        KNNWithMeans,
        KNNWithZScore,
        KNNBaseline,
        SVD,
        SVDpp,
        BaselineOnly,
        NMF,
        SlopeOne,
        CoClustering,
    )
]

In [71]:
# Benchmark every candidate with 3-fold cross-validation and rank the
# algorithms by their mean test RMSE (lower is better).
bench = []

for algorithm in algorithms:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average each reported metric (test_rmse, fit_time, test_time) over the folds.
    summary = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    # Series.append was removed in pandas 2.0, so the record is built as a
    # plain dict; the class name is read directly instead of parsing repr().
    summary['Algorithm'] = type(algorithm).__name__
    bench.append(summary)

result_rmse = pd.DataFrame(bench).set_index('Algorithm').sort_values('test_rmse')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [72]:
# Display the benchmark table built in the previous cell (sorted ascending by test_rmse).
result_rmse

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,3.062224,0.012312,0.012956
SVD,3.086725,0.339757,0.01629
KNNBaseline,3.177243,0.237034,0.257312
SVDpp,3.212833,0.829115,0.034919
KNNBasic,3.276113,0.238375,0.239348
CoClustering,3.454347,0.423203,0.014294
KNNWithMeans,3.513572,0.257311,0.233709
KNNWithZScore,3.52008,0.31948,0.244698
SlopeOne,3.56425,0.050199,0.01895
NMF,3.806685,0.592748,0.014641


In [159]:
# Search space for finding the best SVD hyper-parameters.
# Defaults noted by the author, in the key order below: 20, 100, 0.005, 0.02
svd_param_grid = dict(
    n_epochs=[10, 20, 30],
    n_factors=[80, 100, 120],
    lr_all=[0.002, 0.005, 0.007],
    reg_all=[0.02, 0.05, 0.1, 0.15],
)

In [160]:
# Exhaustive search over svd_param_grid, scored by RMSE with 3-fold CV.
svd = GridSearchCV(algo_class=SVD, param_grid=svd_param_grid, measures=['rmse'], cv=3)
svd.fit(data)

In [161]:
# Best mean RMSE first, then the parameter combination that achieved it.
for svd_report in (svd.best_score, svd.best_params):
    print(svd_report['rmse'])

3.0691171868356366
{'n_epochs': 20, 'n_factors': 80, 'lr_all': 0.005, 'reg_all': 0.1}


In [179]:
# Five parameter combinations with the lowest mean test RMSE.
(
    pd.DataFrame(svd.cv_results)
    .loc[:, ['mean_test_rmse', 'params']]
    .sort_values(by=['mean_test_rmse'])
    .head(5)
)

Unnamed: 0,mean_test_rmse,params
42,3.069117,"{'n_epochs': 20, 'n_factors': 80, 'lr_all': 0...."
93,3.069263,"{'n_epochs': 30, 'n_factors': 100, 'lr_all': 0..."
53,3.070018,"{'n_epochs': 20, 'n_factors': 100, 'lr_all': 0..."
59,3.070345,"{'n_epochs': 20, 'n_factors': 100, 'lr_all': 0..."
105,3.070402,"{'n_epochs': 30, 'n_factors': 120, 'lr_all': 0..."


In [182]:
# Search space for finding the best BaselineOnly hyper-parameters (ALS solver).
# Defaults noted by the author, in the key order below: als, 10, 15, 10
BaselineOnly_param_grid = dict(
    bsl_options=dict(
        method=['als'],
        n_epochs=[5, 7, 10],
        reg_u=[15, 20, 30, 40],
        reg_i=[5, 7, 10],
    )
)

In [183]:
# Exhaustive search over BaselineOnly_param_grid, scored by RMSE with 3-fold CV.
baselineonly = GridSearchCV(
    algo_class=BaselineOnly,
    param_grid=BaselineOnly_param_grid,
    measures=['rmse'],
    cv=3,
)
baselineonly.fit(data)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [184]:
# Best mean RMSE first, then the parameter combination that achieved it.
for baseline_report in (baselineonly.best_score, baselineonly.best_params):
    print(baseline_report['rmse'])

3.041295344607057
{'bsl_options': {'method': 'als', 'n_epochs': 5, 'reg_u': 15, 'reg_i': 5}}


In [185]:
# Five parameter combinations with the lowest mean test RMSE.
(
    pd.DataFrame(baselineonly.cv_results)
    .loc[:, ['mean_test_rmse', 'params']]
    .sort_values(by=['mean_test_rmse'])
    .head(5)
)

Unnamed: 0,mean_test_rmse,params
0,3.041295,"{'bsl_options': {'method': 'als', 'n_epochs': ..."
12,3.041295,"{'bsl_options': {'method': 'als', 'n_epochs': ..."
24,3.041295,"{'bsl_options': {'method': 'als', 'n_epochs': ..."
3,3.041347,"{'bsl_options': {'method': 'als', 'n_epochs': ..."
15,3.041347,"{'bsl_options': {'method': 'als', 'n_epochs': ..."


In [186]:
# Search space for finding the best BaselineOnly hyper-parameters (SGD solver).
# Defaults noted by the author, in the key order below: sgd, 20, 0.02, 0.005
BaselineOnly_param_grid = dict(
    bsl_options=dict(
        method=['sgd'],
        n_epochs=[10, 20, 30],
        reg=[0.01, 0.02, 0.05],
        learning_rate=[0.005, 0.007, 0.01],
    )
)

In [187]:
# Exhaustive search over BaselineOnly_param_grid, scored by RMSE with 3-fold CV.
# Note: this rebinds `baselineonly`, replacing the earlier search object.
baselineonly = GridSearchCV(
    algo_class=BaselineOnly,
    param_grid=BaselineOnly_param_grid,
    measures=['rmse'],
    cv=3,
)
baselineonly.fit(data)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimati

In [188]:
# Best mean RMSE first, then the parameter combination that achieved it.
for baseline_report in (baselineonly.best_score, baselineonly.best_params):
    print(baseline_report['rmse'])

3.050553500702403
{'bsl_options': {'method': 'sgd', 'n_epochs': 20, 'reg': 0.05, 'learning_rate': 0.007}}


In [189]:
# Five parameter combinations with the lowest mean test RMSE.
(
    pd.DataFrame(baselineonly.cv_results)
    .loc[:, ['mean_test_rmse', 'params']]
    .sort_values(by=['mean_test_rmse'])
    .head(5)
)

Unnamed: 0,mean_test_rmse,params
16,3.050554,"{'bsl_options': {'method': 'sgd', 'n_epochs': ..."
13,3.050638,"{'bsl_options': {'method': 'sgd', 'n_epochs': ..."
10,3.050743,"{'bsl_options': {'method': 'sgd', 'n_epochs': ..."
24,3.051333,"{'bsl_options': {'method': 'sgd', 'n_epochs': ..."
21,3.051447,"{'bsl_options': {'method': 'sgd', 'n_epochs': ..."
