# Reference

https://github.com/EmirKorkutUnal/Python-Surprise-Predictions-on-Custom-Dataset

In [1]:
from surprise import SVD, Reader, Dataset, NormalPredictor, accuracy
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
import pandas as pd 
import my_db_setting 
# from surprise.model_selection import train_test_split

In [2]:
# Open a connection to the PostgreSQL database through the project helper
# and obtain a cursor for running the query.
conn = my_db_setting.my_db_setting()
cur = conn.cursor()

# Select (food, user, rating, meal_time) rows from pha_meal whose meal_time
# falls between one month before today's date and today's date.
# NOTE(review): an earlier note on this cell mentioned a 14-day window, but
# the SQL below uses a one-month interval — confirm which one is intended.
rating_query = """
                select temp.food_id_id, temp.user_id, temp.rating, temp.meal_time 
                from (
                select food_id_id, user_id, rating, meal_time, meals_id 
                from pha_meal 
                ORDER BY food_id_id, user_id, meal_time
                ) as temp 
                where temp.meal_time BETWEEN ( DATE(NOW()) - INTERVAL '1' month) AND DATE(NOW());
                """

# Run the query and pull every matching row into memory.
cur.execute(rating_query)
result = cur.fetchall()

In [5]:
from sklearn.model_selection import train_test_split 
# Splitting a DataFrame-backed dataset with Surprise's own train_test_split
# leaves it unusable for cross-validation, so sklearn's splitter is used here.

# Column names as returned by the query, and the names used from here on.
db_columns = ['food_id_id', 'user_id', 'rating', 'meal_time']
renamed_columns = {'food_id_id': 'item', 'user_id': 'user', 'meal_time': 'timestamp'}

# Build the ratings table from the fetched rows and apply the new names.
rating = pd.DataFrame(result, columns=db_columns).rename(columns=renamed_columns)

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
df_train, df_test = train_test_split(rating, test_size=0.25, random_state=7)

In [14]:
# Wrap the training rows in a Surprise dataset.
# The reader declares that ratings lie on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

# Only the user, item and rating columns are handed over, in that order.
train_ratings = df_train[['user', 'item', 'rating']]
cv_train = Dataset.load_from_df(train_ratings, reader)

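## Dataset

`rating` — shape (1811, 4), with columns:
    - food_id_id
    - user_id
    - rating
    - meal_time 
  - test set (5974, 3)
  - training set (17922, 3)

Note: the test and training set sizes listed above add up to 23,896 rows, which does not match the 1,811 rows listed for `rating`. Verify these figures against the current data.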

In [17]:
# NOTE: every line in this cell is commented out, so nothing here executes.
# It is kept as a sketch of two ways to evaluate a model with Surprise:
#   1. a single call to cross_validate, and
#   2. a manual 5-fold loop that fits SVD on each fold and collects the RMSE.
# Before re-enabling, note that the sketch refers to `data`, `algo` and
# `testset`, which are not defined in the cells above this one, and that the
# first call passes `df_train` (a pandas DataFrame) rather than a Surprise
# dataset such as `cv_train`.

#cross_validate(NormalPredictor(), df_train, cv=5)
#cross_validate(algo, testset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# algo = SVD()

# # define a cross-validation iterator
# kf = KFold(n_splits=5)

# result = []
# for trainset, testset in kf.split(data):

#     # train and test algorithm.
#     algo.fit(trainset)
#     predictions = algo.test(testset)

#     # Compute and print Root Mean Squared Error
#     result_i = accuracy.rmse(predictions, verbose=True)

#     result.append(result_i)

In [29]:
from surprise import SVD, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline, CoClustering, BaselineOnly, NormalPredictor
import datetime
import numpy as np

def _format_timespan(delta):
    """Render a ``datetime.timedelta`` as ``H:MM:SS.mmm``.

    The millisecond part is always present and is truncated, not rounded.
    Slicing ``str(delta)[:-3]`` instead would return ``'0:00'`` whenever the
    microsecond component happens to be exactly zero.
    """
    total_ms = delta // datetime.timedelta(milliseconds=1)
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f'{hours}:{minutes:02d}:{seconds:02d}.{millis:03d}'


# Algorithms to compare; each one is instantiated with its default settings.
classes = (SVD, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline, CoClustering, BaselineOnly, NormalPredictor)

data = cv_train
# Two folds with a fixed seed, so every algorithm is scored on the same splits.
kf = KFold(2, random_state=0)

# Several of these algorithms are stochastic. Seeding NumPy's global RNG makes
# the comparison repeatable (the approach recommended in the Surprise FAQ).
np.random.seed(0)

# Collect one record per model and build the DataFrame once at the end,
# rather than growing it row by row.
cv_records = []
for model in classes:
    start = datetime.datetime.now()
    out = cross_validate(model(), data, measures=['rmse', 'mae'], cv=kf)
    elapsed = datetime.datetime.now() - start

    cv_records.append({
        'Model': model.__name__,
        # Metrics are stored as numbers (rounded to 3 decimals), not formatted
        # strings, so CVResults can be sorted and compared numerically.
        'RMSE': round(float(np.mean(out['test_rmse'])), 3),
        'MAE': round(float(np.mean(out['test_mae'])), 3),
        'Timespan': _format_timespan(elapsed),
    })

CVResults = pd.DataFrame(cv_records, columns=['Model', 'RMSE', 'MAE', 'Timespan'])

print('All models have run. Call the CVResults dataframe for results.')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
All models have run. Call the CVResults dataframe for results.


In [30]:
# Rank the models from lowest to highest RMSE. The column is coerced to numbers
# for the comparison so the ordering is numeric rather than lexicographic,
# even when the metrics are stored as formatted strings.
CVResults.sort_values(by='RMSE', key=pd.to_numeric)

Unnamed: 0,Model,RMSE,MAE,Timespan
7,BaselineOnly,1.42,1.145,0:00:00.138
2,SlopeOne,1.426,1.147,0:00:00.648
3,KNNBasic,1.437,1.151,0:00:02.026
5,KNNBaseline,1.443,1.157,0:00:02.596
4,KNNWithMeans,1.453,1.166,0:00:02.205
6,CoClustering,1.461,1.177,0:00:00.464
0,SVD,1.47,1.18,0:00:00.398
1,NMF,1.521,1.224,0:00:00.414
8,NormalPredictor,2.032,1.666,0:00:00.183


In [None]:
from surprise import SVD
from surprise.model_selection import GridSearchCV


# Hyper-parameter grid for SVD: 3 x 4 x 3 x 2 = 72 combinations.
param_grid = {
    "n_factors": [5, 10, 20],
    "n_epochs": [5, 10, 20, 50],
    "lr_all": [0.001, 0.002, 0.005],
    "reg_all": [0.4, 0.6],
}

# Search the grid, scoring each combination with 5-fold cross-validation
# on the training dataset.
gs = GridSearchCV(SVD, param_grid, cv=5)
gs.fit(cv_train)

# Best RMSE found, and the parameter combination that produced it.
best_rmse = gs.best_score["rmse"]
best_rmse_params = gs.best_params["rmse"]

print(best_rmse)
print(best_rmse_params)