In [1]:
import json
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from functools import reduce

from surprise import Reader, Dataset, KNNBasic, SVD, NMF, accuracy
from surprise.model_selection import GridSearchCV, cross_validate

import warnings

import YelpCleanData as cd # User Defined

# Ignore every warning for the rest of the session. This keeps cell output
# quiet, but it also hides deprecation notices and other diagnostics.
warnings.simplefilter('ignore')
# Hex colour string. Not referenced by any cell visible here --
# presumably intended for plots (TODO confirm).
color = '#539ecd'

In [11]:
# def split_most_recent(df, col='user_id'):
#     user_date = df.loc[:, [col,'date']]
#     user_date['date'] = pd.to_datetime(user_date['date'])
    
#     recent_idx = user_date[user_date.groupby([col], sort=False)['date'].transform(max) == user_date['date']].index
#     recent_df = df.iloc[recent_idx].reset_index(drop=True)
#     previous_df = df[~df.index.isin(recent_idx)].reset_index(drop=True)
#     return previous_df, recent_df

In [21]:
def generate_surprise_train_test(trainDf, testDf, scale=(1.0, 5.0), cols=['user_id','business_id','stars']):
    """Turn a train and a test DataFrame into Surprise train/test objects.

    Args:
        trainDf: DataFrame with the training ratings; must contain ``cols``.
        testDf: DataFrame with the held-out ratings; must contain ``cols``.
        scale: Rating scale handed to ``Reader(rating_scale=...)``.
        cols: Columns selected from each frame before it is passed to
            ``Dataset.load_from_df``.

    Returns:
        A ``(train, test)`` pair: ``train`` is the full trainset built from
        ``trainDf``; ``test`` is the result of ``build_testset()`` called on
        the full trainset built from ``testDf``.
    """
    rating_reader = Reader(rating_scale=scale)

    def _as_full_trainset(frame):
        # Wrap the selected columns in a Surprise Dataset and keep every row.
        return Dataset.load_from_df(frame[cols], reader=rating_reader).build_full_trainset()

    train = _as_full_trainset(trainDf)
    # The test rows are first loaded as a trainset, then converted to the
    # testset form that ``algo.test`` consumes.
    test = _as_full_trainset(testDf).build_testset()
    return train, test

In [4]:
# Load the city-level review table through the project helper module
# (cd = YelpCleanData).
df_city_review = cd.load_data('yelp_dataset/city_review.csv')
# Pick user ids and business ids using a threshold of 20.
# NOTE(review): presumably 20 is a minimum number of reviews per id --
# confirm against YelpCleanData.select_index_with_thres.
thres_user_id = cd.select_index_with_thres(df_city_review, 'user_id', 20)
thres_buss_id = cd.select_index_with_thres(df_city_review, 'business_id', 20)
# Filter the reviews by the selected user ids and business ids.
# NOTE(review): presumably only rows matching both selections are kept -- confirm.
non_sparse_df = cd.filter_index(df_city_review, [['user_id',thres_user_id], ['business_id',thres_buss_id]])
# Pass the four listed columns to the helper that prepares the modelling frame.
# NOTE(review): presumably it keeps only those columns and resets the index -- confirm.
user_business_df = cd.reset_index_cols(non_sparse_df, ['user_id','business_id','stars','date'])

(4712, 4409)

In [12]:
# Split the ratings frame into a training frame and a test frame using the
# YelpCleanData helper.
# NOTE(review): going by the helper's name, the test frame presumably holds
# each user's most recent review(s) and the training frame the rest -- confirm.
train_df, test_df = cd.split_most_recent(user_business_df)

In [22]:
# Convert both frames into Surprise objects, relying on the function's
# defaults: rating scale (1.0, 5.0) and columns user_id/business_id/stars.
train, test = generate_surprise_train_test(train_df, test_df)

In [173]:
# param_grid = {'n_epochs' : [10, 50, 100], 'n_factors' : [10, 50, 100], 
#               'lr_all' : [0.003, 0.01, 0.03], 'reg_all' : [0.01, 0.04, 0.1]}

# grid_cv = GridSearchCV(algo_class=SVD, measures=['RMSE'], param_grid=param_grid)
# grid_cv.fit(train_data)


# # Best rmse score and params
# print('Best Score :', grid_cv.best_score['rmse'])
# print('Best Parameters :', grid_cv.best_params['rmse'])

Best Score : 0.9819518064920245
Best Parameters : {'n_epochs': 50, 'n_factors': 10, 'lr_all': 0.003, 'reg_all': 0.1}


In [34]:
# Train Model
# Fix the seed: SVD initialises its factors randomly, so without it the RMSE
# reported below changes on every Restart & Run All.
# NOTE(review): default hyper-parameters are used here, not the
# "Best Parameters" recorded in the grid-search output of this notebook --
# confirm that is intended.
algo = SVD(random_state=42)
algo.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f00308359f0>

In [35]:
# Test & Evaluate
# Predict a rating for every entry in the held-out test set.
preds = algo.test(test)
# Report the root-mean-square error of those predictions; the call prints
# the score and, as the cell's last expression, also displays its value.
accuracy.rmse(preds)

RMSE: 1.1338


1.133766876071366