In [2]:
import pandas as pd
from surprise import accuracy, Dataset
from surprise.model_selection import train_test_split

# data = Dataset.load_builtin("ml-100k")
# trainset, testset = train_test_split(data, test_size=0.3)


In [7]:
movies = pd.read_csv('./data/ml-latest-small/movies.csv')
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
movies_with_ratings = movies.merge(ratings, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import SVD, SlopeOne, KNNBasic, SVD, SVDpp, NormalPredictor, BaselineOnly, NMF, KNNWithMeans, KNNWithZScore
from surprise.model_selection import train_test_split, cross_validate

In [10]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})
dataset.head()

Unnamed: 0,uid,iid,rating
0,1,Toy Story (1995),4.0
1,5,Toy Story (1995),4.0
2,7,Toy Story (1995),4.5
3,15,Toy Story (1995),2.5
4,17,Toy Story (1995),4.5


In [11]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [12]:
trainset, testset = train_test_split(data, test_size=0.3, random_state=1)

In [None]:
param_grid = {
    'k': [2, 5, 20, 50, 100, 200],
    'sim_options__name': ['cosine', 'msd'], 
    'sim_options__user_based': [True, False]
}

grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5, n_jobs=-1, joblib_verbose=0);
grid_search.fit(data);

print( pd.DataFrame( grid_search.cv_results )[['params', 'mean_test_rmse']].sort_values(by=['mean_test_rmse'])
    .head(5) )
    
print( grid_search.best_score['rmse'] )


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [50]:
res = []

for alg in [KNNWithZScore, KNNWithMeans, KNNBasic]:
    param_grid = {
        'k': [2, 10, 20, 50, 100, 200],
        'sim_options__name': ['cosine', 'msd'],
        'sim_options__user_based': [True, False]
    }
    grid_search = GridSearchCV(alg, param_grid, measures=['rmse'], cv=5, n_jobs=-1, joblib_verbose=0);
    grid_search.fit(data);
    res.append([alg.__name__, grid_search.best_score['rmse'], grid_search.best_params])

for alg in [SVD, NMF]:
    param_grid = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 20, 50]
    }
    grid_search = GridSearchCV(alg, param_grid, measures=['rmse'], cv=5, n_jobs=-1, joblib_verbose=0);
    grid_search.fit(data);
    res.append([alg.__name__, grid_search.best_score['rmse'], grid_search.best_params])
        
print(res)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing th

In [51]:
pd.DataFrame(res)

Unnamed: 0,0,1,2
0,KNNWithZScore,0.897181,"{'rmse': {'k': 50, 'sim_options__name': 'cosin..."
1,KNNWithMeans,0.896621,"{'rmse': {'k': 50, 'sim_options__name': 'cosin..."
2,KNNBasic,0.941457,"{'rmse': {'k': 10, 'sim_options__name': 'cosin..."
3,SVD,0.871173,"{'rmse': {'n_factors': 50, 'n_epochs': 20}}"
4,NMF,0.970308,"{'rmse': {'n_factors': 50, 'n_epochs': 50}}"


SVD показал себя лучше всех, будем тюнить его! :)

In [52]:
param_grid = {
    'n_factors': [45, 50, 55],
    'n_epochs': [10, 20],
    'lr_all': [0.001, 0.005, 0.01, 0.02],
    'reg_all': [0.02, 0.1, 0.05, 0.01],
    'biased': [True, False]
}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1, joblib_verbose=0);
grid_search.fit(data);
print([alg.__name__, grid_search.best_score['rmse'], grid_search.best_params])

['NMF', 0.8552874669140769, {'rmse': {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.02, 'reg_all': 0.1, 'biased': True}}]


Ура! 0.85!