In [1]:
# Read the full ml-latest ratings CSV into a pandas DataFrame named `ratings`.
import pandas as pd

ratings = pd.read_csv('ml-latest/ml-latest/ratings.csv')

# Preview the first rows, then report the (rows, columns) shape on its own line.
print(ratings.head(), ratings.shape, sep='\n')

   userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1       1      110     4.0  1225865086
2       1      158     4.0  1225733503
3       1      260     4.5  1225735204
4       1      356     5.0  1225735119
(33832162, 4)


In [2]:
# We will be doing 10 fold cross validation as that seems to be a fair number
# for assessing the accuracy of our splitting methods
k = 10

# split the ratings into folds
from sklearn.model_selection import KFold
# shuffle + fixed random_state: rows are assigned to folds randomly, but the
# assignment is the same on every run.
kf = KFold(n_splits=k, shuffle=True, random_state=1)

# When this cell is re-run, release the previous folds *before* building the
# new ones so both copies are not held in memory at the same time.
# Only the "name not defined yet" case (first run) is expected here; a bare
# `except:` would also swallow KeyboardInterrupt and genuine errors.
try:
    del folds
except NameError:
    pass

# Each fold keeps only the held-out (test) rows of one split, stored as a
# plain NumPy array (`.values`), so column labels are dropped.
# NOTE(review): train_and_evaluate indexes its inputs by column name, so these
# arrays presumably get wrapped back into DataFrames before use - confirm.
folds = []
for train_index, test_index in kf.split(ratings):
    test_df = ratings.iloc[test_index].values
    folds.append(test_df)

# Sanity check: the folds should be (almost) equally sized.
for i, fold in enumerate(folds):
    print(f"Fold {i+1}")
    print("shape:", fold.shape)

Fold 1
shape: (3383217, 4)
Fold 2
shape: (3383217, 4)
Fold 3
shape: (3383216, 4)
Fold 4
shape: (3383216, 4)
Fold 5
shape: (3383216, 4)
Fold 6
shape: (3383216, 4)
Fold 7
shape: (3383216, 4)
Fold 8
shape: (3383216, 4)
Fold 9
shape: (3383216, 4)
Fold 10
shape: (3383216, 4)


In [3]:
# import the functions for different splitting methods that we have
from filter import one, two, three, four, five 

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 

In [4]:
# function to train and evaluate the KNN model, and return the RMSE and MAE value
from surprise import Dataset, Reader, KNNBasic

def train_and_evaluate(train, test, rating_scale=(0.5, 5), k=40):
    """Fit an item-based cosine KNN model on `train` and score it on `test`.

    Args:
        train: DataFrame with 'userId', 'movieId' and 'rating' columns; used
            to fit the model.
        test: DataFrame with the same three columns; used for evaluation.
        rating_scale: (lowest, highest) rating handed to Surprise's Reader.
            Surprise clips predictions to this range, so it must cover every
            rating in the data. The ratings here use half-star steps (e.g.
            4.5), and the MovieLens scale runs from 0.5 to 5.0, so the former
            hard-coded (1, 5) made ratings below 1 impossible to predict.
            Pass (1, 5) to reproduce results from before this fix.
        k: maximum number of neighbors to consider for aggregation when
            making predictions.

    Returns:
        Tuple (rmse, mae) computed over the predictions for `test`.
        The Surprise accuracy helpers are called with their default
        arguments, which also print each metric.
    """
    from surprise import accuracy

    columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=rating_scale)

    trainset = Dataset.load_from_df(train[columns], reader).build_full_trainset()

    # Round trip through a trainset to turn the test DataFrame into the list
    # of (user, item, rating) tuples that `algo.test` expects.
    test_data = Dataset.load_from_df(test[columns], reader)
    testset = test_data.build_full_trainset().build_testset()

    algo = KNNBasic(
        k=k,
        sim_options={
            'name': 'cosine',
            'user_based': False  # item-item similarities
        }
    )
    algo.fit(trainset)
    predictions = algo.test(testset)

    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae

In [5]:
def limit_user_rating(num_user=1000, num_movies=10000, seed=None, data=None):
    """Return the ratings restricted to a random subset of users and movies.

    A random set of movie ids and a random set of user ids are drawn, and only
    the rows whose movie AND user were both drawn are kept.

    Args:
        num_user: number of distinct users to sample. If more are requested
            than exist, all users are used.
        num_movies: number of distinct movies to sample. If more are requested
            than exist, all movies are used.
        seed: optional seed for a private random generator, making the sample
            reproducible. With None (default) the module-level `random`
            generator is used, exactly as before.
        data: optional DataFrame with 'userId' and 'movieId' columns to sample
            from. With None (default) the notebook-level `ratings` frame is
            used.

    Returns:
        The filtered DataFrame (same columns as the input).

    Raises:
        ValueError: if `num_user` or `num_movies` is negative.
    """
    import random

    source = ratings if data is None else data
    rng = random if seed is None else random.Random(seed)

    movie_ids = list(source['movieId'].unique())
    user_ids = list(source['userId'].unique())

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at what is available.
    # Movies are drawn before users to keep the original draw order.
    movies = rng.sample(movie_ids, min(num_movies, len(movie_ids)))
    users = rng.sample(user_ids, min(num_user, len(user_ids)))

    keep = source['movieId'].isin(movies) & source['userId'].isin(users)
    ratings_sample = source[keep]
    return ratings_sample