In [109]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [110]:
!pip install numpy
!pip install scikit-surprise
import os
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.accuracy import rmse
from collections import defaultdict
from surprise import KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import accuracy
from surprise import AlgoBase



In [111]:
# Reading the file ratings and storing it in a dataframe
ratings = pd.read_csv('/content/drive/MyDrive/data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [112]:
ratings_map = {}
ratings_map['itemID'] = list(ratings.movieId)
ratings_map['userID'] = list(ratings.userId)
ratings_map['rating'] = list(ratings.rating)

df = pd.DataFrame(ratings_map)
df.shape

(100836, 3)

In [113]:
# Create 5 folds
from surprise.model_selection import KFold
rr = Reader(rating_scale=(0.5, 5.0))
rating_map_keys = ['userID', 'itemID', 'rating']
filter = df[rating_map_keys]
data = Dataset.load_from_df(filter, rr)

kf = KFold(n_splits=5)
kf.split(data)

<generator object KFold.split at 0x7fcbbc1f40d0>

In [114]:
# class to predict ratings using a standard stochastic gradient descent algo
class SGDMatixSelf(AlgoBase):
    '''A basic rating prediction algorithm based on matrix factorization.'''
    
    def __init__(self, l_r, n_e, n_f):
        self.n_f = n_f
        self.n_e = n_e
        self.lr = l_r
        
    def fit(self, trainset):
        # print('Fit started')
        
        p = np.random.normal(0, .1, (trainset.n_users, self.n_f))
        q = np.random.normal(0, .1, (trainset.n_items, self.n_f))
        
        for z in range(self.n_e):
            for i, j, k in trainset.all_ratings():
                e = k - np.dot(p[i], q[j])
                p[i] = p[i] + q[j] * e * self.lr
                q[j] = q[j] + p[i] * e * self.lr
        
        self.p = p
        self.q = q
        self.trainset = trainset

    def estimate(self, i, j):
        if  self.trainset.knows_user(i) and self.trainset.knows_item(j):
            return np.dot(self.p[i], self.q[j])
        else:
            return self.trainset.global_mean

In [115]:
import numpy as np
#removing timestamp
rating_map_keys = ['userID', 'itemID', 'rating']
filter = df[rating_map_keys]
data = Dataset.load_from_df(filter, reader)

eval = []
for i in [SGDMatixSelf(.01, 10,10),SVD(), NMF(), KNNBasic()]:
    cva = cross_validate(i, data, measures=['RMSE'], cv=4, verbose=False)
    tempdataframe = pd.DataFrame.from_dict(cva).mean(axis=0)
    algoSplitArray = str(i).split(' ')
    finalAlgoSplitArray = algoSplitArray[0].split('.')
    tempdataframe.append(pd.Series([finalAlgoSplitArray[-1]],index=['Algorithm']))
    eval.append(tempdataframe)

                     
                                        

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [116]:
eval

[test_rmse    0.957558
 fit_time     8.521351
 test_time    0.217707
 dtype: float64, test_rmse    0.876448
 fit_time     4.953104
 test_time    0.198612
 dtype: float64, test_rmse    0.925663
 fit_time     6.600372
 test_time    0.169065
 dtype: float64, test_rmse    0.950206
 fit_time     0.169732
 test_time    2.131648
 dtype: float64]

In [117]:
# Grid search to find best hyperparameter, commented as takes longer time
# grid_options = {'n_factors': [10, 15, 20, 30], 'n_epochs': [10, 20, 30], 'lr_all': [0.002, 0.005, 0.008, 0.01],
#               'reg_all': [0.08, 0.1, 0.12]}
# search = GridSearchCV(SVD, grid_options, measures=['rmse'], cv=3)
# search.fit(data)
# algo = search.best_estimator['rmse']
# print(search.best_score['rmse'])
# print(search.best_params['rmse'])

In [118]:
trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.8642


0.8641962007659825

In [119]:
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)

{'fit_time': (2.8562979698181152,
  2.835466146469116,
  2.7853949069976807,
  2.864041566848755,
  2.911994695663452),
 'test_rmse': array([0.86153062, 0.8730939 , 0.85858519, 0.86952612, 0.86571288]),
 'test_time': (0.14764404296875,
  0.19179987907409668,
  0.16220808029174805,
  0.16651272773742676,
  0.15889978408813477)}

In [120]:
# algo1 = KNNBasic(n_factors=30, n_epochs=20)
# predictions2 = algo1.fit(trainset).test(testset)
# accuracy.rmse(predictions2)

In [121]:
# predictions.sort()
# predictions2.sort()
# rui = []
# p1 = []
# p2 = []
# for i in predictions:
#   rui.append(i[2])
#   p1.append(i[3])

# for j in predictions2:
#   p2.append(j[3])

# p1 = [x * 0.6 for x in p1]
# p2 = [x * 0.3 for x in p2]
# fin = [sum(x) for x in zip(p1, p2)]

  

In [122]:
# import numpy as np
# def rmse(predictions, targets):
#     return np.sqrt(((predictions - targets) ** 2).mean())
# rmse(np.array(fin),np.array(rui))

In [123]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):

    userid_to_estimate_map = defaultdict(list)
    for user_id, _, ratings_true, estimate_value, _ in predictions:
        userid_to_estimate_map[user_id].append((estimate_value, ratings_true))

    precisions = dict()
    recalls = dict()
    for user_id, user_ratings in userid_to_estimate_map.items():

        user_ratings.sort(key=lambda x: x[0], reverse=True)

        n_rel = sum((ratings_true >= threshold) for (_, ratings_true) in user_ratings)

        n_rec_k = sum((estimate_value >= threshold) for (estimate_value, _) in user_ratings[:k])

        n_rel_and_rec_k = sum(((ratings_true >= threshold) and (estimate_value >= threshold))
                              for (estimate_value, ratings_true) in user_ratings[:k])

        precisions[user_id] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        recalls[user_id] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [124]:
kf = KFold(n_splits=4)

algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)
i = 1
for trainset, testset in kf.split(data):
    print("Split:", i)
    predictions = algo.fit(trainset).test(testset)
    accuracy.rmse(predictions, verbose=True)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    print("Precision:", sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
    i+=1

Split: 1
RMSE: 0.8650
Precision: 0.8572677595628425
Recall: 0.23350174352693853
Split: 2
RMSE: 0.8741
Precision: 0.8479508196721325
Recall: 0.22023470048547578
Split: 3
RMSE: 0.8651
Precision: 0.8573497267759578
Recall: 0.22316820732092577
Split: 4
RMSE: 0.8635
Precision: 0.8810928961748645
Recall: 0.2092987029246781


In [125]:
def getpreds(predictions):
    
    fin = defaultdict(list)    
    for user_id, id, ratings_true, estimate_value, _ in predictions:
        fin[user_id].append((id, estimate_value))

    for user_id, user_ratings in fin.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)

    return fin


In [126]:
trainset = data.build_full_trainset()
algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)
algo.fit(trainset)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)
all_pred = getpreds(predictions)



#### Now as we have all the predicted rating, We'll subset to only top " " movies for every user

In [127]:
#setting recommendation size to 10
n = 10

for user_id, user_ratings in all_pred.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    all_pred[user_id] = user_ratings[:n]


In [128]:
tmp = pd.DataFrame.from_dict(all_pred)
tmp_transpose = tmp.transpose()


In [129]:
res = []
for user_id,user_ratings in all_pred.items():
  res.append(tmp_transpose.loc[user_id])

In [130]:
#movieids of reommended movies
recomml = []
for i in res:
  recommended_movie_ids=[]
  for x in range(0, n):
    recommended_movie_ids.append(i[x][0])
  recomml.append(recommended_movie_ids)

In [131]:
recomml[0]

[318, 48516, 720, 914, 898, 904, 750, 858, 3435, 3275]

In [132]:
finall[0]

['Shawshank Redemption, The (1994)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
 'Godfather, The (1972)',
 'Philadelphia Story, The (1940)',
 'Rear Window (1954)',
 'Ran (1985)',
 'Seventh Seal, The (Sjunde inseglet, Det) (1957)',
 "Guess Who's Coming to Dinner (1967)",
 'There Will Be Blood (2007)',
 'Three Billboards Outside Ebbing, Missouri (2017)']

In [133]:
movies = pd.read_csv('/content/drive/MyDrive/data/movies.csv')
finall = []
for i in recomml:
  df = movies[movies['movieId'].isin(i)]
  temp = df['title'].tolist()
  finall.append(temp)

In [108]:
fin = pd.DataFrame(finall)

In [None]:
#Saving recommendations to a file
fin.to_csv('/content/drive/MyDrive/data/file2.csv',index = False)

In [None]:
r = pd.read_csv('/content/drive/MyDrive/data/file2.csv')