In [53]:
import pandas as pd
import numpy as np
import implicit
import nmslib
from scipy.sparse import csr_matrix

from surprise import NMF, KNNBasic, KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import Dataset
from surprise import Reader






In [2]:
# dataFolder = 'ml-latest-small'
dataFolder = 'ml-latest-small'
links = pd.read_csv(dataFolder + '/links.csv')
movies = pd.read_csv(dataFolder + '/movies.csv')
ratings = pd.read_csv(dataFolder + '/ratings.csv')
tags = pd.read_csv(dataFolder + '/tags.csv')

In [25]:
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movieIds = movies['movieId'].unique()
numMovies = len(movieIds)
print('We have {} movies'.format(numMovies))

We have 9742 movies


In [5]:
userIds = ratings['userId'].unique()
numUsers = len(userIds)
print('We have {} users'.format(numUsers))

We have 610 users


In [6]:
movieUserMatrix = np.empty((numMovies, numUsers), dtype=int)

userIdToUserIndex = {}
for index, userId in enumerate(userIds):
    userIdToUserIndex[userId] = index

movieIdToMovieIndex = {}
for index, movieId in enumerate(movieIds):
    movieIdToMovieIndex[movieId] = index

for row in ratings.iterrows():
    userId = row[1]['userId']
    movieId = row[1]['movieId']
    rating = row[1]['rating']

    userIndex = int(userIdToUserIndex[userId])
    movieIndex = int(movieIdToMovieIndex[movieId])

    movieUserMatrix[movieIndex][userIndex] = int(rating)

In [7]:
movieUserMatrixSparse = csr_matrix(movieUserMatrix)
movieUserMatrixSparse

<9742x610 sparse matrix of type '<class 'numpy.int64'>'
	with 99466 stored elements in Compressed Sparse Row format>

In [33]:
#Matrix Factorization


# initialize a model
modelMatrixFactorization = implicit.als.AlternatingLeastSquares(factors=50)

# train the model on a sparse matrix of item/user/confidence weights
modelMatrixFactorization.fit(movieUserMatrixSparse)

# recommend items for a user
user_items = movieUserMatrixSparse.T.tocsr()
userId = 1
recommendations = modelMatrixFactorization.recommend(userId, user_items)

# find related items
movieId = 1
related = modelMatrixFactorization.similar_items(movieId)


  0%|          | 0/15 [00:00<?, ?it/s]

 13%|█▎        | 2.0/15 [00:00<00:00, 16.40it/s]

 27%|██▋       | 4.0/15 [00:00<00:00, 16.94it/s]

 37%|███▋      | 5.5/15 [00:00<00:00, 16.21it/s]

 47%|████▋     | 7.0/15 [00:00<00:00, 15.58it/s]

 60%|██████    | 9.0/15 [00:00<00:00, 15.74it/s]

 73%|███████▎  | 11.0/15 [00:00<00:00, 16.56it/s]

 87%|████████▋ | 13.0/15 [00:00<00:00, 16.99it/s]

100%|██████████| 15.0/15 [00:00<00:00, 16.89it/s]




In [34]:
# Approximate Nearest Neighbors


# initialize a model
modelNearestNeighbor = implicit.approximate_als.NMSLibAlternatingLeastSquares()

# train the model on a sparse matrix of item/user/confidence weights
modelNearestNeighbor.fit(movieUserMatrixSparse)

# recommend items for a user
user_items = movieUserMatrixSparse.T.tocsr()
userId = 1
recommendations = modelNearestNeighbor.recommend(userId, user_items)

# find related items
movieId = 1
related = modelNearestNeighbor.similar_items(movieId)



  0%|          | 0/15 [00:00<?, ?it/s]

  7%|▋         | 1.0/15 [00:00<00:01,  9.85it/s]

 17%|█▋        | 2.5/15 [00:00<00:01, 10.49it/s]

 23%|██▎       | 3.5/15 [00:00<00:01,  9.66it/s]

 30%|███       | 4.5/15 [00:00<00:01,  8.79it/s]

 40%|████      | 6.0/15 [00:00<00:00,  9.37it/s]

 50%|█████     | 7.5/15 [00:00<00:00, 10.25it/s]

 57%|█████▋    | 8.5/15 [00:00<00:00,  9.99it/s]

 63%|██████▎   | 9.5/15 [00:00<00:00,  9.86it/s]

 70%|███████   | 10.5/15 [00:01<00:00,  9.80it/s]

 77%|███████▋  | 11.5/15 [00:01<00:00,  9.70it/s]

 83%|████████▎ | 12.5/15 [00:01<00:00,  9.47it/s]

 90%|█████████ | 13.5/15 [00:01<00:00,  9.43it/s]

 97%|█████████▋| 14.5/15 [00:01<00:00,  9.50it/s]

100%|██████████| 15.0/15 [00:01<00:00,  9.70it/s]




In [9]:
# find related items
movieIndex = 0


def getMovieName(movieIndex):
    return movies[movies['movieId'] == movieIds[movieIndex]].iloc[0]['title']


movieName = getMovieName(movieIndex)

print('Looking at {}'.format(movieName))
print('')

relatedMovies = modelMatrixFactorization.similar_items(movieIndex)

for movieIndex, confidence in relatedMovies:
    movieName = getMovieName(movieIndex)
    print('We are {:.0f}% confident that {} is similar'.format(confidence * 100, movieName))

Looking at Toy Story (1995)

We are 30% confident that Toy Story (1995) is similar
We are 18% confident that Aladdin (1992) is similar
We are 17% confident that Lion King, The (1994) is similar
We are 17% confident that Twister (1996) is similar
We are 17% confident that Toy Story 2 (1999) is similar
We are 16% confident that Jurassic Park (1993) is similar
We are 16% confident that Sense and Sensibility (1995) is similar
We are 16% confident that Independence Day (a.k.a. ID4) (1996) is similar
We are 16% confident that Mission: Impossible (1996) is similar
We are 16% confident that Twelve Monkeys (a.k.a. 12 Monkeys) (1995) is similar


In [35]:
# find related items
movieIndex = 0


def getMovieName(movieIndex):
    return movies[movies['movieId'] == movieIds[movieIndex]].iloc[0]['title']


movieName = getMovieName(movieIndex)

print('Looking at {}'.format(movieName))
print('')

relatedMovies = modelNearestNeighbor.similar_items(movieIndex)

for movieIndex, confidence in relatedMovies:
    movieName = getMovieName(movieIndex)
    print('We are {:.0f}% confident that {} is similar'.format(confidence * 100, movieName))

Looking at Toy Story (1995)

We are 100% confident that Toy Story (1995) is similar
We are 46% confident that Once Upon a Time... When We Were Colored (1995) is similar
We are 43% confident that Toy Story 2 (1999) is similar
We are 42% confident that Toy Story 3 (2010) is similar
We are 41% confident that Dear God (1996) is similar
We are 41% confident that Thin Line Between Love and Hate, A (1996) is similar
We are 36% confident that Cry, the Beloved Country (1995) is similar
We are 36% confident that 1-900 (06) (1994) is similar
We are 36% confident that Shrek (2001) is similar
We are 35% confident that Anne Frank Remembered (1995) is similar


In [16]:
from annoy import AnnoyIndex

f = 40
model = AnnoyIndex(numMovies)  # Length of item vector that will be indexed
    
for index, movieRow in enumerate(movieUserMatrix.transpose()):
    model.add_item(index, movieRow)

model.build(10) # 10 trees
model.save('test.ann')

u = AnnoyIndex(f)
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 10)) # will find the 1000 nearest neighbors

IndexError: Item index larger than the largest item index

In [43]:
# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
ratingsData = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [51]:
def runModel(ratingsData, model, paramGrid):
    
    crossValidationSplits = 5
    gs = GridSearchCV(model, paramGrid, measures=['rmse', 'mae'], cv=crossValidationSplits)
    
    gs.fit(ratingsData)
    
    # best RMSE score
    print('The best RMSE is {}'.format(gs.best_score['rmse']))
    
    # combination of parameters that gave the best RMSE score
    print('The best params are {}'.format(gs.best_params['rmse']))

In [47]:
model = NMF
paramGrid = {
    'n_epochs': [5, 10], 
    'n_factors': [10, 50],
    'reg_pu': [0.4, 0.6]
}
runModel(ratingsData, model, paramGrid)

0.9280846209716946
{'n_epochs': 5, 'n_factors': 10, 'reg_pu': 0.6}


In [54]:
model = KNNWithMeans
paramGrid = {'n_epochs': [5, 10], 'n_factors': [10, 50],
                  'reg_pu': [0.4, 0.6]}
runModel(ratingsData, model, paramGrid)

Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


Computing the msd similarity matrix...
Done computing similarity matrix.


The best RMSE is 0.9052067005143343
The best params are {'n_epochs': 5, 'n_factors': 10, 'reg_pu': 0.4}
