In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Tunable parameters for this notebook.
CUTOFF = 10  # a title needs at least this many ratings to be kept
IDX = 10     # positional row of the user-item matrix to query (used with .iloc, so not a userId)

# Load the three ratings splits from CSV.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/val.csv', '../data/test.csv')
)


# Data Preprocessing
# Turn the training ratings into a dense user x movie matrix (`pivot`) and a
# CSR copy of it (`csr_pivot`) for the nearest-neighbour model.
#
# Each step rebinds its name instead of using inplace=True: calling an in-place
# method on a frame obtained by boolean filtering is what triggers pandas'
# SettingWithCopyWarning, and rebinding keeps the pipeline easy to follow.
train = train.drop(columns=['timestamp'])

# Number of ratings per title, as a two-column frame: title, rating_count.
rating_count = (
    train.groupby('title')['rating']
    .count()
    .rename('rating_count')
    .reset_index()
)

# Attach the per-title count to every rating row, keep only titles with at
# least CUTOFF ratings, and keep the first rating per (title, userId) pair.
df = train.merge(rating_count, on='title')
df = df[df['rating_count'] >= CUTOFF]
df = df.drop_duplicates(['title', 'userId'])

# NOTE(review): counting and de-duplication are keyed on 'title' while the
# matrix columns are keyed on 'movieId'; if two movieIds can share a title the
# two keys disagree -- confirm which one is intended.
# pivot_table averages any remaining duplicate (userId, movieId) entries (its
# default aggfunc); movies a user has not rated become 0.
pivot = df.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
csr_pivot = csr_matrix(pivot.values)


# KNN model
# Brute-force cosine-distance neighbour search over the rows of the sparse
# rating matrix; fit() returns the estimator, so construction and fitting chain.
model = NearestNeighbors(metric='cosine', algorithm='brute').fit(csr_pivot)

# Nearest neighbours of the user stored at row IDX of `pivot`.
# The model was fitted on the rows of `pivot` (one row per userId), so
# kneighbors() returns row positions of the most similar *users*, not movies.
# 11 neighbours are requested because position 0 is expected to be the query
# row itself (distance 0) and is skipped when printing.
distances, recommendations = model.kneighbors(
    pivot.iloc[IDX, :].values.reshape(1, -1),
    n_neighbors=11,
)

# Print the 10 nearest users, closest first.
print("Recommendations for userId {0}:\n".format(pivot.index[IDX]))
neighbour_rows = recommendations.flatten()[1:]
neighbour_dists = distances.flatten()[1:]
for rank, (row, dist) in enumerate(zip(neighbour_rows, neighbour_dists), start=1):
    # pivot.index maps a row position back to its userId; the original label
    # said "movieId", which misdescribed the printed value.
    print("{0}: userId {1}, with distance of {2}:".format(rank, pivot.index[row], dist))


Recommendations for userId 11:

1: movieId 176, with distance of 0.6460420995567477:
2: movieId 486, with distance of 0.7044585254811317:
3: movieId 133, with distance of 0.7045454545454546:
4: movieId 602, with distance of 0.7073192634993539:
5: movieId 93, with distance of 0.7142094682854392:
6: movieId 235, with distance of 0.719926303448362:
7: movieId 229, with distance of 0.7259779660536856:
8: movieId 81, with distance of 0.7261468098918477:
9: movieId 485, with distance of 0.726905219663981:
10: movieId 33, with distance of 0.7315346713854005:


In [3]:
# For every row of the user-item matrix, fetch its 101 nearest rows under the
# fitted cosine model. This overwrites the single-user results held in
# `distances` and `recommendations`.
distances, recommendations = model.kneighbors(
    X=pivot.values,
    n_neighbors=101,
)

In [3]:
# Persist the neighbour table: one CSV row per query row, one column per
# neighbour rank. The stored values are row positions in `pivot`, not userIds
# (map them through pivot.index to recover ids).
baseline = pd.DataFrame(recommendations)
baseline.to_csv('../data/baseline.csv', index=False)