In [1]:
# Importing libraries
# numpy / pandas handle the numeric arrays and tabular data used in the cells below.
import numpy as np
import pandas as pd
# Plotting libraries (not referenced by the cells visible in this part of the notebook).
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'white' style to subsequent figures.
sns.set_style('white')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the movie catalogue, keeping only the id and the title of each movie
movies = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
)
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
# Structural overview of the movies table: dimensions, numeric summary, dtypes.
print(movies.shape)
print("\n")
print(movies.describe())
print("\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
movies.info()

(9742, 2)


             movieId
count    9742.000000
mean    42200.353623
std     52160.494854
min         1.000000
25%      3248.250000
50%      7300.000000
75%     76232.000000
max    193609.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
dtypes: int64(1), object(1)
memory usage: 152.3+ KB
None


In [4]:
# Load the ratings table: which user gave which rating to which movie
ratings = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# Structural overview of the ratings table: dimensions, numeric summary, dtypes.
print(ratings.shape)
print("\n")
print(ratings.describe())
print("\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
ratings.info()

(100836, 3)


              userId        movieId         rating
count  100836.000000  100836.000000  100836.000000
mean      326.127564   19435.295718       3.501557
std       182.618491   35530.987199       1.042529
min         1.000000       1.000000       0.500000
25%       177.000000    1199.000000       3.000000
50%       325.000000    2991.000000       3.500000
75%       477.000000    8122.000000       4.000000
max       610.000000  193609.000000       5.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB
None


In [6]:
# Join every rating to its movie title via the shared movieId key
df = movies.merge(ratings, on='movieId')
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [7]:
# Count the ratings each title received and show the most-rated titles first
(
    df.groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: rating, dtype: int64

In [8]:
# Discard rows whose title is missing; only the 'title' column is checked.
combine_movie_rating = df.dropna(subset=['title'])
print(combine_movie_rating.head())

# Build a frame with one row per title and the number of ratings that title received
movie_rating_count = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .reset_index(name='total_rating_count')
)
print(movie_rating_count.head())

   movieId             title  userId  rating
0        1  Toy Story (1995)       1     4.0
1        1  Toy Story (1995)       5     4.0
2        1  Toy Story (1995)       7     4.5
3        1  Toy Story (1995)      15     2.5
4        1  Toy Story (1995)      17     4.5
                                     title  total_rating_count
0                               '71 (2014)                   1
1  'Hellboy': The Seeds of Creation (2004)                   1
2                   'Round Midnight (1986)                   2
3                      'Salem's Lot (2004)                   1
4                'Til There Was You (1997)                   2


In [9]:
# Attach each title's total rating count to every individual rating row
rating_with_total_rating_count = pd.merge(
    combine_movie_rating,
    movie_rating_count,
    how='left',
    on='title',
)
rating_with_total_rating_count.head()

Unnamed: 0,movieId,title,userId,rating,total_rating_count
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [10]:
# Keep only ratings of movies that were rated at least `popularity_threshold` times
popularity_threshold = 50
rating_popular_movie = rating_with_total_rating_count.loc[
    lambda frame: frame['total_rating_count'] >= popularity_threshold
]
rating_popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,total_rating_count
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [11]:
# (rows, columns) of the ratings that survived the popularity filter
rating_popular_movie.shape

(41362, 5)

In [12]:
# Reshape the long ratings table into a title x userId table of ratings
# (movie titles as rows, user ids as columns) using pivot_table.
# Movie/user pairs without a rating come out as NaN and are replaced by 0.
userMoviePivotTable = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
userMoviePivotTable.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [13]:
# (number of movie titles, number of users) in the pivot table
userMoviePivotTable.shape

(450, 606)

In [14]:
from scipy.sparse import csr_matrix

# Store the title x user ratings table in Compressed Sparse Row (CSR) form
movieMatrix = csr_matrix(userMoviePivotTable.to_numpy())

In [15]:
from sklearn.neighbors import NearestNeighbors

# Unsupervised nearest-neighbour search over the movie rows, using
# brute-force cosine distance; fit() returns the fitted estimator itself.
knn_model = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
).fit(movieMatrix)
knn_model

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1)

In [16]:
# Testing for a new movie: pick one row of the pivot table at random.
# The generator is seeded so that Restart Kernel -> Run All selects the same
# movie (and therefore produces the same recommendations) on every run.
rng = np.random.default_rng(42)
test_movie_index = int(rng.choice(userMoviePivotTable.shape[0]))
print(test_movie_index)

# kneighbors() returns (distances, indices) of the nearest neighbours of each
# query point. The query is the chosen movie's ratings row reshaped to a
# single-row 2-D array. Six neighbours are requested because the closest match
# is normally the query movie itself.
query_vector = userMoviePivotTable.iloc[test_movie_index, :].values.reshape(1, -1)
distances, indices = knn_model.kneighbors(query_vector, n_neighbors=6)

321


In [17]:
# Show the raw neighbour distances, then the matching row indices
print(distances, indices, sep='\n')

[[1.11022302e-16 4.27712043e-01 4.58294974e-01 4.64220281e-01
  4.72757042e-01 4.90559755e-01]]
[[321 417 137 120 178 155]]


In [18]:
# Report the neighbours found for the query movie as recommendations.
# flatten() returns a copy of the array collapsed into one dimension; do it
# once up front instead of on every loop iteration.
neighbour_distances = distances.flatten()
neighbour_indices = indices.flatten()

if len(neighbour_distances) > 0:
    print('Recommendations for', userMoviePivotTable.index[test_movie_index], ':\n')

# The query movie is normally its own nearest neighbour (distance ~0), but when
# distances tie it is not guaranteed to be returned in first position. Exclude
# it by index rather than by position, and keep at most (k - 1) results so the
# list has the same length as before.
recommendations = [
    (movie_idx, distance)
    for movie_idx, distance in zip(neighbour_indices, neighbour_distances)
    if movie_idx != test_movie_index
][:max(len(neighbour_indices) - 1, 0)]

for rank, (movie_idx, distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, userMoviePivotTable.index[movie_idx], distance))

Recommendations for Rain Man (1988) :

1: Truman Show, The (1998), with distance of 0.4277120427138269:
2: E.T. the Extra-Terrestrial (1982), with distance of 0.4582949744963236:
3: Dead Poets Society (1989), with distance of 0.4642202807620909:
4: Good Will Hunting (1997), with distance of 0.47275704171173083:
5: Finding Nemo (2003), with distance of 0.4905597548142484:


<h1> Issues with KNN-Based Collaborative Filtering </h1>

<b>popularity bias</b>: The system is biased towards movies that have the most user interaction (i.e. ratings and reviews).<br>
<b>item cold-start problem</b>: When a new movie is added to the catalogue, it has far fewer user interactions than established titles and will therefore rarely appear as a recommendation.<br>
<b>scalability issue</b>: The movie-user matrix becomes harder to manage as the number of users and movies increases, since about 90% of its values will be 0. Storing such a sparse matrix in dense form wastes space when the database accommodates millions of users and movies.