In [1]:
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import sklearn
from sklearn.neighbors import NearestNeighbors

In [2]:
movieRatings = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv")
movieRatings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
#Cutting out the unnecessary columns
movieRatings=movieRatings.drop('timestamp', axis=1)

In [4]:
#NEW DATA TABLE
movieRatings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
movieinfo = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv")
movieinfo.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
#Cutting out the unnecessary columns
movies=movieinfo.drop('genres', axis=1)

In [7]:
 #NEW DATA TABLE
 movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
moviesRatingCount = pd.DataFrame(movieRatings.groupby('movieId').size(), columns=['count'])
moviesRatingCount.head()

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49


In [9]:
#to make more accurate recommendations, we need more than 2-3 common rated movies(eğer kullanıcı sadece 4-5 filmi puanlandırmışşsa, ona istabetli öneri yapmak mümkün olmaz. O yüzden aktif olmayan kullancıları ve filmleri kullanmamamız lazım)
#CLEANING MOVIES
Minnumofratings = 50 #threshhold number
suitablemovies = list(set(moviesRatingCount.query('count >= @Minnumofratings').index))
updatedmovieratings = movieRatings[movieRatings.movieId.isin(suitablemovies)]
print('Original ratings data : ', movieRatings.shape)
print('Ratings data after removing unpopular movies: ', updatedmovieratings.shape)

Original ratings data :  (100836, 3)
Ratings data after removing unpopular movies:  (41360, 3)


In [10]:
# number of ratings given by every user, we will need this value to eleminate inactive users
userActivity = pd.DataFrame(updatedmovieratings.groupby('userId').size(), columns=['count'])
userActivity.head()

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,117
2,15
3,6
4,84
5,34


In [11]:
#CLEANING USERS
userInteractions = 20 #threshhold number
suitableusers = list(set(userActivity.query('count >= @userInteractions').index))
updatedestuserratings = updatedmovieratings[updatedmovieratings.userId.isin(suitableusers)]
print('Original Ratings Data : ', movieRatings.shape)
print('Ratings data after dropping both unpopular movies and inactive users: ', updatedestuserratings.shape)

Original Ratings Data :  (100836, 3)
Ratings data after dropping both unpopular movies and inactive users:  (39954, 3)


In [12]:
#with this function we will create the 'finalMatrix' which we will use to implement KNN on.
def user_ratingMatrix(moviedata):

    #number of uniqe movies and users
    uniqueUsers = len(moviedata['userId'].unique())
    uniqueMovies = len(moviedata['movieId'].unique())

    #these dictionaries map which row/column of the utility matrix responds to which user and movie ID'S.
    # user_mapper: maps user id to user index, movie_mapper: maps movie id to movie index
    user_mapper = dict(zip(np.unique(moviedata["userId"]), list(range(uniqueUsers))))
    movie_mapper = dict(zip(np.unique(moviedata["movieId"]), list(range(uniqueMovies))))

    # user_inv_mapper: maps user index to user id, movie_inv_mapper: maps movie index to movie id
    user_inv_mapper = dict(zip(list(range(uniqueUsers)), np.unique(moviedata["userId"])))
    movie_inv_mapper = dict(zip(list(range(uniqueMovies)), np.unique(moviedata["movieId"])))

    user_index = [user_mapper[i] for i in moviedata['userId']]
    movie_index = [movie_mapper[i] for i in moviedata['movieId']]

     #scipy sparse matrix
    finalMatrix =csr_matrix((moviedata["rating"], (movie_index, user_index)), shape=(uniqueMovies, uniqueUsers))
    return finalMatrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


finalMatrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = user_ratingMatrix(movieRatings)

print(finalMatrix)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [13]:
def RecommendMovies(movie_id, finalMatrix, recNumber, metric='cosine', show_distance=False):

    neighborMovies = []

    movieINDEX = movie_mapper[movie_id]
    movie_vec = finalMatrix[movieINDEX]

    recNumber+=1 #this value is the amount of movies the model will recommend
    kNN = NearestNeighbors(n_neighbors=recNumber, algorithm="brute", metric=metric)
    kNN.fit(finalMatrix)
    movie_vec = movie_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)

    for i in range(0,recNumber):
        n = neighbour.item(i)
        neighborMovies.append(movie_inv_mapper[n])
    neighborMovies.pop(0)
    return neighborMovies





In [14]:

#this is the variable that helps us get movie title with movie ID
movieTITLES = dict(zip(movies['movieId'], movies['title']))

likedMovie = 1

similarMOVIES = RecommendMovies(likedMovie, finalMatrix, recNumber=6)
MovieName = movieTITLES[likedMovie]

print(f"Because You Liked -> {MovieName}")
print(f'You Might Like These :')
for i in similarMOVIES:
    print(movieTITLES[i])

Because You Liked -> Toy Story (1995)
You Might Like These :
Toy Story 2 (1999)
Jurassic Park (1993)
Independence Day (a.k.a. ID4) (1996)
Star Wars: Episode IV - A New Hope (1977)
Forrest Gump (1994)
Lion King, The (1994)
