In [1]:
! pip install fuzzywuzzy



In [2]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

import warnings
warnings.filterwarnings('ignore')



In [3]:
# Importing the datasets
movies = pd.read_csv(r'C:\Users\Admin\Downloads\movies.csv')
tags = pd.read_csv(r'C:\Users\Admin\Downloads\tags.csv')
ratings = pd.read_csv(r'C:\Users\Admin\Downloads\ratings.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))', expand = False)   #extracting year

# Removing ()
movies['year'] = movies.title.str.extract('(\d\d\d\d)', expand = False)


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [7]:
#Replace year from title column with white space
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')

#using the strip funtion we can get rid of any ending whitespaces
movies['title'] = movies['title'].str.strip()

movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [8]:
ratings.groupby('movieId').rating.mean()
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
df = movies.join(ratings, lsuffix = 'N', rsuffix = 'K')   # to join movies and ratings dataframe
df

Unnamed: 0,movieIdN,title,genres,year,userId,movieIdK,rating,timestamp
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,296,5.0,1147880044
1,2,Jumanji,Adventure|Children|Fantasy,1995,1,306,3.5,1147868817
2,3,Grumpier Old Men,Comedy|Romance,1995,1,307,5.0,1147868828
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,1,665,5.0,1147878820
4,5,Father of the Bride Part II,Comedy,1995,1,899,3.5,1147868510
...,...,...,...,...,...,...,...,...
62418,209157,We,Drama,2018,494,102903,3.5,1532809067
62419,209159,Window of the Soul,Documentary,2001,494,103042,3.0,1532809256
62420,209163,Bad Poems,Comedy|Drama,2018,494,103249,3.0,1532809116
62421,209169,A Girl Thing,(no genres listed),2001,494,103372,3.0,1532911673


In [10]:
movie_users = df.pivot(index = 'movieIdN', columns = 'userId', values = 'rating').fillna(0)

matrix_movies_users = csr_matrix(movie_users.values)
print(matrix_movies_users)

  (0, 0)	5.0
  (1, 0)	3.5
  (2, 0)	5.0
  (3, 0)	5.0
  (4, 0)	3.5
  (5, 0)	4.0
  (6, 0)	3.5
  (7, 0)	3.5
  (8, 0)	5.0
  (9, 0)	4.0
  (10, 0)	3.5
  (11, 0)	4.0
  (12, 0)	2.5
  (13, 0)	2.5
  (14, 0)	2.5
  (15, 0)	3.5
  (16, 0)	4.5
  (17, 0)	4.0
  (18, 0)	5.0
  (19, 0)	5.0
  (20, 0)	4.5
  (21, 0)	4.0
  (22, 0)	5.0
  (23, 0)	5.0
  (24, 0)	5.0
  :	:
  (62398, 493)	3.0
  (62399, 493)	3.5
  (62400, 493)	4.0
  (62401, 493)	3.5
  (62402, 493)	3.5
  (62403, 493)	2.5
  (62404, 493)	3.0
  (62405, 493)	3.5
  (62406, 493)	3.0
  (62407, 493)	0.5
  (62408, 493)	2.5
  (62409, 493)	2.0
  (62410, 493)	1.5
  (62411, 493)	3.5
  (62412, 493)	3.5
  (62413, 493)	3.0
  (62414, 493)	3.5
  (62415, 493)	3.5
  (62416, 493)	3.0
  (62417, 493)	3.5
  (62418, 493)	3.5
  (62419, 493)	3.0
  (62420, 493)	3.0
  (62421, 493)	3.0
  (62422, 493)	4.0


In [11]:
# Applying the kNN algorithm in matrix_movies_users
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20 )
model.fit(matrix_movies_users)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)

In [12]:
# Defining an user defined function to recommend movies
def recommender (movie_name, data, n):
    idx = process.extractOne(movie_name, movies['title'])[2]
    print('Movie Selected : ',movies['title'][idx], 'Index : ', idx)
    print('Searching for Recommendation.........')
    distance , indices = model.kneighbors(data[idx], n_neighbors = n)
    for i in indices:
        print(movies['title'][i].where(i!=idx))

In [14]:
# Calling the function
recommender('iron man',matrix_movies_users, 10 )

Movie Selected :  Iron Man Index :  12324
Searching for Recommendation.........
12363                             Nina's Heavenly Delights
12364                                       Following Sean
12356                                            King Corn
12357    Indiana Jones and the Kingdom of the Crystal S...
12358                                    How to Rob a Bank
12359                     Hey, Hey, It's Esther Blueburger
12360                                  Quiet American, The
12361                           Patti Smith: Dream of Life
12362                                    Day of the Outlaw
12354                  Year of the Wolf, The (Suden vuosi)
Name: title, dtype: object
