# Recommendation System for MoveLens Dataset using SVD

In [1]:
# Import libraries
import numpy as np
import pandas as pd

# To load the 'ratings' and 'movies' dataset after uploading them to Jupyter notebook

In [3]:
# Reading ratings file

ratings = pd.read_csv('ratings.csv', usecols=['userId','movieId','rating','timestamp'])

In [5]:
# Reading movies file

movies = pd.read_csv('movies.csv', usecols=['movieId','title','genres'])

In [6]:
# Print first five rows of movies datset

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Print first five rows of ratings datset

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


# To find the unique number of users and movies in the 'ratings' dataset

In [8]:

n_users = ratings.userId.unique().shape[0]

n_movies = ratings.movieId.unique().shape[0]

print(f'Number of users = {n_users} and Number of movies = {n_movies}')

Number of users = 7120 and Number of movies = 14026


# To create a rating matrix for the 'ratings' dataset

In [9]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# To install the scikit-surprise library for implementing SVD

### Run the following command in the Anaconda Prompt to install surprise package

In [13]:
#conda install -c conda-forge scikit-surprise

In [12]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357227 sha256=d152a94dea04ba2ee622333baca6d04cc200779de1629bd38606831cce2092aa
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [14]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Use the SVD algorithm.
svd = SVD()

# Compute the RMSE of the SVD algorithm.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8466  0.8463  0.8434  0.8454  0.0015  
MAE (testset)     0.6480  0.6480  0.6459  0.6473  0.0010  
Fit time          14.40   14.54   14.87   14.60   0.20    
Test time         4.25    3.79    5.07    4.37    0.53    


{'test_rmse': array([0.84661039, 0.84627691, 0.84336341]),
 'test_mae': array([0.64800163, 0.64799611, 0.64588   ]),
 'fit_time': (14.403439283370972, 14.537676572799683, 14.873088598251343),
 'test_time': (4.253359317779541, 3.793494701385498, 5.068759202957153)}

In [15]:
# Print the head of ratings dataset
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


# To find all the movies rated as more than 4 stars by user with userId = 1

In [16]:
ratings_1 = ratings[(ratings['userId'] == 1) & (ratings['rating'] >= 4)]
ratings_1 = ratings_1.set_index('movieId')
ratings_1 = ratings_1.join(movies)['title']
ratings_1.head(10)

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
151,Batman Forever (1995)
223,Dream Man (1995)
253,Junior (1994)
260,Ladybird Ladybird (1994)
293,Pulp Fiction (1994)
296,Priest (1994)
318,Strawberry and Chocolate (Fresa y chocolate) (1993)
541,Harem (1985)
1036,Jude (1996)
1079,Top Gun (1986)


# Train an SVD to predict ratings for user with userId = 1

In [17]:
# Create a shallow copy for the movies dataset
user_1 = movies.copy()

#Reset the index for user_1 dataset
user_1 = user_1.reset_index()



# getting full dataset
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


#create a training set for svd
trainset = data.build_full_trainset()
svd.fit(trainset)

#Predict the ratings for user1
user_1['Estimate_Score'] = user_1['movieId'].apply(lambda x: svd.predict(1, x).est)

#Drop extra columns from the user1 data frame
user_1 = user_1.drop(['movieId','genres','index'], axis = 1)

# Sort predicted ratings for user1 in descending order
user_1 = user_1.sort_values('Estimate_Score', ascending=False)

#Print top 10 recommendations
print(user_1.head(10))

                                                   title  Estimate_Score
5853       Lord of the Rings: The Two Towers, The (2002)        4.592742
7041   Lord of the Rings: The Return of the King, The...        4.583871
4897   Lord of the Rings: The Fellowship of the Ring,...        4.546152
5473                               Thesis (Tesis) (1996)        4.519751
7660                        Au revoir les enfants (1987)        4.509782
5508      Son of the Bride (Hijo de la novia, El) (2001)        4.483365
18990                                Black Mirror (2011)        4.477737
6873   Passion of Joan of Arc, The (Passion de Jeanne...        4.463514
3953                                   Gettysburg (1993)        4.462474
4239                                        Yi Yi (2000)        4.462185
