In [6]:
pip install scikit-surprise



In [7]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

In [8]:
# Directory that holds the input CSV files. Kept in one place so the notebook
# can be pointed at another location by editing a single value.
DATA_DIR = '/content'

# Load the raw ratings and the movie metadata tables.
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')

In [9]:
# Preview the first rows of the ratings table to check its columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [10]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
# Check column dtypes and non-null counts of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [26]:
# Check column dtypes and non-null counts of the movies table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [27]:
# Summary statistics for every numeric column of the ratings table; the
# min/max of `rating` give the bounds of the rating scale.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [28]:
# Number of ratings per user, most active users first.
ratings['userId'].value_counts()

userId
668    5678
575    2837
458    2086
232    1421
310    1287
       ... 
58       20
51       20
288      20
388      20
257      20
Name: count, Length: 668, dtype: int64

In [29]:
# Number of ratings per movie, most-rated movies first.
ratings['movieId'].value_counts()

movieId
296       325
356       311
318       308
480       294
593       290
         ... 
111732      1
112279      1
113630      1
59621       1
142507      1
Name: count, Length: 10325, dtype: int64

In [11]:
# Create a Surprise Reader carrying the rating scale of this dataset.
# No file is parsed here: the ratings are already in a DataFrame, so the
# Reader only supplies the (min, max) rating bounds when that frame is loaded.
reader = Reader(rating_scale=(0.5, 5.0))

In [12]:
# Load the dataset into a Surprise format.
# Only three columns are passed, in the order user id, item id, rating;
# the `timestamp` column is left out.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [13]:
# Split the data into training (75%) and test (25%) sets.
# random_state pins the shuffle so the split, and every metric or ranking
# derived from it, is identical on each Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [14]:
# Build the SVD (Singular Value Decomposition) model with default
# hyperparameters. random_state fixes the random initialisation so that
# re-running the notebook reproduces the same fitted model.
model = SVD(random_state=42)

In [15]:
# Train the model on the training set.
# The call is the last expression in the cell, so its return value is echoed
# below as the SVD object's repr.
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9e299f3160>

In [16]:
# Make predictions on the held-out test set.
# Each element of `predictions` is later unpacked as
# (uid, iid, true rating, estimate, details) by get_top_n_recommendations.
predictions = model.test(testset)

In [17]:
# Calculate RMSE (Root Mean Squared Error) on the test-set predictions.
# verbose=False stops accuracy.rmse from printing its own "RMSE: ..." line,
# which would otherwise duplicate the formatted line printed below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"Test RMSE: {rmse:.4f}")

RMSE: 0.8801
Test RMSE: 0.8801


In [18]:
def get_top_n_recommendations(predictions, user_id, n=10):
    """Return the ``n`` highest-estimated predictions for a single user.

    Args:
        predictions: Iterable of 5-field prediction records, each unpacking
            as ``(uid, iid, true_rating, estimate, details)``.
        user_id: The ``uid`` whose predictions should be ranked.
        n: Maximum number of results to return. Zero or a negative value
            yields an empty list.

    Returns:
        A list of ``(iid, estimate)`` tuples ordered by estimate, highest
        first, containing at most ``n`` entries. Entries with equal
        estimates keep the order in which they appear in ``predictions``.
        The list is empty when ``user_id`` has no predictions.

    Note:
        The function only ranks the predictions it is given; it does not
        exclude items the user has already rated.
    """
    # Keep only the requested user's rows. Grouping and sorting every other
    # user's predictions would be wasted work, since one list is returned.
    user_ratings = [
        (iid, est)
        for uid, iid, _true_r, est, _details in predictions
        if uid == user_id
    ]

    # list.sort is stable (also with reverse=True), so ties stay in input order.
    user_ratings.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp n so a negative value cannot turn the slice into
    # "everything except the last |n| entries".
    return user_ratings[:max(n, 0)]

In [21]:
# Get the top 10 predictions for a user.
# NOTE: `predictions` comes from model.test(testset), so this ranks the
# movies in this user's held-out test ratings (movies they have already
# rated), not movies the user has never seen.
user_id = 2
recommendations = get_top_n_recommendations(predictions, user_id, n=10)

In [22]:
# Print the recommended movies with their estimated ratings.
print("Top 10 movie recommendations for user", user_id)
for movie_id, estimated_rating in recommendations:
    # Single .loc lookup instead of chained indexing; fall back to a
    # placeholder rather than raising IndexError when the movieId has no
    # row in `movies`.
    matching_titles = movies.loc[movies['movieId'] == movie_id, 'title']
    if matching_titles.empty:
        movie_title = f"<unknown movieId {movie_id}>"
    else:
        movie_title = matching_titles.iloc[0]
    print(f"{movie_title}: {estimated_rating:.2f}")

Top 10 movie recommendations for user 2
Twelve Monkeys (a.k.a. 12 Monkeys) (1995): 4.26
Postman, The (Postino, Il) (1994): 4.21
Willy Wonka & the Chocolate Factory (1971): 4.04
Mighty Aphrodite (1995): 4.04
Dead Man Walking (1995): 3.82
James and the Giant Peach (1996): 3.74
Rumble in the Bronx (Hont faan kui) (1995): 3.73
Independence Day (a.k.a. ID4) (1996): 3.71
Mission: Impossible (1996): 3.68
Juror, The (1996): 3.49


In [23]:
# Repeat the top-10 ranking for a second user.
# As before, `predictions` holds test-set predictions, so only movies from
# this user's held-out ratings can appear in the result.
user_id = 5
recommendations = get_top_n_recommendations(predictions, user_id, n=10)

In [24]:
# Print the recommended movies with their estimated ratings.
print("Top 10 movie recommendations for user", user_id)
for movie_id, estimated_rating in recommendations:
    # Single .loc lookup instead of chained indexing; fall back to a
    # placeholder rather than raising IndexError when the movieId has no
    # row in `movies`.
    matching_titles = movies.loc[movies['movieId'] == movie_id, 'title']
    if matching_titles.empty:
        movie_title = f"<unknown movieId {movie_id}>"
    else:
        movie_title = matching_titles.iloc[0]
    print(f"{movie_title}: {estimated_rating:.2f}")

Top 10 movie recommendations for user 5
Matrix, The (1999): 4.12
Clockwork Orange, A (1971): 3.77
Finding Nemo (2003): 3.75
Toy Story (1995): 3.63
Deliverance (1972): 3.48
Thin Red Line, The (1998): 3.37
Just My Luck (2006): 3.23
Witches of Eastwick, The (1987): 3.22
Get Smart (2008): 3.10
Batman (1989): 3.08
