# SVD

In [2]:
!unzip "ratings.zip"

Archive:  ratings.zip
replace ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [1]:
import pandas as pd

df = pd.read_csv('ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [2]:
len(df)

33832162

In [3]:
df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [4]:
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,33832160.0,33832160.0,33832160.0,33832160.0
mean,165438.0,28313.48,3.54254,1269362000.0
std,95341.22,49928.65,1.063959,254102300.0
min,1.0,1.0,0.5,789652000.0
25%,82953.0,1219.0,3.0,1046718000.0
50%,166129.0,3263.0,4.0,1264740000.0
75%,247450.0,40491.0,4.0,1496919000.0
max,330975.0,288983.0,5.0,1689843000.0


In [5]:
num_users = len(df['userId'].unique())
num_movies = len(df['movieId'].unique())

num_users, num_movies

(330975, 83239)

In [6]:
!pip install scikit-surprise



In [7]:
import gc
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

In [8]:
# Get the list of all movie IDs
all_movie_ids = df['movieId'].unique()
print(len(all_movie_ids))

# Get the list of movies that user 1 has not watched
watched_movies = df[df['userId'] == 1]['movieId']
unwatched_movies = [movie_id for movie_id in all_movie_ids if movie_id not in watched_movies]
print(len(unwatched_movies))

83239
83178


In [9]:
# Define a Reader object to parse the ratings
reader = Reader(rating_scale=(0.5, 5))

# Load the data into the Surprise Dataset format
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

del df

In [10]:
gc.collect()

30

In [11]:
# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

del data

In [13]:
gc.collect()

0

In [15]:
# Initialize the SVD algorithm
algo = SVD()

# Train the algorithm on the training set
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a341821d690>

In [16]:
# Make predictions for the unwatched movies
predictions = [algo.predict(1, movie_id) for movie_id in unwatched_movies]

# Sort the predictions by estimated rating in descending order
sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

In [17]:
# Print the top N recommendations
top_n = 100
print(f"Top {top_n} Recommendations for User 1:")
for i, pred in enumerate(sorted_predictions[:top_n]):
    movie_id = pred.iid
    movie_rating = pred.est
    print(f"#{i+1}: Movie ID {movie_id}, Predicted Rating: {movie_rating}")

Top 100 Recommendations for User 1:
#1: Movie ID 122912, Predicted Rating: 4.87011927742651
#2: Movie ID 122914, Predicted Rating: 4.823283162352283
#3: Movie ID 261955, Predicted Rating: 4.696020625085525
#4: Movie ID 7153, Predicted Rating: 4.683374477233732
#5: Movie ID 6539, Predicted Rating: 4.6822581657920965
#6: Movie ID 318, Predicted Rating: 4.678484395121456
#7: Movie ID 228881, Predicted Rating: 4.621061738928383
#8: Movie ID 58376, Predicted Rating: 4.617607611986646
#9: Movie ID 122916, Predicted Rating: 4.614038600600312
#10: Movie ID 147326, Predicted Rating: 4.604009969937634
#11: Movie ID 88125, Predicted Rating: 4.592002569515453
#12: Movie ID 45722, Predicted Rating: 4.584026876307281
#13: Movie ID 1721, Predicted Rating: 4.58310192867858
#14: Movie ID 117555, Predicted Rating: 4.579241599651569
#15: Movie ID 3578, Predicted Rating: 4.578343341572244
#16: Movie ID 5952, Predicted Rating: 4.575075880822988
#17: Movie ID 40629, Predicted Rating: 4.570900553499388
#18: 

In [18]:
from surprise.dump import dump

# Save the trained model
dump_file = 'trained_model.dump'
dump(dump_file, algo=algo)

In [21]:
from surprise.dump import load

# Load the trained model from the dump file
loaded_model = load(dump_file)[1]

# Make recommendations using the loaded model
predictions = [loaded_model.predict(1, movie_id) for movie_id in unwatched_movies]

# Sort the predictions by estimated rating in descending order
sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

# Print the top N recommendations
top_n = 100
print(f"Top {top_n} Recommendations for User 1 using the loaded model:")
for i, pred in enumerate(sorted_predictions[:top_n]):
    movie_id = pred.iid
    movie_rating = pred.est
    print(f"#{i+1}: Movie ID {movie_id}, Predicted Rating: {movie_rating}")

Top 100 Recommendations for User 1 using the loaded model:
#1: Movie ID 122912, Predicted Rating: 4.87011927742651
#2: Movie ID 122914, Predicted Rating: 4.823283162352283
#3: Movie ID 261955, Predicted Rating: 4.696020625085525
#4: Movie ID 7153, Predicted Rating: 4.683374477233732
#5: Movie ID 6539, Predicted Rating: 4.6822581657920965
#6: Movie ID 318, Predicted Rating: 4.678484395121456
#7: Movie ID 228881, Predicted Rating: 4.621061738928383
#8: Movie ID 58376, Predicted Rating: 4.617607611986646
#9: Movie ID 122916, Predicted Rating: 4.614038600600312
#10: Movie ID 147326, Predicted Rating: 4.604009969937634
#11: Movie ID 88125, Predicted Rating: 4.592002569515453
#12: Movie ID 45722, Predicted Rating: 4.584026876307281
#13: Movie ID 1721, Predicted Rating: 4.58310192867858
#14: Movie ID 117555, Predicted Rating: 4.579241599651569
#15: Movie ID 3578, Predicted Rating: 4.578343341572244
#16: Movie ID 5952, Predicted Rating: 4.575075880822988
#17: Movie ID 40629, Predicted Rating: 

In [19]:
def accuracy(y_true, y_pred, threshold=0.25):
    correct = [abs(true - pred) < threshold for true, pred in zip(y_true, y_pred)]
    return sum(correct) / len(correct)

# Get true ratings from the test set
true_ratings = [rating for (_, _, rating) in testset]

# Predict ratings using the model
predicted_ratings = [algo.predict(uid, iid).est for (uid, iid, _) in testset]

# Calculate accuracy
acc = accuracy(true_ratings, predicted_ratings)
print("Accuracy:", acc)

Accuracy: 0.29619771599009403


In [20]:
acc = accuracy(true_ratings, predicted_ratings, threshold=0.5)
print("Accuracy:", acc)

Accuracy: 0.5374153560672218


---