In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
import requests
import zipfile
import io
import os

In [3]:
def download_and_extract_movielens():
    if not os.path.exists('ml-100k'):
        print("Downloading MovieLens 100K dataset...")
        url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
        r = requests.get(url)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall()
        print("Movielens 100K dataset downloaded and extracted successfully.")
    else:
        print("The dataset already exists. Download skipped.")

In [4]:
download_and_extract_movielens()

ratings_df = pd.read_csv('ml-100k/u.data', sep='\t',
                       names=['user_id', 'item_id', 'rating', 'timestamp'])

print(f"Dataset shape: {ratings_df.shape}")
print(f"Number of unique users: {ratings_df['user_id'].nunique()}")
print(f"Number of unique movies: {ratings_df['item_id'].nunique()}")
print(f"Range of ratings: {ratings_df['rating'].min()} to {ratings_df['rating'].max()}")

Downloading MovieLens 100K dataset...
Movielens 100K dataset downloaded and extracted successfully.
Dataset shape: (100000, 4)
Number of unique users: 943
Number of unique movies: 1682
Range of ratings: 1 to 5


In [5]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
model = SVD(n_factors=20, lr_all=0.01, reg_all=0.01, n_epochs=20, random_state=42)
model.fit(trainset)

predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

RMSE: 0.9576
MAE:  0.7455
Test RMSE: 0.9576
Test MAE: 0.7455


In [7]:
cv_results = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

print(f"Average RMSE: {cv_results['test_rmse'].mean():.4f}")
print(f"Average MAE: {cv_results['test_mae'].mean():.4f}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9535  0.9611  0.9545  0.9600  0.9581  0.9574  0.0030  
MAE (testset)     0.7462  0.7512  0.7431  0.7520  0.7506  0.7486  0.0034  
Fit time          1.11    0.94    1.07    1.13    1.15    1.08    0.08    
Test time         0.27    0.11    0.32    0.12    0.28    0.22    0.09    
Average RMSE: 0.9574
Average MAE: 0.7486


In [8]:
def get_movie_names():
    movies_df = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1',
                          header=None, usecols=[0, 1],
                          names=['item_id', 'title'])
    return movies_df

movies_df = get_movie_names()

def recommend_movies(user_id, n=10):
    # List of all movies
    all_movies = movies_df['item_id'].unique()

    # Movies already rated by the user
    rated_movies = ratings_df[ratings_df['user_id'] == user_id]['item_id'].values

    # Movies not yet rated by the user
    unrated_movies = np.setdiff1d(all_movies, rated_movies)

    # Predicting ratings on unseen movies, by using the trained SVD model
    predictions = []
    for item_id in unrated_movies:
        predicted_rating = model.predict(user_id, item_id).est
        predictions.append((item_id, predicted_rating))

    # Rank predictions by estimated rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get top N recommendations
    top_recommendations = predictions[:n]

    # Fetch movie titles associated with top N recommendations
    recommendations = pd.DataFrame(top_recommendations, columns=['item_id', 'predicted_rating'])
    recommendations = recommendations.merge(movies_df, on='item_id')

    return recommendations

In [9]:
user_id = 42
recommendations = recommend_movies(user_id, n=10)

print(f"\nTop 10 recommended movies for user {user_id}:")
print(recommendations[['title', 'predicted_rating']])


Top 10 recommended movies for user 42:
                                     title  predicted_rating
0                        Braveheart (1995)          5.000000
1                           Titanic (1997)          5.000000
2                         Boot, Das (1981)          5.000000
3  Some Folks Call It a Sling Blade (1993)          4.702539
4    Day the Earth Stood Still, The (1951)          4.641970
5                           Top Hat (1935)          4.624840
6                       Being There (1979)          4.620471
7                        Home Alone (1990)          4.619863
8            Miracle on 34th Street (1994)          4.613372
9                 Great Escape, The (1963)          4.607529


In [10]:
user_id = 22
recommendations = recommend_movies(user_id, n=10)

print(f"\nTop 10 recommended movies for user {user_id}:")
print(recommendations[['title', 'predicted_rating']])


Top 10 recommended movies for user 22:
                                          title  predicted_rating
0                              Apollo 13 (1995)          5.000000
1                                 Clerks (1994)          5.000000
2                    Clockwork Orange, A (1971)          5.000000
3                                 Scream (1996)          5.000000
4                        Wings of Desire (1987)          5.000000
5                          Happy Gilmore (1996)          5.000000
6                            Rear Window (1954)          4.882127
7  Rosencrantz and Guildenstern Are Dead (1990)          4.844697
8                     When We Were Kings (1996)          4.799633
9                             GoodFellas (1990)          4.790532
