# In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from math import sqrt

# SVD-based rating imputation on MovieLens 100k: hide a fraction of the known
# ratings, rebuild the matrix from a rank-20 truncated SVD, and report the RMSE
# on the hidden ratings.


def make_holdout_mask(ratings_matrix, holdout_fraction=0.2, random_state=42):
    """Return a boolean array marking which *observed* ratings to hide.

    Args:
        ratings_matrix: 2-D user x item DataFrame with NaN for unrated cells.
        holdout_fraction: Probability in [0, 1] that an observed rating is
            selected for the hold-out set.
        random_state: Seed for the random generator, so the split (and the
            reported RMSE) is reproducible.

    Returns:
        A boolean ndarray with the same shape as ``ratings_matrix``. It is
        never True on a cell that was already NaN, so every held-out cell has
        a real rating to compare against.

    Raises:
        ValueError: If ``holdout_fraction`` is outside [0, 1].
    """
    if not 0.0 <= holdout_fraction <= 1.0:
        raise ValueError('holdout_fraction must be between 0 and 1')
    rng = np.random.default_rng(random_state)
    observed = ratings_matrix.notna().to_numpy()
    return observed & (rng.random(observed.shape) < holdout_fraction)


def fill_empty_items(ratings_with_missing):
    """Return a float ndarray copy where all-NaN columns hold the global mean.

    SimpleImputer discards columns that contain no observed value, which would
    change the matrix width and break positional alignment with the hold-out
    mask. Giving such columns the global mean rating keeps the shape intact;
    every other NaN is left in place for the imputer to handle.

    Args:
        ratings_with_missing: 2-D user x item ratings (DataFrame or array)
            with NaN for missing cells. The input is not modified.

    Returns:
        A 2-D float ndarray with the same shape as the input.
    """
    values = np.array(ratings_with_missing, dtype=float)
    empty_columns = np.isnan(values).all(axis=0)
    if empty_columns.any():
        values[:, empty_columns] = np.nanmean(values)
    return values


if __name__ == '__main__':
    # Step 1: Load the MovieLens 100k dataset
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    # Convert Surprise trainset to a pandas DataFrame.
    # all_ratings() is a generator of (user, item, rating) tuples, so it has to
    # be materialised before NumPy can turn it into a 2-D array.
    raw_ratings = np.array(list(trainset.all_ratings()), dtype=float)
    ratings_df = pd.DataFrame(raw_ratings, columns=['user_id', 'item_id', 'rating'])
    ratings_df = ratings_df.astype({'user_id': int, 'item_id': int})
    ratings_matrix = ratings_df.pivot(index='user_id', columns='item_id', values='rating')

    # Step 2: Remove 20% of the known ratings to simulate missing entries
    missing_mask = make_holdout_mask(ratings_matrix, holdout_fraction=0.2, random_state=42)
    ratings_with_missing = ratings_matrix.mask(missing_mask)

    # Store the actual held-out ratings (1-D, row-major) to calculate RMSE later
    actual_missing = ratings_matrix.to_numpy()[missing_mask]

    # Step 3: Impute missing values using SVD
    # Pre-fill with the mean rating of each item (SimpleImputer works per column)
    imputer = SimpleImputer(strategy='mean')
    ratings_filled = imputer.fit_transform(fill_empty_items(ratings_with_missing))

    # Apply SVD (Matrix Factorization)
    svd = TruncatedSVD(n_components=20, random_state=42)
    # fit_transform returns the projected data, i.e. U already scaled by Sigma
    U = svd.fit_transform(ratings_filled)
    Sigma = svd.singular_values_
    Vt = svd.components_

    # Reconstruct the matrix; Sigma must not be applied a second time here
    ratings_imputed = np.dot(U, Vt)

    # Step 4: Calculate RMSE for the held-out entries
    imputed_missing = ratings_imputed[missing_mask]
    rmse = sqrt(mean_squared_error(actual_missing, imputed_missing))
    print(f'RMSE after SVD-based imputation: {rmse:.4f}')


# Output: ModuleNotFoundError: No module named 'surprise'
# (the Surprise library is published on PyPI as `scikit-surprise`)