In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print
# each full path on its own line (same traversal order as os.walk yields).
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-1m-dataset/users.dat
/kaggle/input/movielens-1m-dataset/ratings.dat
/kaggle/input/movielens-1m-dataset/README
/kaggle/input/movielens-1m-dataset/movies.dat


# Limitations of scikit-learn's NMF library

In [4]:
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from math import sqrt

In [5]:
# Column names for each file (the .dat files are read with header=None,
# so the names are supplied explicitly).
movies_columns = ['movie_id', 'title', 'genres']
users_columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Reader options shared by all three files: '::' field separator,
# ISO-8859-1 text, no header row, Python parsing engine.
dat_options = dict(sep='::', encoding='ISO-8859-1', header=None, engine='python')

movies_data = pd.read_csv('/kaggle/input/movielens-1m-dataset/movies.dat',
                          names=movies_columns, **dat_options)
users_data = pd.read_csv('/kaggle/input/movielens-1m-dataset/users.dat',
                         names=users_columns, **dat_options)
ratings_data = pd.read_csv('/kaggle/input/movielens-1m-dataset/ratings.dat',
                           names=ratings_columns, **dat_options)

In [6]:
# Report (rows, columns) for each loaded frame, one line per frame.
shape_report = (
    ("Movies data shape:", movies_data),
    ("Users data shape:", users_data),
    ("Ratings data shape:", ratings_data),
)
for label, frame in shape_report:
    print(label, frame.shape)

Movies data shape: (3883, 3)
Users data shape: (6040, 5)
Ratings data shape: (1000209, 4)


In [7]:
# Create user-item ratings matrix.
# The matrices are indexed directly by the raw IDs, so they need max_id + 1
# rows/columns; any row or column whose ID never occurs simply stays at zero.
n_users = ratings_data['user_id'].max() + 1  # User IDs start at 1
n_items = ratings_data['movie_id'].max() + 1  # Movie IDs start at 1


def _dense_ratings(frame, shape):
    """Return a dense float matrix holding ``frame``'s ratings.

    Parameters
    ----------
    frame : DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.
    shape : ``(rows, columns)`` of the matrix to allocate.

    Returns
    -------
    ndarray of ``shape`` where ``[user_id, movie_id]`` holds the rating and
    every other cell is 0.

    One vectorised integer-array assignment replaces the per-row Python loop
    (which built a Series via ``.loc`` for every single rating).
    """
    dense = np.zeros(shape)
    dense[frame['user_id'].to_numpy(), frame['movie_id'].to_numpy()] = frame['rating'].to_numpy()
    return dense


# Fill the ratings matrix
ratings_matrix = _dense_ratings(ratings_data, (n_users, n_items))

# Split into train and test sets (80% train, 20% test)
train_indices = ratings_data.sample(frac=0.8, random_state=42).index
test_indices = ratings_data.drop(train_indices).index

# Fill train and test matrices from their respective rows
train_matrix = _dense_ratings(ratings_data.loc[train_indices], (n_users, n_items))
test_matrix = _dense_ratings(ratings_data.loc[test_indices], (n_users, n_items))

In [9]:
# Apply NMF with tuned parameters
n_components = 15
nmf_settings = {
    'init': 'nndsvd',
    'random_state': 42,
    'max_iter': 2000,   # Increased from 500
    'tol': 1e-3,        # Relaxed tolerance
    'alpha_W': 0.01,    # Regularization for W
    'alpha_H': 0.01,    # Regularization for H
    'l1_ratio': 0.5,    # Mix of L1 and L2 regularization
}
model = NMF(n_components=n_components, **nmf_settings)

# Factorize the training matrix: W holds one row of latent factors per user
# row of train_matrix, H holds the matching item-side factors.
W = model.fit_transform(train_matrix)
H = model.components_

In [10]:
# Predict ratings by multiplying the factors back together, then
# clip predictions to valid rating range (1–5)
predicted_matrix = W.dot(H).clip(1, 5)

# Locate the held-out ratings: every strictly positive cell of test_matrix.
# NOTE: this rebinds test_indices to a (row_idx, col_idx) tuple of arrays.
test_indices = (test_matrix > 0).nonzero()
actual_ratings = test_matrix[test_indices]
predicted_ratings = predicted_matrix[test_indices]

# Compute RMSE over the held-out cells only
mse = mean_squared_error(actual_ratings, predicted_ratings)
rmse = sqrt(mse)
print(f"NMF RMSE: {rmse:.4f}")

NMF RMSE: 2.5138


# Results of matrix factorization:

The RMSE of 2.5138 obtained with scikit-learn’s Non-Negative Matrix Factorization (NMF) on the MovieLens 1M dataset is significantly higher than the RMSE values of the simple baseline and similarity-based methods that I computed earlier (ranging from 0.981 to 1.258). This indicates that NMF, as applied here, performed poorly compared to those methods.

Why did the non-negative matrix factorization library not work as well as the simple baseline or similarity-based methods?
1. Improper handling of missing ratings:
  * NMF treats missing ratings as actual zero ratings, which biases predictions toward low values and leads to large errors on the actual ratings (1–5), whereas the earlier methods ignore the missing ratings and therefore handle them well.
2. Sparsity in the data:
*  We have 6,040 users × 3,883 movies. If every user rated every movie we would have about 23.5 million ratings, but the dataset contains only about 1 million, so the ratings matrix is very sparse (roughly 4% filled). Factorizing it as a dense matrix is computationally inefficient; however, the baseline and similarity methods are unaffected by the sparsity.
3. Lack of other feature data:
*  The model uses only the ratings matrix; it does not use genre or location information, which makes it less effective at rating prediction, whereas the previous methods made good use of all the features.

How can this be fixed?
1. Handle the missing ratings properly: instead of treating them as 0, fill them with the movie's average rating.
2. Handle sparsity better: the sparse data matrix slows convergence and degrades the factorization. Filter the users or movies to make the matrix denser, which gives a better factorization and better predictions.
3. Incorporate the missing label information, which would further improve the movie rating predictions.