In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('../data/ml-latest-small/ratings.csv')
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


### --------> OBSERVATIONS:

+ movieId: A unique identifier for the movie.
+ title: The title of the movie, along with its release year in parentheses.
+ genres: The genres associated with the movie, separated by pipe characters (|).

In [10]:
# check for missing values and infinities
df.isnull().sum()
df.isnull().values.any()
# check for infinities
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.isnull().values.any()


False

In [11]:
# unique users
print(f'Number of unique users: {df.userId.unique().shape[0]}\n')

# unique movies
print(f'Number of unique movies: {df.movieId.unique().shape[0]}\n')

# unique ratings
print(f'Number of unique ratings: {df.rating.unique().shape[0]}\n')


Number of unique users: 610

Number of unique movies: 9724

Number of unique ratings: 10



In [12]:
def split_data_by_rated_items(df, test_size, given_n):
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=42, stratify=df['userId'])

    # For each user in the test set, keep only 'given_n' rated items if they have rated that many,
    # otherwise keep all the items they have rated.
    test_df = test_df.groupby('userId').apply(lambda x: x.sample(min(len(x), given_n), random_state=42))

    return train_df, test_df.reset_index(drop=True)


def split_data_by_unique_users(df):
    unique_users = df['userId'].unique()
    np.random.shuffle(unique_users)

    # Get the user IDs for each set
    M50_users = unique_users[:50]
    M100_users = unique_users[50:150]
    M400_users = unique_users[150:550]
    test_users = unique_users[550:]

    # Split the DataFrame into the different sets based on the user IDs
    M50_df = df[df['userId'].isin(M50_users)]
    M100_df = df[df['userId'].isin(M100_users)]
    M400_df = df[df['userId'].isin(M400_users)]
    test_df = df[df['userId'].isin(test_users)]

    return M50_df, M100_df, M400_df, test_df


def all_but_one(df):
    # For each user, select one rating and split it into a separate DataFrame
    test_df = df.groupby('userId').sample(n=1, random_state=42)
    train_df = df.drop(test_df.index)
    
    return train_df, test_df

# Call the function
M50_df, M100_df, M400_df, test_df = split_data_by_unique_users(df)

print('M50 set:\n', M50_df)
print('M100 set:\n', M100_df)
print('M400 set:\n', M400_df)
print('Test set:\n', test_df)

# Call the functions
train_df_given_10, test_df_given_10 = split_data_by_rated_items(df, test_size=0.2, given_n=10)  # Modify test_size and given_n as needed
print('Training set:\n', train_df_given_10)
print('Test set:\n', test_df_given_10)

train_df, test_df = all_but_one(df)
print('All-But-One Training set:\n', train_df)
print('All-But-One Test set:\n', test_df)



M50 set:
        userId  movieId  rating   timestamp
261         3       31     0.5  1306463578
262         3      527     0.5  1306464275
263         3      647     0.5  1306463619
264         3      688     0.5  1306464228
265         3      720     0.5  1306463595
...       ...      ...     ...         ...
95960     601   170705     5.0  1521397596
95961     601   172591     4.5  1521467819
95962     601   174055     4.0  1521397739
95963     601   176371     4.0  1521397623
95964     601   177765     4.5  1521397621

[6273 rows x 4 columns]
M100 set:
        userId  movieId  rating   timestamp
1569       16       47     3.5  1377477814
1570       16       50     4.0  1377476781
1571       16      111     4.5  1377477446
1572       16      204     2.0  1377476617
1573       16      260     3.0  1377476936
...       ...      ...     ...         ...
98474     606    76093     4.0  1368460114
98475     606    91500     4.5  1349082477
98476     606    95105     2.5  1349083032
98477   

In [13]:
train_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [7]:
# rmse: root mean squared error
def rmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

# mae: mean absolute error
def mae(y_true, y_pred):
    return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))

# SVD

+ "cold-start handling"

In [23]:
class SVD:
    def __init__(self, num_factors, learning_rate, num_epochs, top_n=10):
        # Initializing the instance variables with given arguments
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.top_n = top_n  # number of movies to recommend for cold start

    def fit(self, user_item_ratings):
        # Initializing the user and movie latent factors matrices with random numbers
        self.user_factors = np.random.randn(user_item_ratings.userId.nunique(), self.num_factors)
        self.movie_factors = np.random.randn(user_item_ratings.movieId.nunique(), self.num_factors)
        
        # Creating dictionaries to map user and movie IDs to their respective indices in the factor matrices
        self.user_index = {user_id: idx for idx, user_id in enumerate(user_item_ratings.userId.unique())}
        self.movie_index = {movie_id: idx for idx, movie_id in enumerate(user_item_ratings.movieId.unique())}

        # Calculate average rating for each movie
        self.movie_avg_rating = user_item_ratings.groupby('movieId')['rating'].mean().to_dict()

        # Get top-N movies based on average rating for cold start problem
        sorted_movies_by_avg_rating = sorted(self.movie_avg_rating.items(), key=lambda x: x[1], reverse=True)
        self.top_n_movies = [movie_id for movie_id, _ in sorted_movies_by_avg_rating[:self.top_n]]

        # Loop over epochs
        for epoch in range(self.num_epochs):
            # Loop over all user-item-rating rows in the DataFrame
            for idx, row in user_item_ratings.iterrows():
                user_id = row['userId']
                movie_id = row['movieId']
                rating = row['rating']

                # Getting the user and movie indices for the current user-item pair
                user_idx = self.user_index[user_id]
                movie_idx = self.movie_index[movie_id]

                # Computing the predicted rating as the dot product of the user and movie factors
                prediction = np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])
                # Computing the error as the difference between the actual and predicted ratings
                error = rating - prediction

                # Updating the user and movie factor vectors in the direction that minimizes the error
                self.user_factors[user_idx] += self.learning_rate * error * self.movie_factors[movie_idx]
                self.movie_factors[movie_idx] += self.learning_rate * error * self.user_factors[user_idx]

    def predict(self, user_id, movie_id):
        # Getting the user and movie indices for the given user-item pair
        user_idx = self.user_index.get(user_id, -1)
        movie_idx = self.movie_index.get(movie_id, -1)

        # If the user or the movie is not present in the training data, return the movie's average rating
        if user_idx == -1 or movie_idx == -1:
            return self.movie_avg_rating.get(movie_id)

        # Otherwise, return the predicted rating as the dot product of the user and movie factors
        return np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])

    def recommend(self, user_id):
        # If the user is not present in the training data, return top-N movies
        if user_id not in self.user_index:
            return self.top_n_movies

        # Otherwise, predict the rating for each movie and return the top-N movies
        user_ratings = {movie_id: self.predict(user_id, movie_id) for movie_id in self.movie_index.keys()}
        sorted_user_ratings = sorted(user_ratings.items(), key=lambda x: x[1], reverse=True)
        return [movie_id for movie_id, _ in sorted_user_ratings[:self.top_n]]


In [24]:
%%time

# Fit the model to the M50 data
svd.fit(M50_df)

# Predict ratings for the Test set and evaluate
test_predictions = test_df.apply(lambda row: svd.predict(row['userId'], row['movieId']), axis=1)

# Remove None values and corresponding actual ratings
actual_ratings = test_df['rating'][test_predictions.notna()]
test_predictions = test_predictions.dropna()

print('Test RMSE (M50):', rmse(actual_ratings, test_predictions))
print('Test MAE (M50):', mae(actual_ratings, test_predictions))


Test RMSE (M50): 1.6440047060689733
Test MAE (M50): 1.244640771592552
CPU times: user 2.53 s, sys: 24.9 ms, total: 2.55 s
Wall time: 2.55 s


In [25]:
%%time

# Fit the model to the M100 data
svd.fit(M100_df)

# Predict ratings for the Test set and evaluate
test_predictions = test_df.apply(lambda row: svd.predict(row['userId'], row['movieId']), axis=1)
# Remove None values and corresponding actual ratings
actual_ratings = test_df['rating'][test_predictions.notna()]
test_predictions = test_predictions.dropna()

print('Test RMSE (M50):', rmse(actual_ratings, test_predictions))
print('Test MAE (M50):', mae(actual_ratings, test_predictions))

# # Fit the model to the M400 data
# svd.fit(M400_df)

# # Predict ratings for the Test set and evaluate
# test_predictions = test_df.apply(lambda row: svd.predict(row['userId'], row['movieId']), axis=1)
# print('Test RMSE (M400):', rmse(test_df['rating'], test_predictions))


Test RMSE (M50): 1.3597788921469673
Test MAE (M50): 1.015072585570547
CPU times: user 17.7 s, sys: 574 ms, total: 18.2 s
Wall time: 17.8 s


In [26]:
%%time
# Fit the model to the M400 data
svd.fit(M400_df)

# Predict ratings for the Test set and evaluate
test_predictions = test_df.apply(lambda row: svd.predict(row['userId'], row['movieId']), axis=1)
# Remove None values and corresponding actual ratings
actual_ratings = test_df['rating'][test_predictions.notna()]
test_predictions = test_predictions.dropna()

print('Test RMSE (M50):', rmse(actual_ratings, test_predictions))
print('Test MAE (M50):', mae(actual_ratings, test_predictions))

Test RMSE (M50): 0.9970642086398543
Test MAE (M50): 0.7348708569627388
CPU times: user 1min 14s, sys: 2.59 s, total: 1min 17s
Wall time: 1min 15s


In [14]:
# Check for NaN values
print("Number of NaN values in each column:")
print(M50_df.isnull().sum())

# If there are NaN values, decide how to handle them
# Here's an example of filling NaNs with the mean of the column
M50_df.fillna(M50_df.mean(), inplace=True)

# Check for infinity values
print("Number of infinity values in each column:")
print(np.isinf(M50_df).sum())


Number of NaN values in each column:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Number of infinity values in each column:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  M50_df.fillna(M50_df.mean(), inplace=True)


In [16]:
test_predictions


219           NaN
245           NaN
294      3.979521
306           NaN
548           NaN
           ...   
98227         NaN
98506         NaN
98707         NaN
99501         NaN
99587         NaN
Length: 610, dtype: float64

In [18]:
print(test_predictions[test_predictions.isna()]) # Print NaN values
print("\n\n")
print(test_predictions[~np.isfinite(test_predictions)]) # Print infinity values


219     NaN
245     NaN
306     NaN
548     NaN
596     NaN
         ..
98227   NaN
98506   NaN
98707   NaN
99501   NaN
99587   NaN
Length: 560, dtype: float64



219     NaN
245     NaN
306     NaN
548     NaN
596     NaN
         ..
98227   NaN
98506   NaN
98707   NaN
99501   NaN
99587   NaN
Length: 560, dtype: float64


In [11]:
class SVD:
    def __init__(self, num_factors, learning_rate, num_epochs):
        # Initializing the instance variables with given arguments
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    def fit(self, user_item_ratings):
        # Initializing the user and movie latent factors matrices with random numbers
        self.user_factors = np.random.randn(user_item_ratings.userId.nunique(), self.num_factors)
        self.movie_factors = np.random.randn(user_item_ratings.movieId.nunique(), self.num_factors)
        
        # Creating dictionaries to map user and movie IDs to their respective indices in the factor matrices
        self.user_index = {user_id: idx for idx, user_id in enumerate(user_item_ratings.userId.unique())}
        self.movie_index = {movie_id: idx for idx, movie_id in enumerate(user_item_ratings.movieId.unique())}

        # Loop over epochs
        for epoch in range(self.num_epochs):
            # Loop over all user-item-rating rows in the DataFrame
            for idx, row in user_item_ratings.iterrows():
                user_id = row['userId']
                movie_id = row['movieId']
                rating = row['rating']


                # Getting the user and movie indices for the current user-item pair
                user_idx = self.user_index[user_id]
                movie_idx = self.movie_index[movie_id]

                # Computing the predicted rating as the dot product of the user and movie factors
                prediction = np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])
                # Computing the error as the difference between the actual and predicted ratings
                error = rating - prediction

                # Updating the user and movie factor vectors in the direction that minimizes the error
                self.user_factors[user_idx] += self.learning_rate * error * self.movie_factors[movie_idx]
                self.movie_factors[movie_idx] += self.learning_rate * error * self.user_factors[user_idx]

    def predict(self, user_id, movie_id):
        # Getting the user and movie indices for the given user-item pair
        user_idx = self.user_index.get(user_id, -1)
        movie_idx = self.movie_index.get(movie_id, -1)

        # If either the user or the movie is not present in the training data, return None
        if user_idx == -1 or movie_idx == -1:
            return None

        # Otherwise, return the predicted rating as the dot product of the user and movie factors
        return np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])

# Creating an instance of the SVD class with desired parameters
num_factors = 10
learning_rate = 0.01
num_epochs = 10

svd = SVD(num_factors, learning_rate, num_epochs)

# Fitting the model to the training data (supposing you have a DataFrame `df`)
# svd.fit(df)

# Predicting the rating for a given user-item pair
# svd.predict(user_id, movie_id)


In [12]:
%%time
# Train the model on the train set
model_SVD = SVD(num_factors=35, learning_rate=0.001, num_epochs=100)
model_SVD.fit(train_df)

# Predict and evaluate on the test set
y_true = []
y_pred = []

for _, (user_id, movie_id, rating) in test_df.iterrows():
    prediction = model_SVD.predict(user_id, movie_id)
    
    if prediction is not None:
        y_true.append(rating)
        y_pred.append(prediction)

rmse_value = rmse(y_true, y_pred)
print(f"Root Mean Squared Error: {rmse_value}")
print(f"Mean Absolute Error: {mae(y_true, y_pred)}")


ValueError: too many values to unpack (expected 3)

In [1]:
# Check for NaN values
print("Number of NaN values in each column:")
print(M50_df.isnull().sum())

# If there are NaN values, decide how to handle them
# Here's an example of filling NaNs with the mean of the column
M50_df.fillna(M50_df.mean(), inplace=True)

# Check for infinity values
print("Number of infinity values in each column:")
print(np.isinf(M50_df).sum())

# If there are infinity values, decide how to handle them
# Here's an example of replacing infinities with a large number
M50_df.replace([np.inf, -np.inf], 1e12, inplace=True)

# If there are still errors, you might need to check for and handle extremely large values


Number of NaN values in each column:


NameError: name 'M50_df' is not defined

In [None]:
%time
# Fit the model to the M50 data
svd.fit(M50_df)
# Predict ratings for the Test set and evaluate
test_predictions = test_df.apply(lambda row: svd.predict(row['userId'], row['movieId']), axis=1)
print('Test RMSE (M50):', rmse(test_df['rating'], test_predictions))


CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 4.05 µs


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').