In [1]:
import numpy as np
import warnings
import pandas as pd

from math import sqrt
from sklearn.model_selection import train_test_split
from collections import defaultdict
from math import sqrt
import openai

warnings.filterwarnings('ignore')



# SVD from scratch using gradient descent

uses the described backpropagation algorithm for gradient descent and singular vectors as described in the [article](https://sifter.org/simon/journal/20070815.html)

1. Extract user-item interactions from the ratings dataframe.
2. Define the SVD model with functions for initializing the user and movie vectors, predicting ratings, and updating the vectors using gradient descent.
3. Train the model on the user-item interactions data.
4. Use the learned vectors to make predictions on new user-movie pairs.

## Data

This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows.

This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.

This and other GroupLens data sets are publicly available for download at <http://grouplens.org/datasets/>.


## Evaluation metric: 
+ RMSE

# Data Overview

In [2]:
# Loads the data
df = pd.read_csv('../data/ml-latest-small/ratings.csv')
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


### --------> OBSERVATIONS:

+ movieId: A unique identifier for the movie.
+ title: The title of the movie, along with its release year in parentheses.
+ genres: The genres associated with the movie, separated by pipe characters (|).

In [3]:
# unique users
print(f'Number of unique users: {df.userId.unique().shape[0]}\n')

# unique movies
print(f'Number of unique movies: {df.movieId.unique().shape[0]}\n')

# unique ratings
print(f'Number of unique ratings: {df.rating.unique().shape[0]}\n')


Number of unique users: 610

Number of unique movies: 9724

Number of unique ratings: 10



# Any infinities in the data with NaN?

In [4]:
# check for missing values and infinities
df.isnull().sum()
df.isnull().values.any()
# check for infinities
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.isnull().values.any()

False

# Splits the data into a training set and a test set using a user-stratified train-test split

In [5]:
def split_data_by_rated_items(df, test_size, given_n):
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=42, stratify=df['userId'])

    # For each user in the test set, keep only 'given_n' rated items if they have rated that many,
    # otherwise keep all the items they have rated.
    test_df = test_df.groupby('userId').apply(lambda x: x.sample(min(len(x), given_n), random_state=42))

    return train_df, test_df.reset_index(drop=True)


def split_data_by_unique_users(df):
    unique_users = df['userId'].unique()
    np.random.shuffle(unique_users)

    # Get the user IDs for each set
    M50_users = unique_users[:50]
    M100_users = unique_users[50:150]
    M400_users = unique_users[150:550]
    test_users = unique_users[550:]

    # Split the DataFrame into the different sets based on the user IDs
    M50_df = df[df['userId'].isin(M50_users)]
    M100_df = df[df['userId'].isin(M100_users)]
    M400_df = df[df['userId'].isin(M400_users)]
    test_df = df[df['userId'].isin(test_users)]

    return M50_df, M100_df, M400_df, test_df


def all_but_one(df):
    # For each user, select one rating and split it into a separate DataFrame
    test_df = df.groupby('userId').sample(n=1, random_state=42)
    train_df = df.drop(test_df.index)
    
    return train_df, test_df

# Call the function
M50_df, M100_df, M400_df, test_df = split_data_by_unique_users(df)

print('M50 set:\n', M50_df)
print('M100 set:\n', M100_df)
print('M400 set:\n', M400_df)
print('Test set:\n', test_df)

# Call the functions
train_df_given_10, test_df_given_10 = split_data_by_rated_items(df, test_size=0.2, given_n=10)  # Modify test_size and given_n as needed
print('Training set:\n', train_df_given_10)
print('Test set:\n', test_df_given_10)

train_df, test_df = all_but_one(df)
print('All-But-One Training set:\n', train_df)
print('All-But-One Test set:\n', test_df)



M50 set:
        userId  movieId  rating   timestamp
2977       20        2     3.0  1054038313
2978       20        8     1.0  1054038422
2979       20       13     4.0  1054038425
2980       20       34     4.0  1054038093
2981       20       48     5.0  1054038357
...       ...      ...     ...         ...
95960     601   170705     5.0  1521397596
95961     601   172591     4.5  1521467819
95962     601   174055     4.0  1521397739
95963     601   176371     4.0  1521397623
95964     601   177765     4.5  1521397621

[8719 rows x 4 columns]
M100 set:
        userId  movieId  rating   timestamp
1569       16       47     3.5  1377477814
1570       16       50     4.0  1377476781
1571       16      111     4.5  1377477446
1572       16      204     2.0  1377476617
1573       16      260     3.0  1377476936
...       ...      ...     ...         ...
99529     609      892     3.0   847221080
99530     609     1056     3.0   847221080
99531     609     1059     3.0   847221054
99532   

In [6]:
train_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


# Evaluation Metrics

+ RMSE
+ MAE



In [7]:
# RMSE (Root Mean Squared Error) from scratch
def rmse(y_true, y_pred):
    """Compute the Root Mean Squared Error (RMSE).
    
    Args:
        y_true (list): Actual values.
        y_pred (list): Predicted values.

    Returns:
        float: RMSE value.
    """
    error = 0
    # Calculate squared difference for each pair of actual and predicted values
    for true, pred in zip(y_true, y_pred):
        error += (true - pred) ** 2
    # Average out the squared differences
    error /= len(y_true)
    # Return the square root of the averaged squared differences
    return sqrt(error)

def mae(y_true, y_pred):
    """Compute the Mean Absolute Error (MAE).
    
    Args:
        y_true (list): Actual values.
        y_pred (list): Predicted values.

    Returns:
        float: MAE value.
    """
    # Calculate absolute difference for each pair and average them
    return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))

def f1_score(precisions, recalls):
    """Compute the F1 score for each user based on provided precision and recall values.
    
    Args:
        precisions (dict): Precision values keyed by user ID.
        recalls (dict): Recall values keyed by user ID.

    Returns:
        dict: F1 scores keyed by user ID.
    """
    f1_scores = dict()
    # Calculate F1 score for each user
    for uid in precisions.keys():
        p, r = precisions[uid], recalls[uid]
        f1_scores[uid] = 2*(p*r) / (p + r) if (p + r) != 0 else 0
    return f1_scores

def precision_at_k(predictions, k=10, threshold=3.5):
    """Compute the precision at top k predictions for each user.
    
    Args:
        predictions (list): List of tuples containing user ID, unknown data, actual rating, and predicted rating.
        k (int, optional): Number of top predictions to consider. Defaults to 10.
        threshold (float, optional): Minimum rating to consider a prediction as relevant. Defaults to 3.5.

    Returns:
        dict: Precision at k values keyed by user ID.
    """
    user_est_true = defaultdict(list)
    # Organize predictions by user ID
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Count predictions above threshold in top k
        n_rel_and_rec_k = sum((true_r >= threshold) for (_, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / k

    return precisions

def recall_at_k(predictions, k=10, threshold=3.5):
    """Compute the recall at top k predictions for each user.
    
    Args:
        predictions (list): List of tuples containing user ID, unknown data, actual rating, and predicted rating.
        k (int, optional): Number of top predictions to consider. Defaults to 10.
        threshold (float, optional): Minimum rating to consider a prediction as relevant. Defaults to 3.5.

    Returns:
        dict: Recall at k values keyed by user ID.
    """
    user_est_true = defaultdict(list)
    # Organize predictions by user ID
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))

    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Count actual relevant items and those in top k
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rel_and_rec_k = sum((true_r >= threshold) for (_, true_r) in user_ratings[:k])
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return recalls

def ndcg_at_k(predictions, k=10):
    """Compute the Normalized Discounted Cumulative Gain at k.
    
    Args:
        predictions (list): List of tuples containing user ID, unknown data, actual rating, and predicted rating.
        k (int, optional): Number of top predictions to consider. Defaults to 10.

    Returns:
        float: Average NDCG value across users.
    """
    user_est_true = defaultdict(list)
    # Organize predictions by user ID
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))

    ndcg_values = []
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Compute DCG and IDCG for user and then compute NDCG
        dcg = sum((rel / np.log2(ind + 2)) for ind, (est, rel) in enumerate(user_ratings[:k]))
        idcg = sum((rel / np.log2(ind + 2)) for ind, (est, rel) in enumerate(sorted(user_ratings, key=lambda x: x[1], reverse=True)[:k]))
        ndcg_values.append(dcg / idcg if idcg > 0.0 else 0.0)

    return np.mean(ndcg_values)

def evaluate(predictions, k=10, threshold=3.5):
    """Evaluate predictions on various metrics: RMSE, MAE, Precision@k, Recall@k, and NDCG.
    
    Args:
        predictions (list): List of tuples containing user ID, unknown data, actual rating, and predicted rating.
        k (int, optional): Number of top predictions to consider for Precision@k, Recall@k, and NDCG. Defaults to 10.
        threshold (float, optional): Minimum rating to consider a prediction as relevant for Precision@k and Recall@k. Defaults to 3.5.

    Returns:
        dict: Dictionary containing RMSE, MAE, Precision@k, Recall@k, and NDCG values.
    """
    # Calculate each metric and return them in a dictionary
    precisions = precision_at_k(predictions, k=k, threshold=threshold)
    recalls = recall_at_k(predictions, k=k, threshold=threshold)
    ndcg = ndcg_at_k(predictions, k=k)
    
    return {
        'RMSE': rmse([true_r for uid, _, true_r, _ in predictions], [est for _, _, true_r, est in predictions]),
        'MAE': mae([true_r for uid, _, true_r, _ in predictions], [est for _, _, true_r, est in predictions]),
        'Precision@k': sum(prec for prec in precisions.values()) / len(precisions),
        'Recall@k': sum(rec for rec in recalls.values()) / len(recalls),
        'NDCG': ndcg
    }


# SVD

+ "cold-start handling"

In [8]:
class SVD:
    def __init__(self, num_factors, learning_rate, num_epochs, top_n=10):
        # Initializing the instance variables with given arguments
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.top_n = top_n  # number of movies to recommend for cold start

    def fit(self, user_item_ratings):
        # Initializing the user and movie latent factors matrices with random numbers
        self.user_factors = np.random.randn(user_item_ratings.userId.nunique(), self.num_factors)
        self.movie_factors = np.random.randn(user_item_ratings.movieId.nunique(), self.num_factors)
        
        # Creating dictionaries to map user and movie IDs to their respective indices in the factor matrices
        self.user_index = {user_id: idx for idx, user_id in enumerate(user_item_ratings.userId.unique())}
        self.movie_index = {movie_id: idx for idx, movie_id in enumerate(user_item_ratings.movieId.unique())}

        # Calculate average rating for each movie
        self.movie_avg_rating = user_item_ratings.groupby('movieId')['rating'].mean().to_dict()

        # Get top-N movies based on average rating for cold start problem
        sorted_movies_by_avg_rating = sorted(self.movie_avg_rating.items(), key=lambda x: x[1], reverse=True)
        self.top_n_movies = [movie_id for movie_id, _ in sorted_movies_by_avg_rating[:self.top_n]]

        # Loop over epochs
        for epoch in range(self.num_epochs):
            # Loop over all user-item-rating rows in the DataFrame
            for idx, row in user_item_ratings.iterrows():
                user_id = row['userId']
                movie_id = row['movieId']
                rating = row['rating']

                # Getting the user and movie indices for the current user-item pair
                user_idx = self.user_index[user_id]
                movie_idx = self.movie_index[movie_id]

                # Computing the predicted rating as the dot product of the user and movie factors
                prediction = np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])
                # Computing the error as the difference between the actual and predicted ratings
                error = rating - prediction

                # Updating the user and movie factor vectors in the direction that minimizes the error
                self.user_factors[user_idx] += self.learning_rate * error * self.movie_factors[movie_idx]
                self.movie_factors[movie_idx] += self.learning_rate * error * self.user_factors[user_idx]

    def predict(self, user_id, movie_id):
        # Getting the user and movie indices for the given user-item pair
        user_idx = self.user_index.get(user_id, -1)
        movie_idx = self.movie_index.get(movie_id, -1)

        # If the user or the movie is not present in the training data, return the movie's average rating
        if user_idx == -1 or movie_idx == -1:
            return self.movie_avg_rating.get(movie_id)

        # Otherwise, return the predicted rating as the dot product of the user and movie factors
        return np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])

    def recommend(self, user_id):
        # If the user is not present in the training data, return top-N movies
        if user_id not in self.user_index:
            return self.top_n_movies

        # Otherwise, predict the rating for each movie and return the top-N movies
        user_ratings = {movie_id: self.predict(user_id, movie_id) for movie_id in self.movie_index.keys()}
        sorted_user_ratings = sorted(user_ratings.items(), key=lambda x: x[1], reverse=True)
        return [movie_id for movie_id, _ in sorted_user_ratings[:self.top_n]]
    

svd = SVD(num_factors=35, learning_rate=0.01, num_epochs=10, top_n=10)


In [9]:
def evaluate_model(df, model):
    import time
    
    start_time = time.time()

    # Fit the model to the data
    model.fit(df)

    # Predict ratings for the Test set and evaluate
    test_predictions = test_df.apply(lambda row: model.predict(row['userId'], row['movieId']), axis=1)
    
    # Remove None values and corresponding actual ratings
    actual_ratings = test_df['rating'][test_predictions.notna()]
    test_predictions = test_predictions.dropna()

    svd_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]
    
    # Compute metrics for the model
    metrics = evaluate(svd_predictions)
    
    end_time = time.time()
    
    print(f"Execution time for: {end_time - start_time} seconds")
    
    return metrics


In [10]:
print(f"Execution time for M50 dataset")
svd_metrics_M50 = evaluate_model(M50_df, svd)
print(f"The evaluation metrics for the SVD model are: {svd_metrics_M50}\n")

print(f"Execution time for M100 dataset")
svd_metrics_M100 = evaluate_model(M100_df, svd)
print(f"The evaluation metrics for the SVD model are: {svd_metrics_M100}\n")

print(f"Execution time for M400 dataset")
svd_metrics_M400 = evaluate_model(M400_df, svd)
print(f"The evaluation metrics for the SVD model are: {svd_metrics_M400}\n")



# Convert the metrics to DataFrame
df_M50 = pd.DataFrame([svd_metrics_M50])
df_M50['Dataset'] = 'M50'

df_M100 = pd.DataFrame([svd_metrics_M100])
df_M100['Dataset'] = 'M100'

df_M400 = pd.DataFrame([svd_metrics_M400])
df_M400['Dataset'] = 'M400'

# Concatenate the DataFrames
metrics_df = pd.concat([df_M50, df_M100, df_M400], ignore_index=True)

# Reorder the columns
cols = metrics_df.columns.tolist()
cols = cols[-1:] + cols[:-1]  # Move the last column to first
metrics_df = metrics_df[cols]

metrics_df


Execution time for M50 dataset
Execution time for: 3.5147593021392822 seconds
The evaluation metrics for the SVD model are: {'RMSE': 1.0772998397808686, 'MAE': 0.8250792022416213, 'Precision@k': 0.06698292220113895, 'Recall@k': 1.0, 'NDCG': 1.0}

Execution time for M100 dataset
Execution time for: 9.22215223312378 seconds
The evaluation metrics for the SVD model are: {'RMSE': 0.9895066834662088, 'MAE': 0.7521875499473722, 'Precision@k': 0.06689655172413841, 'Recall@k': 1.0, 'NDCG': 1.0}

Execution time for M400 dataset
Execution time for: 22.76992893218994 seconds
The evaluation metrics for the SVD model are: {'RMSE': 0.8220405753392208, 'MAE': 0.5768161861180655, 'Precision@k': 0.06639072847682169, 'Recall@k': 1.0, 'NDCG': 1.0}



Unnamed: 0,Dataset,RMSE,MAE,Precision@k,Recall@k,NDCG
0,M50,1.0773,0.825079,0.066983,1.0,1.0
1,M100,0.989507,0.752188,0.066897,1.0,1.0
2,M400,0.822041,0.576816,0.066391,1.0,1.0


# KNN based CF

In [11]:
class KNN_CF:
    def __init__(self, n_users, n_items, k=3, gamma=0, delta=25, epsilon=1e-9):
        self.n_users = n_users
        self.n_items = n_items
        self.k = k
        self.gamma = gamma
        self.delta = delta
        self.epsilon = epsilon
        self.user_corrs = np.zeros((n_users, n_users))
        self.item_corrs = np.zeros((n_items, n_items))

    def fit(self, user_item_matrix):
        # user-based
        for i in range(self.n_users):
            for j in range(self.n_users):
                self.user_corrs[i, j] = self.pearson_corr(user_item_matrix[i], user_item_matrix[j])

        # item-based
        for i in range(self.n_items):
            for j in range(self.n_items):
                self.item_corrs[i, j] = self.pearson_corr(user_item_matrix[:, i], user_item_matrix[:, j])

    def predict(self, user_item_matrix, mode='user'):
        predictions = np.zeros((self.n_users, self.n_items))
        if mode == 'user':
            for i in range(self.n_users):
                for j in range(self.n_items):
                    if user_item_matrix[i, j] > 0:
                        sim_users = np.argsort(self.user_corrs[i])[-(self.k + 1):-1]
                        predictions[i, j] = self.predict_rating(user_item_matrix, sim_users, i, j, mode)
        elif mode == 'item':
            for i in range(self.n_users):
                for j in range(self.n_items):
                    if user_item_matrix[i, j] > 0:
                        sim_items = np.argsort(self.item_corrs[j])[-(self.k + 1):-1]
                        predictions[i, j] = self.predict_rating(user_item_matrix, sim_items, i, j, mode)
        return predictions

    def pearson_corr(self, vec_i, vec_j):
        mask_i = vec_i > 0
        mask_j = vec_j > 0
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            return 0
        mean_i = np.mean(vec_i[corrated_index])
        mean_j = np.mean(vec_j[corrated_index])
        sub_i = vec_i[corrated_index] - mean_i
        sub_j = vec_j[corrated_index] - mean_j
        return np.sum(sub_i * sub_j) / (np.sqrt(np.sum(np.square(sub_i))) * np.sqrt(np.sum(np.square(sub_j))) + self.epsilon)

    def predict_rating(self, user_item_matrix, sim_indices, i, j, mode):
        if mode == 'user':
            sim_ratings = user_item_matrix[sim_indices, j]
            sim_means = np.array([np.mean(user_item_matrix[k][user_item_matrix[k]>0]) for k in sim_indices])
            sim_vals = self.user_corrs[i][sim_indices]
        elif mode == 'item':
            sim_ratings = user_item_matrix[i, sim_indices]
            sim_means = np.array([np.mean(user_item_matrix[:, k][user_item_matrix[:, k]>0]) for k in sim_indices])
            sim_vals = self.item_corrs[j][sim_indices]
        if np.sum(sim_vals) == 0:
            return np.mean(sim_ratings)
        else:
            return np.mean(sim_ratings) + np.sum(sim_vals * (sim_ratings - sim_means)) / np.sum(sim_vals)


In [12]:
def df_to_matrix(df, nrows, ncols):
    matrix = np.zeros((nrows, ncols))
    for row in df.itertuples():
        matrix[row.userId, row.movieId] = row.rating
    return matrix

In [13]:
%%time

# Create mappings for userIds and movieIds to contiguous indices
user_mapping = {user_id: i for i, user_id in enumerate(M100_df['userId'].unique())}
movie_mapping = {movie_id: i for i, movie_id in enumerate(M100_df['movieId'].unique())}

# Create reverse mappings for later use
reverse_user_mapping = {i: user_id for user_id, i in user_mapping.items()}
reverse_movie_mapping = {i: movie_id for movie_id, i in movie_mapping.items()}

# Apply the mappings to the dataframes
M100_df['userId'] = M100_df['userId'].map(user_mapping)
M100_df['movieId'] = M100_df['movieId'].map(movie_mapping)

test_df['userId'] = test_df['userId'].map(user_mapping)
test_df['movieId'] = test_df['movieId'].map(movie_mapping)

# Drop rows with NaN userId or movieId
test_df.dropna(subset=['userId', 'movieId'], inplace=True)

# Convert userId and movieId to integer
test_df['userId'] = test_df['userId'].astype(int)
test_df['movieId'] = test_df['movieId'].astype(int)


n_users = M100_df['userId'].nunique()
n_items = M100_df['movieId'].nunique()

train_matrix = df_to_matrix(M100_df, n_users, n_items)
test_matrix = df_to_matrix(test_df, n_users, n_items)

knn_cf = KNN_CF(n_users, n_items, k=3)

# Fit the model to the M100 data
knn_cf.fit(train_matrix)

# Predict ratings for the Test set and evaluate
user_based_predictions = knn_cf.predict(test_matrix, mode='user')
test_predictions = user_based_predictions[test_matrix.nonzero()]
actual_ratings = test_matrix[test_matrix.nonzero()]

knn_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]
# Compute metrics for the KNN model
knn_metrics_M100 = evaluate(knn_predictions)

# create a dataframe for the results 
knn_results = pd.DataFrame(knn_metrics_M100, index=[0])
# add first column of the dataframe as the dataset name
knn_results.insert(0, 'dataset', 'M100')
knn_results

CPU times: user 23min 19s, sys: 7.33 s, total: 23min 26s
Wall time: 33min 39s


Unnamed: 0,dataset,RMSE,MAE,Precision@k,Recall@k,NDCG
0,M100,7.722212,7.631977,0.069,1.0,1.0


In [14]:
%%time 

# Create mappings for userIds and movieIds to contiguous indices
user_mapping = {user_id: i for i, user_id in enumerate(M50_df['userId'].unique())}
movie_mapping = {movie_id: i for i, movie_id in enumerate(M50_df['movieId'].unique())}

# Create reverse mappings for later use
reverse_user_mapping = {i: user_id for user_id, i in user_mapping.items()}
reverse_movie_mapping = {i: movie_id for movie_id, i in movie_mapping.items()}

# Apply the mappings to the dataframes
M50_df['userId'] = M50_df['userId'].map(user_mapping)
M50_df['movieId'] = M50_df['movieId'].map(movie_mapping)

test_df['userId'] = test_df['userId'].map(user_mapping)
test_df['movieId'] = test_df['movieId'].map(movie_mapping)

# Drop rows with NaN userId or movieId
test_df.dropna(subset=['userId', 'movieId'], inplace=True)

# Convert userId and movieId to integer
test_df['userId'] = test_df['userId'].astype(int)
test_df['movieId'] = test_df['movieId'].astype(int)


n_users = M50_df['userId'].nunique()
n_items = M50_df['movieId'].nunique()

train_matrix = df_to_matrix(M50_df, n_users, n_items)
test_matrix = df_to_matrix(test_df, n_users, n_items)

knn_cf = KNN_CF(n_users, n_items, k=3)

# Fit the model to the M100 data
knn_cf.fit(train_matrix)

# Predict ratings for the Test set and evaluate
user_based_predictions = knn_cf.predict(test_matrix, mode='user')
test_predictions = user_based_predictions[test_matrix.nonzero()]
actual_ratings = test_matrix[test_matrix.nonzero()]

knn_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]
# Compute metrics for the KNN model
knn_results_M50 = evaluate(knn_predictions)

# create a dataframe to concatenate the results
knn_results_M50 = pd.DataFrame(knn_results_M50, index=[0])
# add first column of the dataframe as the dataset name
knn_results_M50.insert(0, 'dataset', 'M50')
knn_results_M50

CPU times: user 8min 19s, sys: 2.95 s, total: 8min 22s
Wall time: 8min 24s


Unnamed: 0,dataset,RMSE,MAE,Precision@k,Recall@k,NDCG
0,M50,,,0.1,1.0,1.0
