In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# Data Overview

In [2]:
# Loads the data
df = pd.read_csv('../data/ml-latest-small/ratings.csv')
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


### --------> OBSERVATIONS:

+ movieId: A unique identifier for the movie.
+ title: The title of the movie, along with its release year in parentheses.
+ genres: The genres associated with the movie, separated by pipe characters (|).

In [3]:
# unique users
print(f'Number of unique users: {df.userId.unique().shape[0]}\n')

# unique movies
print(f'Number of unique movies: {df.movieId.unique().shape[0]}\n')

# unique ratings
print(f'Number of unique ratings: {df.rating.unique().shape[0]}\n')


Number of unique users: 610

Number of unique movies: 9724

Number of unique ratings: 10



# Replaces any infinities in the data with NaN

In [4]:
# check for missing values and infinities
df.isnull().sum()
df.isnull().values.any()
# check for infinities
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.isnull().values.any()

False

# Splits the data into a training set and a test set using a user-stratified train-test split

In [5]:
def split_data_by_rated_items(df, test_size, given_n):
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=42, stratify=df['userId'])

    # For each user in the test set, keep only 'given_n' rated items if they have rated that many,
    # otherwise keep all the items they have rated.
    test_df = test_df.groupby('userId').apply(lambda x: x.sample(min(len(x), given_n), random_state=42))

    return train_df, test_df.reset_index(drop=True)


def split_data_by_unique_users(df):
    unique_users = df['userId'].unique()
    np.random.shuffle(unique_users)

    # Get the user IDs for each set
    M50_users = unique_users[:50]
    M100_users = unique_users[50:150]
    M400_users = unique_users[150:550]
    test_users = unique_users[550:]

    # Split the DataFrame into the different sets based on the user IDs
    M50_df = df[df['userId'].isin(M50_users)]
    M100_df = df[df['userId'].isin(M100_users)]
    M400_df = df[df['userId'].isin(M400_users)]
    test_df = df[df['userId'].isin(test_users)]

    return M50_df, M100_df, M400_df, test_df


def all_but_one(df):
    # For each user, select one rating and split it into a separate DataFrame
    test_df = df.groupby('userId').sample(n=1, random_state=42)
    train_df = df.drop(test_df.index)
    
    return train_df, test_df

# Call the function
M50_df, M100_df, M400_df, test_df = split_data_by_unique_users(df)

print('M50 set:\n', M50_df)
print('M100 set:\n', M100_df)
print('M400 set:\n', M400_df)
print('Test set:\n', test_df)

# Call the functions
train_df_given_10, test_df_given_10 = split_data_by_rated_items(df, test_size=0.2, given_n=10)  # Modify test_size and given_n as needed
print('Training set:\n', train_df_given_10)
print('Test set:\n', test_df_given_10)

train_df, test_df = all_but_one(df)
print('All-But-One Training set:\n', train_df)
print('All-But-One Test set:\n', test_df)



M50 set:
        userId  movieId  rating  timestamp
5377       38       11     5.0  841341447
5378       38       17     3.0  841341494
5379       38       21     3.0  841341362
5380       38       39     3.0  841341384
5381       38       48     3.0  841341570
...       ...      ...     ...        ...
97138     604      636     3.0  832080690
97139     604      637     4.0  832081130
97140     604      708     3.0  832080461
97141     604      724     3.0  832080735
97142     604      742     4.0  832080636

[5330 rows x 4 columns]
M100 set:
        userId  movieId  rating   timestamp
0           1        1     4.0   964982703
1           1        3     4.0   964981247
2           1        6     4.0   964982224
3           1       47     5.0   964983815
4           1       50     5.0   964982931
...       ...      ...     ...         ...
91723     594     8727     2.0  1108950862
91724     594     8778     4.0  1108951196
91725     594     8866     4.5  1108975669
91726     594     88

In [6]:
train_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


# Evaluation Metrics

+ RMSE
+ MAE

In [8]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt

# rmse: root mean squared error
def rmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

# mae: mean absolute error
def mae(y_true, y_pred):
    return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))

# f1 score: harmonic mean of precision and recall
def f1_score(precisions, recalls):
    f1_scores = dict()
    for uid in precisions.keys():
        p, r = precisions[uid], recalls[uid]
        f1_scores[uid] = 2*(p*r) / (p + r) if (p + r) != 0 else 0
    return f1_scores

# precision and recall at k
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

# ndcg at k: normalized discounted cumulative gain
def ndcg_at_k(predictions, k=10):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))

    ndcg_values = []
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        dcg = sum((rel / np.log2(ind + 2)) for ind, (est, rel) in enumerate(user_ratings[:k]))
        idcg = sum((rel / np.log2(ind + 2)) for ind, (est, rel) in enumerate(sorted(user_ratings, key=lambda x: x[1], reverse=True)[:k]))
        ndcg_values.append(dcg / idcg if idcg > 0.0 else 0.0)

    return np.mean(ndcg_values)


# evaluate the model recording RMSE, MAE, precision and recall at k, F1 score, and NDCG at k metrics in a single dictionary
def evaluate(predictions, k=10, threshold=3.5):
    precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=threshold)
    f1_scores = f1_score(precisions, recalls)
    ndcg = ndcg_at_k(predictions, k=k)
    return {
        'RMSE': rmse([true_r for uid, iid, true_r, est in predictions], [est for uid, iid, true_r, est in predictions]),
        'MAE': mae([true_r for uid, iid, true_r, est in predictions], [est for uid, iid, true_r, est in predictions]),
        'Precision@k': sum(prec for prec in precisions.values()) / len(precisions),
        'Recall@k': sum(rec for rec in recalls.values()) / len(recalls),
        'F1 score': sum(f1 for f1 in f1_scores.values()) / len(f1_scores),
        'NDCG': ndcg
    }

# SVD

+ "cold-start handling"

In [9]:
class SVD:
    def __init__(self, num_factors, learning_rate, num_epochs, top_n=10):
        # Initializing the instance variables with given arguments
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.top_n = top_n  # number of movies to recommend for cold start

    def fit(self, user_item_ratings):
        # Initializing the user and movie latent factors matrices with random numbers
        self.user_factors = np.random.randn(user_item_ratings.userId.nunique(), self.num_factors)
        self.movie_factors = np.random.randn(user_item_ratings.movieId.nunique(), self.num_factors)
        
        # Creating dictionaries to map user and movie IDs to their respective indices in the factor matrices
        self.user_index = {user_id: idx for idx, user_id in enumerate(user_item_ratings.userId.unique())}
        self.movie_index = {movie_id: idx for idx, movie_id in enumerate(user_item_ratings.movieId.unique())}

        # Calculate average rating for each movie
        self.movie_avg_rating = user_item_ratings.groupby('movieId')['rating'].mean().to_dict()

        # Get top-N movies based on average rating for cold start problem
        sorted_movies_by_avg_rating = sorted(self.movie_avg_rating.items(), key=lambda x: x[1], reverse=True)
        self.top_n_movies = [movie_id for movie_id, _ in sorted_movies_by_avg_rating[:self.top_n]]

        # Loop over epochs
        for epoch in range(self.num_epochs):
            # Loop over all user-item-rating rows in the DataFrame
            for idx, row in user_item_ratings.iterrows():
                user_id = row['userId']
                movie_id = row['movieId']
                rating = row['rating']

                # Getting the user and movie indices for the current user-item pair
                user_idx = self.user_index[user_id]
                movie_idx = self.movie_index[movie_id]

                # Computing the predicted rating as the dot product of the user and movie factors
                prediction = np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])
                # Computing the error as the difference between the actual and predicted ratings
                error = rating - prediction

                # Updating the user and movie factor vectors in the direction that minimizes the error
                self.user_factors[user_idx] += self.learning_rate * error * self.movie_factors[movie_idx]
                self.movie_factors[movie_idx] += self.learning_rate * error * self.user_factors[user_idx]

    def predict(self, user_id, movie_id):
        # Getting the user and movie indices for the given user-item pair
        user_idx = self.user_index.get(user_id, -1)
        movie_idx = self.movie_index.get(movie_id, -1)

        # If the user or the movie is not present in the training data, return the movie's average rating
        if user_idx == -1 or movie_idx == -1:
            return self.movie_avg_rating.get(movie_id)

        # Otherwise, return the predicted rating as the dot product of the user and movie factors
        return np.dot(self.user_factors[user_idx], self.movie_factors[movie_idx])

    def recommend(self, user_id):
        # If the user is not present in the training data, return top-N movies
        if user_id not in self.user_index:
            return self.top_n_movies

        # Otherwise, predict the rating for each movie and return the top-N movies
        user_ratings = {movie_id: self.predict(user_id, movie_id) for movie_id in self.movie_index.keys()}
        sorted_user_ratings = sorted(user_ratings.items(), key=lambda x: x[1], reverse=True)
        return [movie_id for movie_id, _ in sorted_user_ratings[:self.top_n]]
    

svd = SVD(num_factors=35, learning_rate=0.01, num_epochs=10, top_n=10)


In [10]:
def evaluate_model(df, model):
    import time
    
    start_time = time.time()

    # Fit the model to the data
    model.fit(df)

    # Predict ratings for the Test set and evaluate
    test_predictions = test_df.apply(lambda row: model.predict(row['userId'], row['movieId']), axis=1)
    
    # Remove None values and corresponding actual ratings
    actual_ratings = test_df['rating'][test_predictions.notna()]
    test_predictions = test_predictions.dropna()

    svd_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]
    
    # Compute metrics for the model
    metrics = evaluate(svd_predictions)
    
    end_time = time.time()
    
    print(f"Execution time for: {end_time - start_time} seconds")
    
    return metrics


In [11]:
print(f"Execution time for M50 dataset")
svd_metrics_M50 = evaluate_model(M50_df, svd)
print(f"The evaluation metrics for the SVD model are: {svd_metrics_M50}\n")

print(f"Execution time for M100 dataset")
svd_metrics_M100 = evaluate_model(M100_df, svd)
print(f"The evaluation metrics for the SVD model are: {svd_metrics_M100}\n")

print(f"Execution time for M400 dataset")
svd_metrics_M400 = evaluate_model(M400_df, svd)
print(f"The evaluation metrics for the SVD model are: {svd_metrics_M400}\n")



# Convert the metrics to DataFrame
df_M50 = pd.DataFrame([svd_metrics_M50])
df_M50['Dataset'] = 'M50'

df_M100 = pd.DataFrame([svd_metrics_M100])
df_M100['Dataset'] = 'M100'

df_M400 = pd.DataFrame([svd_metrics_M400])
df_M400['Dataset'] = 'M400'

# Concatenate the DataFrames
metrics_df = pd.concat([df_M50, df_M100, df_M400], ignore_index=True)

# Reorder the columns
cols = metrics_df.columns.tolist()
cols = cols[-1:] + cols[:-1]  # Move the last column to first
metrics_df = metrics_df[cols]

metrics_df


Execution time for M50 dataset
Execution time for: 2.2075259685516357 seconds
The evaluation metrics for the SVD model are: {'RMSE': 1.041359842194938, 'MAE': 0.7931285610571793, 'Precision@k': 0.8024193548387096, 'Recall@k': 0.842741935483871, 'F1 score': 0.6451612903225806, 'NDCG': 1.0}

Execution time for M100 dataset
Execution time for: 5.991877794265747 seconds
The evaluation metrics for the SVD model are: {'RMSE': 1.0235664676943952, 'MAE': 0.7638649666840859, 'Precision@k': 0.8512544802867383, 'Recall@k': 0.8243727598566308, 'F1 score': 0.6756272401433692, 'NDCG': 1.0}

Execution time for M400 dataset
Execution time for: 28.712896823883057 seconds
The evaluation metrics for the SVD model are: {'RMSE': 0.8189564361794365, 'MAE': 0.5815151122352734, 'Precision@k': 0.9139072847682119, 'Recall@k': 0.8609271523178808, 'F1 score': 0.7748344370860927, 'NDCG': 1.0}



Unnamed: 0,Dataset,RMSE,MAE,Precision@k,Recall@k,F1 score,NDCG
0,M50,1.04136,0.793129,0.802419,0.842742,0.645161,1.0
1,M100,1.023566,0.763865,0.851254,0.824373,0.675627,1.0
2,M400,0.818956,0.581515,0.913907,0.860927,0.774834,1.0


# KNN based CF

In [12]:
class KNN_CF:
    def __init__(self, n_users, n_items, k=3, gamma=0, delta=25, epsilon=1e-9):
        self.n_users = n_users
        self.n_items = n_items
        self.k = k
        self.gamma = gamma
        self.delta = delta
        self.epsilon = epsilon
        self.user_corrs = np.zeros((n_users, n_users))
        self.item_corrs = np.zeros((n_items, n_items))

    def fit(self, user_item_matrix):
        # user-based
        for i in range(self.n_users):
            for j in range(self.n_users):
                self.user_corrs[i, j] = self.pearson_corr(user_item_matrix[i], user_item_matrix[j])

        # item-based
        for i in range(self.n_items):
            for j in range(self.n_items):
                self.item_corrs[i, j] = self.pearson_corr(user_item_matrix[:, i], user_item_matrix[:, j])

    def predict(self, user_item_matrix, mode='user'):
        predictions = np.zeros((self.n_users, self.n_items))
        if mode == 'user':
            for i in range(self.n_users):
                for j in range(self.n_items):
                    if user_item_matrix[i, j] > 0:
                        sim_users = np.argsort(self.user_corrs[i])[-(self.k + 1):-1]
                        predictions[i, j] = self.predict_rating(user_item_matrix, sim_users, i, j, mode)
        elif mode == 'item':
            for i in range(self.n_users):
                for j in range(self.n_items):
                    if user_item_matrix[i, j] > 0:
                        sim_items = np.argsort(self.item_corrs[j])[-(self.k + 1):-1]
                        predictions[i, j] = self.predict_rating(user_item_matrix, sim_items, i, j, mode)
        return predictions

    def pearson_corr(self, vec_i, vec_j):
        mask_i = vec_i > 0
        mask_j = vec_j > 0
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            return 0
        mean_i = np.mean(vec_i[corrated_index])
        mean_j = np.mean(vec_j[corrated_index])
        sub_i = vec_i[corrated_index] - mean_i
        sub_j = vec_j[corrated_index] - mean_j
        return np.sum(sub_i * sub_j) / (np.sqrt(np.sum(np.square(sub_i))) * np.sqrt(np.sum(np.square(sub_j))) + self.epsilon)

    def predict_rating(self, user_item_matrix, sim_indices, i, j, mode):
        if mode == 'user':
            sim_ratings = user_item_matrix[sim_indices, j]
            sim_means = np.array([np.mean(user_item_matrix[k][user_item_matrix[k]>0]) for k in sim_indices])
            sim_vals = self.user_corrs[i][sim_indices]
        elif mode == 'item':
            sim_ratings = user_item_matrix[i, sim_indices]
            sim_means = np.array([np.mean(user_item_matrix[:, k][user_item_matrix[:, k]>0]) for k in sim_indices])
            sim_vals = self.item_corrs[j][sim_indices]
        if np.sum(sim_vals) == 0:
            return np.mean(sim_ratings)
        else:
            return np.mean(sim_ratings) + np.sum(sim_vals * (sim_ratings - sim_means)) / np.sum(sim_vals)


In [13]:
def df_to_matrix(df, nrows, ncols):
    matrix = np.zeros((nrows, ncols))
    for row in df.itertuples():
        matrix[row.userId, row.movieId] = row.rating
    return matrix

# Create mappings for userIds and movieIds to contiguous indices
user_mapping = {user_id: i for i, user_id in enumerate(M100_df['userId'].unique())}
movie_mapping = {movie_id: i for i, movie_id in enumerate(M100_df['movieId'].unique())}

# Create reverse mappings for later use
reverse_user_mapping = {i: user_id for user_id, i in user_mapping.items()}
reverse_movie_mapping = {i: movie_id for movie_id, i in movie_mapping.items()}

# Apply the mappings to the dataframes
M100_df['userId'] = M100_df['userId'].map(user_mapping)
M100_df['movieId'] = M100_df['movieId'].map(movie_mapping)

test_df['userId'] = test_df['userId'].map(user_mapping)
test_df['movieId'] = test_df['movieId'].map(movie_mapping)

# Drop rows with NaN userId or movieId
test_df.dropna(subset=['userId', 'movieId'], inplace=True)

# Convert userId and movieId to integer
test_df['userId'] = test_df['userId'].astype(int)
test_df['movieId'] = test_df['movieId'].astype(int)


n_users = M100_df['userId'].nunique()
n_items = M100_df['movieId'].nunique()

train_matrix = df_to_matrix(M100_df, n_users, n_items)
test_matrix = df_to_matrix(test_df, n_users, n_items)

knn_cf = KNN_CF(n_users, n_items, k=3)

# Fit the model to the M100 data
knn_cf.fit(train_matrix)

# Predict ratings for the Test set and evaluate
user_based_predictions = knn_cf.predict(test_matrix, mode='user')
test_predictions = user_based_predictions[test_matrix.nonzero()]
actual_ratings = test_matrix[test_matrix.nonzero()]

# print('Test RMSE (M50):', sqrt(mean_squared_error(actual_ratings, test_predictions)))
# print('Test MAE (M50):', mean_absolute_error(actual_ratings, test_predictions))

knn_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]
# Compute metrics for the KNN model
knn_metrics_M100 = evaluate(knn_predictions)
knn_metrics_M100


knn_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]
# Compute metrics for the KNN model
knn_metrics = evaluate(knn_predictions)
knn_metrics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  M100_df['userId'] = M100_df['userId'].map(user_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  M100_df['movieId'] = M100_df['movieId'].map(movie_mapping)


NameError: name 'knn_metrics' is not defined

In [14]:
 knn_metrics_

{'RMSE': 6.897463180889444,
 'MAE': 6.7631290634978765,
 'Precision@k': 1.0,
 'Recall@k': 0.4,
 'F1 score': 0.4,
 'NDCG': 1.0}

In [15]:
knn_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]
# Compute metrics for the KNN model
knn_metrics = evaluate(knn_predictions)
knn_metrics

{'RMSE': 6.897463180889444,
 'MAE': 6.7631290634978765,
 'Precision@k': 1.0,
 'Recall@k': 0.4,
 'F1 score': 0.4,
 'NDCG': 1.0}

In [23]:
%%time

def process_dataframe(df):
    df = df.copy()  # Add this line

    # Create mappings for userIds and movieIds to contiguous indices
    user_mapping = {user_id: i for i, user_id in enumerate(df['userId'].unique())}
    movie_mapping = {movie_id: i for i, movie_id in enumerate(df['movieId'].unique())}

    # Apply the mappings to the dataframe
    df.loc[:, 'userId'] = df['userId'].map(user_mapping)
    df.loc[:, 'movieId'] = df['movieId'].map(movie_mapping)

    # Drop rows with NaN userId or movieId
    df.dropna(subset=['userId', 'movieId'], inplace=True)

    # Convert userId and movieId to integer
    df.loc[:, 'userId'] = df['userId'].astype(int)
    df.loc[:, 'movieId'] = df['movieId'].astype(int)
    
    return df, user_mapping, movie_mapping



def train_knn(df, test_df, k=3):
    n_users = df['userId'].nunique()
    n_items = df['movieId'].nunique()

    train_matrix = df_to_matrix(df, n_users, n_items)
    test_matrix = df_to_matrix(test_df, n_users, n_items)

    knn_cf = KNN_CF(n_users, n_items, k)

    # Fit the model to the data
    knn_cf.fit(train_matrix)

    # Predict ratings for the Test set and evaluate
    user_based_predictions = knn_cf.predict(test_matrix, mode='user')
    test_predictions = user_based_predictions[test_matrix.nonzero()]
    actual_ratings = test_matrix[test_matrix.nonzero()]
    
    knn_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], actual_ratings, test_predictions)]

    # Compute metrics for the KNN model
    knn_metrics = evaluate(knn_predictions)

    return knn_metrics

# Process and train on M100_df
M100_df, user_mapping, movie_mapping = process_dataframe(M100_df)
test_df['userId'] = test_df['userId'].map(user_mapping)
test_df['movieId'] = test_df['movieId'].map(movie_mapping)
knn_metrics_M100 = train_knn(M100_df, test_df)

# Process and train on M50_df
M50_df, user_mapping, movie_mapping = process_dataframe(M50_df)
test_df['userId'] = test_df['userId'].map(user_mapping)
test_df['movieId'] = test_df['movieId'].map(movie_mapping)
knn_metrics_M50 = train_knn(M50_df, test_df)

# Process and train on M400_df
M400_df, user_mapping, movie_mapping = process_dataframe(M400_df)
test_df['userId'] = test_df['userId'].map(user_mapping)
test_df['movieId'] = test_df['movieId'].map(movie_mapping)
knn_metrics_M400 = train_knn(M400_df, test_df)

print("KNN Metrics for M100: ", knn_metrics_M100)
print("KNN Metrics for M50: ", knn_metrics_M50)
print("KNN Metrics for M400: ", knn_metrics_M400)


In [None]:
%%time
def evaluate_model(df, model, dataset_name):
    
    # Copy df and apply processing
    df, user_mapping, movie_mapping = process_dataframe(df)
    
    # Apply mappings to test_df
    test_df_copy = test_df.copy()
    test_df_copy['userId'] = test_df_copy['userId'].map(user_mapping)
    test_df_copy['movieId'] = test_df_copy['movieId'].map(movie_mapping)
    
    # Train and evaluate model
    model_metrics = train_knn(df, test_df_copy)
    
    # Add execution time to metrics
    exec_time = time.time() - start_time
    model_metrics['Execution time'] = exec_time
    
    # Add dataset name to metrics
    model_metrics['Dataset'] = dataset_name
    
    # Convert metrics to DataFrame
    model_metrics_df = pd.DataFrame([model_metrics])
    
    return model_metrics_df

# Evaluate models for each dataset
metrics_M50 = evaluate_model(M50_df, KNN_CF(n_users=M50_df['userId'].nunique(), n_items=M50_df['movieId'].nunique(), k=3), 'M50')
metrics_M100 = evaluate_model(M100_df, KNN_CF(n_users=M100_df['userId'].nunique(), n_items=M100_df['movieId'].nunique(), k=3), 'M100')
metrics_M400 = evaluate_model(M400_df, KNN_CF(n_users=M400_df['userId'].nunique(), n_items=M400_df['movieId'].nunique(), k=3), 'M400')

# Concatenate metrics DataFrames
metrics_df = pd.concat([metrics_M50, metrics_M100, metrics_M400], ignore_index=True)

# Reorder the columns
cols = metrics_df.columns.tolist()
cols = cols[-2:] + cols[:-2]  # Move the last two columns to first
metrics_df = metrics_df[cols]

metrics_df


NameError: name 'time' is not defined

In [None]:
def train_knn(df, test_df, k=3):
    # Start time
    start_time = time.time()
    n_users = df['userId'].nunique()
    n_items = df['movieId'].nunique()

    train_matrix = df_to_matrix(df, n_users, n_items)
    test_matrix = df_to_matrix(test_df, n_users, n_items)

    knn_cf = KNN_CF(n_users, n_items, k)

    # Fit the model to the data
    knn_cf.fit(train_matrix)

    # Predict ratings for the Test set and evaluate
    user_based_predictions = knn_cf.predict(test_matrix, mode='user')
    test_predictions = user_based_predictions[test_matrix.nonzero()]
    actual_ratings = test_matrix[test_matrix.nonzero()]

    # Remove NaN and inf values
    clean_predictions = test_predictions[~np.isnan(test_predictions) & ~np.isinf(test_predictions)]
    clean_actual_ratings = actual_ratings[~np.isnan(actual_ratings) & ~np.isinf(actual_ratings)]
    
    knn_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est in zip(test_df['userId'], test_df['movieId'], clean_actual_ratings, clean_predictions)]

    # Compute metrics for the KNN model
    knn_metrics = evaluate(knn_predictions)

    return knn_metrics


def evaluate(predictions, k=10, threshold=3.5):
    precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=threshold)
    f1_scores = f1_score(precisions, recalls)
    ndcg = ndcg_at_k(predictions, k=k)

    y_true = [true_r for uid, iid, true_r, est in predictions]
    y_pred = [est for uid, iid, true_r, est in predictions]

    # Remove NaN and inf values
    y_true_clean = np.array(y_true)[~np.isnan(y_true) & ~np.isinf(y_true)]
    y_pred_clean = np.array(y_pred)[~np.isnan(y_pred) & ~np.isinf(y_pred)]
    
    return {
        'RMSE': rmse(y_true_clean, y_pred_clean),
        'MAE': mae(y_true_clean, y_pred_clean),
        'Precision@k': sum(prec for prec in precisions.values()) / len(precisions),
        'Recall@k': sum(rec for rec in recalls.values()) / len(recalls),
        'F1 score': sum(f1 for f1 in f1_scores.values()) / len(f1_scores),
        'NDCG': ndcg
    }


In [None]:
def map_and_dropna(df, user_mapping, movie_mapping):
    df = df.copy()
    df['userId'] = df['userId'].map(user_mapping)
    df['movieId'] = df['movieId'].map(movie_mapping)
    df.dropna(subset=['userId', 'movieId'], inplace=True)
    
    # Convert to integer type if they are not NaN
    df.loc[df['userId'].notna(), 'userId'] = df.loc[df['userId'].notna(), 'userId'].astype(int)
    df.loc[df['movieId'].notna(), 'movieId'] = df.loc[df['movieId'].notna(), 'movieId'].astype(int)

    return df



# Process and train on M100_df
M100_df, user_mapping, movie_mapping = process_dataframe(M100_df)
test_df_M100 = map_and_dropna(test_df, user_mapping, movie_mapping)
knn_metrics_M100 = train_knn(M100_df, test_df_M100)

# Process and train on M50_df
M50_df, user_mapping, movie_mapping = process_dataframe(M50_df)
test_df_M50 = map_and_dropna(test_df, user_mapping, movie_mapping)
knn_metrics_M50 = train_knn(M50_df, test_df_M50)

# Process and train on M400_df
M400_df, user_mapping, movie_mapping = process_dataframe(M400_df)
test_df_M400 = map_and_dropna(test_df, user_mapping, movie_mapping)
knn_metrics_M400 = train_knn(M400_df, test_df_M400)


NameError: name 'time' is not defined