Tasks
1. Data Retrieval and Preprocessing
- Obtain MovieLens 1M dataset. []
- Load dataset. []
- Check data integrity: []
- Address issues like missing movies. []
- Handle data inconsistencies (e.g., user IDs with additional data, ratings for non-existent movies). []
- Create User-Item Interaction Matrix. []
- Split data for 5-fold cross-validation. []
- Handling Cold Starts (dealing with users or items not seen during training). []
2. Recommendation Algorithms
- Implement Naive Approaches: []
- Global Average Rating. []
- Average Rating per Item. []
- Average Rating per User. []
- Optimal Linear Combination with and without bias. []
- Implement UV Matrix Decomposition. []
- Implement Matrix Factorization with Gradient Descent and Regularization. []
- For each algorithm, calculate: 
- RMSE (Root Mean Squared Error) and MAE (Mean Absolute Error). []
- Address Cold Starts for the implemented algorithms. []
3. Visualization
- Apply dimensionality reduction techniques for visualization: []
- PCA (Principal Component Analysis). []
- t-SNE (t-Distributed Stochastic Neighbor Embedding). []
- UMAP (Uniform Manifold Approximation and Projection). []
4. Documentation and Reporting 
- Document code, algorithms, and preprocessing steps. []
- Summarize and analyze the results of each algorithm. []
- Provide insights into the best-performing algorithms. []
- Discuss challenges and limitations encountered during the implementation. []

In [3]:
import pandas as pd
import numpy as np
import sklearn as sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# import umap
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.linear_model import LinearRegression

## Data retrival

In [4]:
# Loading data sets

# movies
df_movies = pd.read_csv("ml-1m/movies.dat", sep='::', encoding='ISO-8859-1', header=None, engine='python', names=['MovieID', 'Title', 'Genres'])
df_movies = df_movies.rename({0: 'MovieID', 1: 'Title', 2: 'Genre'}, axis='columns')

#ratings
df_ratings = pd.read_csv("ml-1m/ratings.dat", sep='::', encoding='ISO-8859-1', header=None, engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

#users
df_users = pd.read_csv("ml-1m/users.dat", sep='::', encoding='ISO-8859-1', header=None, engine="python", names=['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode'])
df_users.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode']



In [5]:
# Check for missing values in ratings dataset
df_ratings.isnull().sum()

# Check for missing values in movies dataset
df_movies.isnull().sum()

# Check for missing values in users dataset
df_users.isnull().sum()



UserID        0
Gender        0
Age           0
Occupation    0
ZipCode       0
dtype: int64

In [6]:
# Checking data integrity

# movies
print(df_movies.head())
print

# ratings
print(df_ratings.head())
print

# users

print(df_users.head())

# In movies.dat there is missing movieID of 91 we create a placeholder
new_movie = pd.DataFrame({'MovieID': [91], 'Title': ['Unknown'], 'Genres': ['Unknown']})
df_movies = pd.concat([df_movies, new_movie], ignore_index=True)




   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   UserID Gender  Age  Occupation ZipCode
0       1      F    1          10   48067
1       2      M   56          16   70072
2       3      M   25          15   55117
3       4      M   45           7   02460
4       5      M   25          20   55455


In [7]:
# Users split into 5

num_folds = 5

# Kfold object to split data into
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Convert the DataFrame to a list to ensure consistent indices
user_data = df_users.values
ratings_data = df_ratings.values

# Split the data into 5 folds
for fold, (train_indices, test_indices) in enumerate(kf.split(user_data)):
    # Split the users dataset
    train_users = df_users.iloc[train_indices]
    test_users = df_users.iloc[test_indices]

    # Split the ratings dataset
    train_ratings = df_ratings[df_ratings['UserID'].isin(train_users['UserID'])]
    test_ratings = df_ratings[df_ratings['UserID'].isin(test_users['UserID'])]

    print(f"Fold {fold + 1} - Train Users: {len(train_users)}, Test Users: {len(test_users)}")


Fold 1 - Train Users: 4832, Test Users: 1208
Fold 2 - Train Users: 4832, Test Users: 1208
Fold 3 - Train Users: 4832, Test Users: 1208
Fold 4 - Train Users: 4832, Test Users: 1208
Fold 5 - Train Users: 4832, Test Users: 1208


In [8]:
# Movie split into 5
num_folds = 5

# Kfold object to split data into
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Convert the DataFrame to a list to ensure consistent indices
movie_data = df_movies.values

# Split the data into 5 folds
for fold, (train_indices, test_indices) in enumerate(kf.split(movie_data)):
    # Split the movies dataset
    train_movies = df_movies.iloc[train_indices]
    test_movies = df_movies.iloc[test_indices]

    print(f"Fold {fold + 1} - Train Movies: {len(train_movies)}, Test Movies: {len(test_movies)}")

Fold 1 - Train Movies: 3107, Test Movies: 777
Fold 2 - Train Movies: 3107, Test Movies: 777
Fold 3 - Train Movies: 3107, Test Movies: 777
Fold 4 - Train Movies: 3107, Test Movies: 777
Fold 5 - Train Movies: 3108, Test Movies: 776


In [9]:
# Ratins into 5
num_folds = 5

# Kfold object to split data into
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Convert the DataFrame to a list to ensure consistent indices
ratings_data = df_ratings.values

# Split the data into 5 folds
for fold, (train_indices, test_indices) in enumerate(kf.split(ratings_data)):
    # Split the ratings dataset
    train_ratings = df_ratings.iloc[train_indices]
    test_ratings = df_ratings.iloc[test_indices]

    print(f"Fold {fold + 1} - Train Ratings: {len(train_ratings)}, Test Ratings: {len(test_ratings)}")


Fold 1 - Train Ratings: 800167, Test Ratings: 200042
Fold 2 - Train Ratings: 800167, Test Ratings: 200042
Fold 3 - Train Ratings: 800167, Test Ratings: 200042
Fold 4 - Train Ratings: 800167, Test Ratings: 200042
Fold 5 - Train Ratings: 800168, Test Ratings: 200041


# Recommendations algorithms

## Naive approach

In [10]:
class recommenderSystem():


    def Naive_1(self, train_df, test_df):
        # Naive Approach
        r_item = train_df.groupby('MovieID')['Rating'].mean().reset_index().rename({'Rating':
                                                                        'R_item'},axis='columns')
        
        r_user = train_df.groupby('UserID')['Rating'].mean().reset_index().rename({'Rating':
                                                                        'R_user'},axis='columns')

        train_df=train_df.merge(r_item, on=['MovieID']).merge(r_user, on=['UserID'])

        #Handle instances were we do not have instances in the training set of movies/users in test set
        test_df=test_df.merge(r_item, on=['MovieID']).merge(r_user, on=['UserID'])
        test_only_users = set(test_df['user_id']) - set(train_df['user_id'])
        test_only_movies = set(test_df['movie_id']) - set(train_df['movie_id'])
        global_average_rating = train_df['Rating'].mean()
        for user in test_only_users:
            test_df.loc[test_df['user_id'] == user, 'R_user'] = global_average_rating

        for movie in test_only_movies:
            test_df.loc[test_df['movie_id'] == movie, 'R_item'] = global_average_rating


        X = train_df[['R_item','R_user']]
        y = train_df['Rating']
        model = LinearRegression().fit(X, y)

        alpha, beta = model.coef_
        gamma = model.intercept_

        X_test = test_df[['R_item','R_user']]
        y_test = test_df["Rating"]
        
        # Predict ratings for the test set
        y_pred = model.predict(X_test)

        # Calculate the root mean squared error (RMSE) for the predictions
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Print the coefficients and RMSE
        print(f"Alpha: {alpha}, Beta: {beta}, Gamma: {gamma}")
        print(f"Root Mean Squared Error: {rmse}")
        
        return train_df, test_df

    def matrix_factorization(self, train_df, test_df):
        # The Matrix Factorization

        num_factors=10
        num_iter=75
        reg=0.05
        lr=0.005
        num_users = df_ratings['UserID'].nunique()
        num_movies = df_ratings['MovieID'].nunique()
        
        U = np.random.rand(num_users, num_factors)
        V = np.random.rand(num_movies, num_factors).T
        R = pd.pivot_table(train_df, index='UserID', columns='MovieID', values='Rating').values
        for i in range(num_iter):
            for i in range(num_users):
                for j in range(num_movies):
                    error = R[i][j] -  np.dot(U[i, :], V[:, j])
                    for k in range(num_factors):
                        U[i][k] += lr * (error * V[k][j] - reg*  U[i][k])
                        V[k][j] += lr * (error * U[i][k] - reg*  V[k][j])
            # Compute loss
            total_error = 0
            for i in range(num_users):
                for j in range(num_movies):
                    if R[i][j] > 0:
                        total_error += (R[i][j] - np.dot(U[i, :], V[:, j]))**2

            print(f"Total Training Error epoch {e}: {total_error}")
       # Test the model on the test set and compute the RMSE
            test_matrix = pd.pivot_table(test_df, index='UserID', columns='MovieID', values='Rating').values
            test_error = 0
            count = 0
            
            for i in range(num_users):
                for j in range(num_movies):
                    if not np.isnan(test_matrix[i][j]):
                        pred = np.dot(U[i, :], V[:, j])
                        test_error += (test_matrix[i][j] - pred)**2
                        count += 1
            
            if count > 0:
                test_rmse = sqrt(test_error / count)
                print(f"Iteration {iteration + 1}: Test RMSE = {test_rmse}")
        
    def visualisation_1(self):
        # Apply PCA
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(data)
        
    def visualisation_2(self):
        # Apply t-SNE
        tsne = TSNE(n_components=2, verbose=1)
        tsne_result = tsne.fit_transform(data)
        
    def visualisation_3(self):
        # Apply UMAP
        umap_model = umap.UMAP(n_components=2)
        umap_result = umap_model.fit_transform(data)

    def cross_validation(self,df_ratings, folds, model):
        # prepare cross validation
        # Shuffle DataFrame
        df_ratings = df_ratings.sample(frac=1).reset_index(drop=True)

        # Split DataFrame into folds
        num_rows = len(df_ratings)
        fold_size = num_rows // folds
        splits = []

        for i in range(folds):
            start_index = i * fold_size
            end_index = (i + 1) * fold_size if i < folds - 1 else num_rows
            test_df = df_ratings.iloc[start_index:end_index]
            train_df = pd.concat([df_ratings.iloc[:start_index], df_ratings.iloc[end_index:]])
            if (model=="Naive"):
                train_df, test_df=self.Naive_1(train_df, test_df)
            elif (model=="Matrix"):
                train_df, test_df = self.matrix_factorization(train_df, test_df)
    
    def perf_measures(y_true,y_pred):
        # Calculate RMSE (Root Mean Squared Error)
        rmse = sqrt(mean_squared_error(y_true, y_pred))
        print(f'RMSE: {rmse}')

        # Calculate MAE (Mean Absolute Error)
        mae = mean_absolute_error(y_true, y_pred)
        print(f'MAE: {mae}')
        
    def main():
        train_list,test_list=self.cross_validartion(5);
        
            
        
if __name__ == '__main__':


            # Specify the file path
        file_path = 'ml-1m/ratings.dat'
        df_ratings = pd.read_csv(file_path, sep='::',header=None, engine='python')
        df_ratings = df_ratings.rename({0: 'UserID',
                                        1:'MovieID',
                                        2:'Rating',
                                        3:'Timestamp'},axis='columns')

        print(df_ratings.head())
        # Specify the file path
        file_path = 'ml-1m/users.dat'
        df_users = pd.read_csv(file_path, sep='::',header=None, engine='python')
        df_users = df_users.rename({0: 'UserID',
                                        1:'Gender',
                                        2:'Age',
                                        3:'Occupation',
                                        4: 'Zip-code'
                                        },axis='columns')
        print(df_users.head())
        # Specify the file path
        file_path = 'ml-1m/movies.dat'
        df_movies = pd.read_csv(file_path, sep='::', header=None, encoding='ISO-8859-1', engine='python')
        df_movies = df_movies.rename({0: 'MovieID',
                                        1:'Title',
                                        2:'Genre'},axis='columns')
        

        
        print(df_movies.head())
        rec= recommenderSystem()
        df_filled=rec.Naive_1(df_ratings)
        
        rec.matrix_factorization(df_filled)


   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   MovieID                               Title                         Genre
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


TypeError: recommenderSystem.Naive_1() missing 1 required positional argument: 'test_df'

# The UV matrix decomposition

In [11]:
# Load the ratings data
file_path_ratings = 'ml-1m/ratings.dat'
df_ratings = pd.read_csv(file_path_ratings, sep='::', header=None, engine='python')
df_ratings = df_ratings.rename(columns={0: 'UserID', 1: 'MovieID', 2: 'Rating', 3: 'Timestamp'})

# Load the movies data
file_path_movies = 'ml-1m/movies.dat'
df_movies = pd.read_csv(file_path_movies, sep='::', header=None, engine='python', encoding='latin-1')
df_movies = df_movies.rename(columns={0: 'MovieID', 1: 'Title', 2: 'Genres'})

# Merge the ratings and movies data based on MovieID
df_merged = df_ratings.merge(df_movies, on='MovieID')

# Create the user-item interaction matrix
user_item_matrix = df_merged.pivot(index='UserID', columns='MovieID', values='Rating')

# Optionally, convert the Pandas DataFrame to a NumPy array
user_item_matrix = user_item_matrix.values

# Display the user-item interaction matrix
print("User-Item Interaction Matrix:")
print(user_item_matrix)


User-Item Interaction Matrix:
[[ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [ 3. nan nan ... nan nan nan]]


In [12]:

# Load the ratings data
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')

# Create the user-item interaction matrix
user_item_matrix = ratings.pivot(index='UserID', columns='MovieID', values='Rating')

# Data preparation: Filter out rows/columns with missing values (NaN)
valid_indices = ~np.isnan(user_item_matrix)
user_item_matrix = user_item_matrix[valid_indices]

# Calculate the global average rating
global_average = user_item_matrix.mean().mean()

# Initialize the U and V matrices with the global average divided by the number of latent factors
num_factors = 10  # Number of latent factors
num_users, num_items = user_item_matrix.shape  # Get the number of users and items from the filtered user-item matrix
U = np.full((num_users, num_factors), global_average / num_factors)
V = np.full((num_items, num_factors), global_average / num_factors)

# Display the shapes of U and V
print("Shape of U:", U.shape)
print("Shape of V:", V.shape)


Shape of U: (6040, 10)
Shape of V: (3706, 10)


In [13]:
# Hyperparameters
learning_rate = 0.01  
num_epochs = 1  
regularization = 0.1  # Regularization term

def calculate_rmse(user_item_matrix, U, V):
    predicted_ratings = np.dot(U, V.T)
    observed_ratings = user_item_matrix
    # Calculate the RMSE
    rmse = np.sqrt(np.nanmean((predicted_ratings - observed_ratings) ** 2))
    return rmse

# Training loop
for epoch in range(num_epochs):
    for user in range(num_users):
        for item in range(num_items):
            if not np.isnan(user_item_matrix.iloc[user, item]):
                # Calculate the error (difference between actual and predicted rating)
                error = user_item_matrix.iloc[user, item] - np.dot(U[user, :], V[item, :])
                
                # Update U and V using gradient descent
                U[user, :] += learning_rate * (error * V[item, :] - regularization * U[user, :])
                V[item, :] += learning_rate * (error * U[user, :] - regularization * V[item, :])
    
    # Calculate RMSE at the end of each epoch
    rmse = calculate_rmse(user_item_matrix, U, V)
    print(f"Epoch {epoch + 1}/{num_epochs}, RMSE: {rmse:.4f}")

# Display the shapes of U and V after training 
print("Shape of U after training:", U.shape)
print("Shape of V after training:", V.shape)


Epoch 1/1, RMSE: 1.4150
Shape of U after training: (6040, 10)
Shape of V after training: (3706, 10)


In [14]:
# Initialize an empty list to store RMSE values for each epoch
rmse_values = []

# Training loop
for epoch in range(num_epochs):
    for user in range(num_users):
        for item in range(num_items):
            if not np.isnan(user_item_matrix.iloc[user, item]):
                # Calculate the error (difference between actual and predicted rating)
                error = user_item_matrix.iloc[user, item] - np.dot(U[user, :], V[item, :])
                
                # Update U and V using gradient descent
                U[user, :] += learning_rate * (error * V[item, :] - regularization * U[user, :])
                V[item, :] += learning_rate * (error * U[user, :] - regularization * V[item, :])
    
    # Calculate RMSE at the end of each epoch
    predicted_ratings = np.dot(U, V.T)
    rmse = np.sqrt(np.nanmean((user_item_matrix - predicted_ratings) ** 2))
    rmse_values.append(rmse)
    
    print(f"Epoch {epoch + 1}/{num_epochs}, RMSE: {rmse:.4f}")


Epoch 1/1, RMSE: 0.9315


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold


# Specify the number of splits (K) for cross-validation
K = 5

# Initialize KFold with the number of splits (K)
kf = KFold(n_splits=K)

# Convert user_item_matrix to a NumPy array (if not already)
user_item_matrix = user_item_matrix.values

# Iterate over the folds
for train_indices, val_indices in kf.split(user_item_matrix):
    train_indices = np.array(train_indices)  # Convert indices to NumPy array
    val_indices = np.array(val_indices)  # Convert indices to NumPy array

    # Select valid rows from the user_item_matrix using the indices
    train_set = user_item_matrix[train_indices, :]
    val_set = user_item_matrix[val_indices, :]

    # Continue with training and evaluation steps for each fold
    # You can perform UV matrix decomposition, train the model, and evaluate it on the validation set
    # ...

    # After each fold, you can store the evaluation results (e.g., RMSE or MAE) for analysis


In [16]:

# Load the ratings data
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')

# Create the user-item interaction matrix
user_item_matrix = ratings.pivot(index='UserID', columns='MovieID', values='Rating')
user_item_matrix = user_item_matrix.values

# Data preparation: Filter out rows/columns with missing values (NaN)
valid_indices = ~np.isnan(user_item_matrix)
user_item_matrix = user_item_matrix[valid_indices]

# Define the number of latent factors
num_factors = 10

# Create a KFold cross-validator with 5 folds
kf = KFold(n_splits=5)

# Initialize a list to store RMSE values for each fold
rmse_scores = []

# Loop over the folds
for train_indices, val_indices in kf.split(user_item_matrix):
    # Select valid rows from the user_item_matrix using the indices
    train_set = user_item_matrix[train_indices, :]
    val_set = user_item_matrix[val_indices, :]

    # Initialize U and V matrices with random values
    num_users, num_items = train_set.shape
    U = np.random.rand(num_users, num_factors)
    V = np.random.rand(num_items, num_factors)

    # Set hyperparameters for training
    learning_rate = 0.01
    num_epochs = 20

    # Training loop
    for epoch in range(num_epochs):
        for user in range(num_users):
            for item in range(num_items):
                if not np.isnan(train_set[user, item]):
                    error = train_set[user, item] - np.dot(U[user, :], V[item, :])
                    U[user, :] += learning_rate * (error * V[item, :])
                    V[item, :] += learning_rate * (error * U[user, :])

    # Predict on the validation set
    val_indices = np.where(~np.isnan(val_set))
    val_predictions = np.dot(U[train_indices, :], V.T[:, val_indices[1]])

    # Calculate RMSE for the current fold
    rmse = np.sqrt(mean_squared_error(val_set[val_indices], val_predictions))
    rmse_scores.append(rmse)

# Calculate the average RMSE over all folds
avg_rmse = np.mean(rmse_scores)

# Print the average RMSE
print("Average RMSE:", avg_rmse)


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed