In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import time
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import minmax_scale

In [2]:
# Read the three cleaned CSV files (ratings, movies, users) into DataFrames.
# The paths are relative to the notebook's working directory.
ratings_df, movies_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "cleaned_data/rangering.csv",
        "cleaned_data/film.csv",
        "cleaned_data/bruker.csv",
    )
)

In [3]:
# Separate the predictors (X) from the target (y): the "Rangering" column holds
# the movie ratings that the models are asked to predict.
X = ratings_df.drop("Rangering", axis=1)
y = ratings_df.Rangering.values

# 70 % of the rows go to training; the remaining 30 % is then halved into a
# validation and a test set (15 % each). Fixed seeds keep the split reproducible.
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_and_test, y_val_and_test, test_size=0.5, random_state=101
)

# Re-attach the training targets to the training features so the complete
# training set is available as a single frame.
train_df = X_train.assign(Rangering=y_train)

train_df

Unnamed: 0,BrukerID,FilmID,Tidstempel,Rangering
708938,4330,631,997066167,3
371257,1403,3553,965458942,3
845603,5564,1805,965592338,4
667558,3965,2891,974853586,4
821518,5319,1872,968197585,5
...,...,...,...,...
661055,3908,3611,970460079,3
204614,6034,2406,958842165,4
476497,2330,3907,965348341,4
214539,87,3783,974700883,4


In [4]:
# Build the user x movie rating matrix from the training data:
#   1. pivot to one row per user (BrukerID) and one column per movie (FilmID),
#   2. subtract each user's own mean rating from that user's row,
#   3. fill the unrated cells (NaN) with 0.0, the "neutral" value after centring.
user_matrix = (
    train_df
    .pivot(index='BrukerID', columns='FilmID', values='Rangering')
    .pipe(lambda wide: wide.sub(wide.mean(axis=1), axis=0))
    .fillna(0.0)
)

In [5]:
# User-based collaborative filtering with Pearson correlation.
# corr() correlates columns, so the matrix is transposed to put users in the
# columns; the Pearson distance between two users is 1 - correlation.
user_dist_matrix = user_matrix.transpose().corr(method="pearson").rsub(1)
user_dist_matrix

BrukerID,0,1,2,3,4,5,6,7,8,9,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
BrukerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000000,1.005730,1.000000,0.870425,0.950251,0.989126,1.021686,0.987359,0.972514,1.006407,...,1.000000,0.993009,0.919575,0.973815,1.000520,1.004525,0.987334,0.994223,1.108225,0.992078
1,1.005730,0.000000,1.000000,1.000000,1.023675,0.996367,1.000000,1.000000,0.947764,0.982663,...,1.000000,1.013994,0.999093,0.994488,1.000000,1.000000,0.981683,1.007970,0.990286,1.024007
2,1.000000,1.000000,0.000000,1.000000,1.008322,1.000000,1.009775,0.980399,1.014098,0.913725,...,1.000000,0.949654,0.964671,0.965342,0.981088,0.917735,1.040763,1.000000,0.988679,1.003989
3,0.870425,1.000000,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,0.948267,1.000833,...,1.000000,1.095537,1.000000,0.998228,1.000000,1.000000,1.001245,1.053294,1.000000,1.072804
4,0.950251,1.023675,1.008322,1.000000,0.000000,0.980329,1.058243,0.969385,0.918261,0.948364,...,0.990988,0.951779,0.966242,1.010757,1.039321,1.021285,0.954605,0.955308,0.997829,1.013774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,1.004525,1.000000,0.917735,1.000000,1.021285,1.036856,1.036273,0.950684,0.942980,0.996306,...,0.948502,0.975727,0.992479,1.016878,0.964859,0.000000,1.067465,1.000000,0.941595,1.008256
6037,0.987334,0.981683,1.040763,1.001245,0.954605,1.000000,1.003730,1.031722,1.003004,1.005353,...,1.000000,0.928262,1.027527,0.876273,1.105656,1.067465,0.000000,1.014733,1.000000,0.969143
6038,0.994223,1.007970,1.000000,1.053294,0.955308,1.000000,1.000000,1.000000,1.013576,0.967883,...,1.000000,0.980886,1.026629,1.000000,1.006370,1.000000,1.014733,0.000000,1.000000,1.019397
6039,1.108225,0.990286,0.988679,1.000000,0.997829,1.000000,1.023857,1.000000,0.971694,0.967725,...,1.000000,0.977534,0.981483,0.987530,0.951471,0.941595,1.000000,1.000000,0.000000,0.986514


In [None]:
# MODELLING: K Nearest Neighbours (user-based collaborative filtering)
# Candidate models, keyed by display name, differing only in the number of neighbours k
ml_algorithms = {'kNN-5': 5, 'kNN-10': 10, 'kNN-20': 20, 'kNN-30': 30, 'kNN-40': 40, "kNN-60": 60}

# Prediction used whenever no neighbour-based estimate can be made (movie or user
# missing from the training data, or no neighbour with a defined distance):
# the average of all training ratings.
fallback_rating = float(np.mean(y_train))

# Lower bound applied to distances before inverting them into weights, so that a
# distance of 0 gives a very large but finite weight instead of a division by zero.
min_distance = 1e-6

# Training ratings grouped per movie: FilmID -> Series of ratings indexed by BrukerID.
# Built once so the validation loop does not rescan the training frame for every row.
ratings_by_movie = {
    film_id: group.set_index("BrukerID")["Rangering"]
    for film_id, group in train_df.groupby("FilmID")
}

# Users that have a row in the distance matrix (i.e. appear in the training data)
known_users = set(user_dist_matrix.index)

# One list of predictions per model, filled in validation-row order
predictions_by_model = {name: [] for name in ml_algorithms}

# For every rating in the validation data
for user_id, film_id in zip(X_val["BrukerID"], X_val["FilmID"]):
    movie_ratings = ratings_by_movie.get(film_id)

    if movie_ratings is None or user_id not in known_users:
        # Movie never rated in training, or user without a training profile
        neighbours_sorted = None
    else:
        # Distances from this user to every user who rated the movie, closest first.
        # Undefined (NaN) distances are dropped so they cannot become neighbours.
        neighbours_sorted = (
            user_dist_matrix.loc[user_id, movie_ratings.index].dropna().sort_values()
        )

    # The sorted neighbour list does not depend on k, so it is reused for every model
    for name, num_neighbours in ml_algorithms.items():
        if neighbours_sorted is None or neighbours_sorted.empty:
            predictions_by_model[name].append(fallback_rating)
            continue

        # Select the k nearest neighbours (fewer if the movie has fewer raters)
        nearest_neighbours = neighbours_sorted.iloc[:num_neighbours]
        # Closer neighbours get larger weights (inverse distance)
        weights = 1 / nearest_neighbours.clip(lower=min_distance)
        # Look the ratings up by neighbour id so each rating lines up with its weight
        neighbour_ratings = movie_ratings.loc[nearest_neighbours.index]
        predictions_by_model[name].append(np.average(neighbour_ratings, weights=weights))

# Validation RMSE per model, in the order the models were declared
models_CF = list(ml_algorithms)
RMSE_CF = [sqrt(mean_squared_error(y_val, predictions_by_model[name])) for name in models_CF]

# Predictions of the last model, kept under the name this cell has always exposed
predictions = predictions_by_model[models_CF[-1]]


# Displaying the results
RMSE_CF_dict = {"Model": models_CF, "RMSE": RMSE_CF}
RMSE_CF_df = pd.DataFrame(RMSE_CF_dict)
RMSE_CF_df

In [None]:
# Elbow graph for hyperparameter tuning: validation RMSE of each kNN model
fig7, ax7 = plt.subplots()
fig7.set_size_inches(16, 10)  # width, height in inches
ax7.plot(RMSE_CF_df["Model"], RMSE_CF_df["RMSE"], color='darkred', linewidth=2, label="RMSE")
ax7.set_xlabel("Number of nearest neighbors", labelpad=18)
ax7.set_ylabel("Root mean squared error", labelpad=15)
ax7.set_title("K-value effect on RMSE for collaborative filtering models")
plt.show()

In [None]:
# Best model to get the prediction results
best_CF_model = []
RMSE_best_CF = []

# Number of neighbours used by the selected model
best_num_neighbours = 40

# Prediction used whenever no neighbour-based estimate can be made (movie or user
# missing from the training data, or no neighbour with a defined distance):
# the average of all training ratings.
fallback_rating = float(np.mean(y_train))

# Lower bound applied to distances before inverting them into weights, so that a
# distance of 0 gives a very large but finite weight instead of a division by zero.
min_distance = 1e-6

# Training ratings grouped per movie: FilmID -> Series of ratings indexed by BrukerID.
# Built once so the loop below does not rescan the training frame for every row.
ratings_by_movie = {
    film_id: group.set_index("BrukerID")["Rangering"]
    for film_id, group in train_df.groupby("FilmID")
}

# Users that have a row in the distance matrix (i.e. appear in the training data)
known_users = set(user_dist_matrix.index)

# Predicting for the users and movies in the validation data
CF_predictions = []

# For every rating in the validation data
for index, row in X_val.iterrows():
    user_id, film_id = row["BrukerID"], row["FilmID"]
    movie_ratings = ratings_by_movie.get(film_id)

    # Fall back to the average training rating unless a neighbour estimate is possible
    nearest_neighbours_avg_rating = fallback_rating

    # Only if the movie and the user both appear in the training data
    if movie_ratings is not None and user_id in known_users:
        # Distances to every user who rated the movie, closest first, truncated to
        # the nearest neighbours. Undefined (NaN) distances are dropped beforehand.
        nearest_neighbours = (
            user_dist_matrix.loc[user_id, movie_ratings.index]
            .dropna()
            .sort_values()
            .iloc[:best_num_neighbours]
        )
        if not nearest_neighbours.empty:
            # Closer neighbours get larger weights (inverse distance)
            weights = 1 / nearest_neighbours.clip(lower=min_distance)
            # Look the ratings up by neighbour id so each rating lines up with its weight
            neighbour_ratings = movie_ratings.loc[nearest_neighbours.index]
            nearest_neighbours_avg_rating = np.average(neighbour_ratings, weights=weights)

    # Appending the prediction to the list of predictions
    CF_predictions.append(nearest_neighbours_avg_rating)