# Assignment 2: Movie Recommender System
## Notebook 1.0: Exploratory experiments
> Practical Machine Learning & Deep Learning course, Fall 2023
#### Author of the notebook: Vladislav Urhzumov, BS21-AI-01

---




### Libraries import

In [5]:
import pandas as pd
from sklearn.utils.extmath import randomized_svd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from scipy.linalg import sqrtm
from math import sqrt

### Constant setting

In [6]:
np.random.seed(42)

In [7]:
names = ['user_id', 'movie_id', 'rating', 'timestamp']
user_col, movie_col = 'user_id', 'movie_id'

### Dataset exploration

more details in the notebook 0.1

In [8]:
data = pd.read_csv('/content/u.data', sep='\t', names=names)

# Create a user-item matrix
user_item_matrix = data.pivot(index=user_col, columns=movie_col, values='rating')
user_item_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [9]:
data.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [10]:
user_item_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 1 to 943
Columns: 1682 entries, 1 to 1682
dtypes: float64(1682)
memory usage: 12.1 MB


Code for train/test split and for mask-imputing NaN values was partilly taken from [source](https://towardsdatascience.com/beginners-guide-to-creating-an-svd-recommender-system-1fd7326d1f65)

In [11]:
# Take only <test_ratio> of LATEST ratings made by each user as test set, other are train set
# This approach is more essential than random test split, due to recommender system nature

def train_test_split_latest(data, test_ratio=0.2, users_col=user_col, movies_col=movie_col):
    users = data[users_col].unique()
    movies = data[movies_col].unique()
    test = pd.DataFrame(columns=data.columns)
    train = pd.DataFrame(columns=data.columns)
    test_ratio = test_ratio
    for u in users:
        temp = data[data[users_col] == u]
        n = len(temp)
        test_size = int(test_ratio*n)
        temp = temp.sort_values('timestamp').reset_index()

        test = pd.concat([test, temp.iloc[n-1-test_size :]])
        train = pd.concat([train, temp.iloc[: n-2-test_size]])
    return train, test, users, movies

### Dataset preprocessing

For SVD, removing NaNs is required. That is why we need masking NaNs and replacing by mean.
For the pure predictions, we need to subtract average, and then add it back

In [12]:
class MovieLensDataset:
    def __init__(self, user_item_matrix: pd.DataFrame, mask=False, impute=False, impute_strategy='mean'):
        self.data = user_item_matrix
        self.mask_tile = None
        if mask:
            self.mask_transform()
        elif impute:
            if impute_strategy:
                self.impute_nan_by_simple_imputer(impute_strategy)

    def impute_nan_by_simple_imputer(self, strategy='median'):
        imputer = SimpleImputer(strategy=strategy, fill_value=0)
        user_item_matrix_imputed = imputer.fit_transform(self.data.T).T
        self.data =  pd.DataFrame(user_item_matrix_imputed, index=self.data.index, columns=self.data.columns)

    def mask_transform(self):
        """
        We need to remove NaN values in out utility matrix before applying SVD
        we remove the per item average from all entries.
        the above mentioned nan entries will be essentially zero now
        """
        utility_matrix = self.data.to_numpy().astype(float)
        mask = np.isnan(utility_matrix)
        masked_arr = np.ma.masked_array(utility_matrix, mask)
        item_means = np.mean(masked_arr, axis=0)
        utility_matrix = masked_arr.filled(item_means)
        x = np.tile(item_means, (utility_matrix.shape[0],1))
        utility_matrix = utility_matrix - x
        self.mask_tile = x
        return utility_matrix, self.mask_tile

    def head(self, k):
        return self.data.head(k)

    def tail(self, k):
        return self.data.tail(k)

    def __len__(self):
        df = self.data
        num_rows = df.shape[0]
        return num_rows


train, test, users, movies = train_test_split_latest(data)
user_item_matrix_train = train.pivot(index=user_col, columns=movie_col, values='rating')
user_item_matrix_test = test.pivot(index=user_col, columns=movie_col, values='rating')
dataset = MovieLensDataset(user_item_matrix_train)
print("Number of users:", len(dataset))
print("Number of films:", dataset.data.shape[0])
dataset.head(10)

Number of users: 943
Number of films: 943


  user_item_matrix_train = train.pivot(index=user_col, columns=movie_col, values='rating')
  user_item_matrix_train = train.pivot(index=user_col, columns=movie_col, values='rating')
  user_item_matrix_test = test.pivot(index=user_col, columns=movie_col, values='rating')
  user_item_matrix_test = test.pivot(index=user_col, columns=movie_col, values='rating')


movie_id,1,2,3,4,5,6,7,8,9,10,...,1660,1662,1663,1664,1670,1672,1673,1675,1676,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,3.0,,,4.0,1.0,,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
6,4.0,,,,,,2.0,4.0,4.0,,...,,,,,,,,,,
7,,,,5.0,,,5.0,5.0,5.0,4.0,...,,,,,,,,,,
8,,,,,,,3.0,,,,...,,,,,,,,,,
9,,,,,,,4.0,,,,...,,,,,,,,,,
10,4.0,,,4.0,,,,,4.0,,...,,,,,,,,,,


### Class of RecommenderSVD

more details are available in the project's code

In [13]:
class RecommenderSVD:
    def __init__(self, data: MovieLensDataset):
        self.data = data

    def svd_factorize(self, k):
        M, x = self.data.mask_transform()
        U, s, V = np.linalg.svd(M, full_matrices=False)
        s = np.diag(s)
        s = s[0:k, 0:k]
        U = U[:, 0:k]
        V = V[0:k, :]
        s_root = sqrtm(s)
        UsV = np.dot(np.dot(U, s_root), np.dot(s_root, V))
        UsV = UsV + x
        return UsV

    def predict_ratings(self, n_components):
        predicted_ratings = self.svd_factorize(n_components)
        return pd.DataFrame(predicted_ratings, index=self.data.data.index, columns=self.data.data.columns)

    def generate_recommendations(self, user_id, k):
        predicted_ratings = self.predict_ratings(k)
        user_ratings = predicted_ratings.loc[user_id]
        user_ratings = user_ratings.sort_values(ascending=False)
        return user_ratings.head(k)



Let us check that the predictions are in range and actually work!

In [14]:
predicted_ratings = RecommenderSVD(dataset).predict_ratings(50)

In [15]:
predicted_ratings.index = predicted_ratings.index.astype(int)
predicted_ratings = predicted_ratings.sort_index(axis=1).sort_index()
predicted_ratings

movie_id,1,2,3,4,5,6,7,8,9,10,...,1660,1662,1663,1664,1670,1672,1673,1675,1676,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.809781,3.016268,3.066685,3.778045,2.695608,3.203575,4.161387,2.709787,4.062838,3.714220,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
2,4.049755,3.209070,3.031840,3.513802,3.166895,3.410418,3.923518,3.863885,3.851188,3.659160,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
3,4.033235,3.277055,3.065931,3.536369,3.136901,3.279015,3.627863,3.939353,4.067700,3.713679,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
4,3.893196,3.223265,3.033498,3.531108,3.166396,3.405713,3.801601,4.138167,3.851854,3.863066,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
5,4.826852,2.925292,2.800098,3.758659,3.039774,3.292522,4.171781,3.943477,3.779847,3.750119,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,4.373325,3.224812,3.109782,3.473544,3.267697,3.314347,4.160844,3.860156,4.216217,3.790287,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
940,4.183787,3.288120,3.109496,3.276977,3.209571,3.347383,3.727899,4.318139,3.646775,3.698811,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
941,3.806426,3.324411,3.128974,3.534835,3.153625,3.328931,3.857527,4.083278,3.900691,3.854092,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
942,4.009074,3.274628,3.143316,3.562677,3.214712,3.395553,3.658067,4.194878,4.055421,3.853333,...,2.0,1.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0


### Grid search of n_components
and corresponding metric measurements

In [21]:
no_of_features = [2, 3, 4, 5, 8, 50]
results = {}
for f in no_of_features:
    predicted_ratings = RecommenderSVD(dataset).predict_ratings(f)
    preds = []
    for _, row in test.iterrows():
        user = int(row[user_col])
        item = row[movie_col]
        if item in predicted_ratings.columns:
            preds.append(predicted_ratings.iloc[user - 1][item - 1])
        else:
            preds.append(np.mean(predicted_ratings.iloc[user - 1]))
    mae = mean_absolute_error(test['rating'], preds)
    rmse = sqrt(mean_squared_error(test['rating'], preds))
    results[f] = {'mae': mae, 'rmse': rmse}
results

{2: {'mae': 0.8350037536912575, 'rmse': 1.0470936149133494},
 3: {'mae': 0.8311700675851423, 'rmse': 1.0433913309290264},
 4: {'mae': 0.8284132051015004, 'rmse': 1.0403976385801204},
 5: {'mae': 0.8277144729254315, 'rmse': 1.0396773135003001},
 8: {'mae': 0.8249714650489115, 'rmse': 1.0372921725981317},
 50: {'mae': 0.8335773316231814, 'rmse': 1.0480141986309612}}

### Evaluated on u1.test

In [None]:
train, test = pd.read_csv('/content/u1.base', sep='\t', names=names), pd.read_csv('/content/u1.test', sep='\t', names=names)
user_item_matrix_train = train.pivot(index=user_col, columns=movie_col, values='rating')
user_item_matrix_test = test.pivot(index=user_col, columns=movie_col, values='rating')
dataset = MovieLensDataset(user_item_matrix_train)
print("Number of users:", len(dataset))
print("Number of films:", dataset.data.shape[0])
no_of_features = [2, 5, 8, 16, 32, 50]
results = {}
for f in no_of_features:
    predicted_ratings = RecommenderSVD(dataset).predict_ratings(f)
    preds = []
    for _, row in test.iterrows():
        user = int(row[user_col])
        item = row[movie_col]
        if item in predicted_ratings.columns:
            preds.append(predicted_ratings.iloc[user - 1][item])
        else:
            preds.append(np.mean(predicted_ratings.iloc[user - 1]))
    mae = mean_absolute_error(test['rating'], preds)
    rmse = sqrt(mean_squared_error(test['rating'], preds))
    results[f] = {'mae': mae, 'rmse': rmse}

print('Results on test set from u1.test:')
results

Number of users: 943
Number of films: 943


In [None]:
results

{2: {'mae': 0.8070933916856515, 'rmse': 1.0109273068906621},
 5: {'mae': 0.7949703757070576, 'rmse': 0.9978783902015383},
 8: {'mae': 0.7921271988908917, 'rmse': 0.994873218243789},
 16: {'mae': 0.7904318363096654, 'rmse': 0.9932558668904463},
 32: {'mae': 0.7951160400955491, 'rmse': 0.9984942737048694},
 50: {'mae': 0.7973779089001025, 'rmse': 1.0015453129357172}}

### A good starting baseline, isn't it?