In [3]:
import numpy as np
import csv
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, vstack
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold

In [2]:
combined_data_file_name = "./netflix-prize-data/combined_data_%s.txt"

target_f = open("./processed_data.csv", "w+")
for i in {1,2,3,4}:
    print("Data file: " + str(i) + "/4")
    
    cur_movie_id = None
    with open(combined_data_file_name % i) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        for row in csv_reader:
            if (len(row) == 1):
                cur_movie_id = row[0][:-1]
            else:
                user_id = row[0]
                rating = row[1]
                
                target_f.write(user_id + "," + rating + "," + cur_movie_id + "\n")
target_f.close()

In [4]:
def get_data():
    data_file_path = "./processed_data.csv"

    df = pd.read_csv(data_file_path, header = None, names = ['User_Id','Rating','Movie_Id'])
    print(df.iloc[::5000000, :])
    
    # Filter really sparse movies and users
    min_movie_ratings = 10000
    filter_movies = (df['Movie_Id'].value_counts()>min_movie_ratings)
    filter_movies = filter_movies[filter_movies].index.tolist()

    # Filter sparse users
    min_user_ratings = 200
    filter_users = (df['User_Id'].value_counts()>min_user_ratings)
    filter_users = filter_users[filter_users].index.tolist()

    # Actual filtering
    df_filterd = df[(df['Movie_Id'].isin(filter_movies)) & (df['User_Id'].isin(filter_users))]
    print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
    print('Shape User-Ratings filtered:\t{}'.format(df_filterd.shape))
    del filter_movies, filter_users, min_movie_ratings, min_user_ratings, df
    
    encoder = OneHotEncoder(categories='auto') 

    # (number_of_ratings x number_of_users)
    one_hot_user_matrix = encoder.fit_transform(np.asarray(df_filterd['User_Id']).reshape(-1,1)) 
    print("One-hot user matrix shape: " + str(one_hot_user_matrix.shape))
    
    # (number_of_ratings x number_of_movie_ids)
    one_hot_movie_matrix = encoder.fit_transform(np.asarray(df_filterd['Movie_Id']).reshape(-1,1))
    print("One-hot movie matrix shape: " + str(one_hot_movie_matrix.shape))
    
    # train data in CSR format
    X = hstack([one_hot_user_matrix, one_hot_movie_matrix]).tocsr()
    # data to predict
    ratings=np.asarray(df_filterd['Rating']).reshape(-1,1)
    
    return X,ratings

X,ratings = get_data()

# do shuffling so records will be evenly distributed over the matrix
X,ratings = shuffle(X,ratings)

print(X.shape)
print(ratings.shape)

           User_Id  Rating  Movie_Id
0          1488844       3         1
5000000     501954       2       996
10000000    404654       5      1962
15000000    886608       2      2876
20000000   1193835       2      3825
25000000   1899206       3      4661
30000000    154804       4      5496
35000000   2078749       5      6274
40000000    450763       5      7057
45000000    102092       3      7991
50000000    220298       5      9023
55000000    550530       5     10042
60000000    222570       3     11038
65000000   1273080       5     11875
70000000   2026970       5     12676
75000000    506044       4     13582
80000000    353605       2     14453
85000000    664606       3     15116
90000000   2213715       3     16008
95000000   1589401       5     16879
100000000  2314006       4     17627
Shape User-Ratings unfiltered:	(100480507, 3)
Shape User-Ratings filtered:	(60546559, 3)
One-hot user matrix shape: (60546559, 150245)
One-hot movie matrix shape: (60546559, 2042)
(60546

In [5]:
# Utility functions
def rmse(y, y_pred):
    rmse = np.sqrt(np.sum((y - y_pred) ** 2) / len(y))
    return rmse

def r2_score(y, y_pred):
    mean_y = np.mean(y)
    ss_tot = np.sum((y - mean_y) ** 2)
    ss_res = np.sum((y - y_pred) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2

In [28]:
# Define model class
class FMModel():
    def __init__(self, features_num, k=2, lr=0.1, batch_size=256, num_iters=1500):
        np.random.seed(0)  # Set the seed for repeatability
        self.lr = lr  # learning rate
        self.batch_size = batch_size
        self.num_iters = num_iters
        self.v = np.random.normal(0, 0.1, size=(features_num, k))
        self.w_lin = np.random.normal(0, 0.1, size=(features_num, 1))
        self.w_bias = np.array((0,), dtype=np.float32)
        
    def forward(self, x):
        out_1 = np.sum((x.dot(self.v) * x.dot(self.v)), keepdims=True, axis=1)
        out_2 = np.sum((x.multiply(x)).dot(self.v * self.v), keepdims=True, axis=1)
        out_inter = (out_1 - out_2) / 2
        out_linear = x.dot(self.w_lin) + self.w_bias
        return out_linear + out_inter
    
    def calculate_loss(self, x, y):
        pred = self.forward(x)
        n = len(y)
        loss = np.sum((pred - y) ** 2) / n
        pred_grad = (pred - y) / n  * 2.0
        return loss, pred_grad
    
    def step(self, x, y):
        # Calculate loss
        loss, dout = self.calculate_loss(x, y)
        
        # Calculate grads for a batch
        dw_lin = x.T.dot(dout)      
        dw_bias = np.sum(dout, axis=0)
        
        coef = (x.dot(self.v)).T
        dv_1 = x.T.dot(coef.T)
        dv = 0
        for i in range(self.batch_size):
            x_ = hstack((x[i, :].T, x[i, :].T))
            for j in range(self.v.shape[1]-2):
                x_ = hstack((x_, x[i, :].T))
            x_2 = x_.multiply(x_)
            dv += dv_1 * dout[i] - (x_2.multiply(self.v)).multiply(dout[i])
        dv /= self.batch_size
        
        # Take a step towards negative gradient
        self.w_lin -= self.lr * dw_lin
        self.w_bias -= self.lr * dw_bias
        self.v -= self.lr * dv
        
    def train(self, x, y):
        for i in tqdm(range(self.num_iters)):
            # Make a batch from already shuffled data
            x_batch = x[i*self.batch_size:(i+1)*self.batch_size, :]
            y_batch = y[i*self.batch_size:(i+1)*self.batch_size, :]
            if (i % 100 == 0) and (self.lr >= 0.01):
                self.lr /= 1.1
            self.step(x_batch, y_batch)

In [29]:
# Cross-validation
kf = KFold(shuffle=False, n_splits=5)
train_history = []
for train_index, test_index in kf.split(X):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = ratings[train_index]
    y_test = ratings[test_index]
    model = FMModel(X_train.shape[1])
    print(f'Starting loss: {model.calculate_loss(X_test, y_test)[0]}')
    
    model.train(X_train, y_train)
    y_pred = model.forward(X_train)
    y_pred_test = model.forward(X_test)
    
    print(f'RMSE train: {rmse(y_train, y_pred)}')
    print(f'RMSE test: {rmse(y_test, y_pred_test)}')
    print(f'R2 score train: {r2_score(y_train, y_pred)}')
    print(f'R2 score test: {r2_score(y_test, y_pred_test)}')
    
    history_entry = {'rmse_train': rmse(y_train, y_pred),
                     'rmse_test': rmse(y_test, y_pred_test),
                     'r2_train': r2_score(y_train, y_pred),
                     'r2_test': r2_score(y_test, y_pred_test),
                    }
    train_history.append(history_entry)
    print(f'Final loss: {model.calculate_loss(X_test, y_test)[0]}')

Starting loss: 14.318356676340128


HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))


RMSE train: 1.0415860754034834
RMSE test: 1.041671097965188
R2 score train: 0.0053463422604707045
R2 score test: 0.005317635843055668
Final loss: 1.085078676336
Starting loss: 14.319031217329572


HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))


RMSE train: 1.0417490475811149
RMSE test: 1.041669246180437
R2 score train: 0.005077271713272213
R2 score test: 0.005152296805631007
Final loss: 1.08507481843812
Starting loss: 14.316390152059538


HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))


RMSE train: 1.0417341494501506
RMSE test: 1.041728841137501
R2 score train: 0.005109947668328396
R2 score test: 0.005021679896495601
Final loss: 1.0851989784576808
Starting loss: 14.315422406159891


HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))


RMSE train: 1.0417027025274528
RMSE test: 1.0418546199747754
R2 score train: 0.005094617401049217
R2 score test: 0.005082997316346871
Final loss: 1.0854610491627836
Starting loss: 14.314165786297808


HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))


RMSE train: 1.0417552828718215
RMSE test: 1.0416443027249374
R2 score train: 0.0050972151017790734
R2 score test: 0.005072517463157955
Final loss: 1.085022853399321


In [32]:
# Prepare data for resulting table
def mean_std(arr):
    return np.mean(arr), np.std(arr)

r2_train_all = np.array([entry['r2_train'] for entry in train_history], dtype='float32')
r2_test_all = np.array([entry['r2_test'] for entry in train_history], dtype='float32')
rmse_train_all = np.array([entry['rmse_train'] for entry in train_history], dtype='float32')
rmse_test_all = np.array([entry['rmse_test'] for entry in train_history], dtype='float32')

r2_train_mean, r2_train_std = mean_std(r2_train_all)
r2_test_mean, r2_test_std = mean_std(r2_test_all)
rmse_train_mean, rmse_train_std = mean_std(rmse_train_all)
rmse_test_mean, rmse_test_std = mean_std(rmse_test_all)

In [33]:
# Create resulting table
from prettytable import PrettyTable
import logging

logging.basicConfig(filename='table.txt', level=logging.INFO)

tb = PrettyTable()
tb.field_names = ['', 'T1', 'T2', 'T3', 'T4', 'T5', 'E', 'STD']
tb.add_row(['r2_train'] + r2_train_all.tolist() + [r2_train_mean, r2_train_std])
tb.add_row(['r2_test'] + r2_test_all.tolist() + [r2_test_mean, r2_test_std])
tb.add_row(['rmse_train'] + rmse_train_all.tolist() + [rmse_train_mean, rmse_train_std])
tb.add_row(['rmse_test'] + rmse_test_all.tolist() + [rmse_test_mean, rmse_test_std])

logging.info(tb)