In [13]:
from surprise.prediction_algorithms import NormalPredictor
from surprise.prediction_algorithms.matrix_factorization import SVD, NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, KFold as KFold_sp
from surprise.model_selection.split import train_test_split as tts_sp

import time
import pandas as pd
import numpy as np

import tensorflow as tf
from keras.layers import Input, Embedding, Flatten, Dropout, Dense, Concatenate, Lambda
from keras.models import Model
from keras import callbacks
from keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae
from sklearn.model_selection import cross_validate as cv_sk, KFold as KFold_sk, train_test_split as tts_sk
from sklearn.neighbors import KNeighborsRegressor as KNN

from sentence_transformers import SentenceTransformer
# Instantiates the pretrained 'stsb-roberta-large' sentence encoder as soon as this cell runs.
# NOTE(review): `roberta` is not referenced by any cell visible here; presumably it was used to
# produce 'encodedText.npy' — confirm, and drop this load if it is no longer needed.
roberta = SentenceTransformer('stsb-roberta-large')

In [14]:
# --- Load the raw inputs -----------------------------------------------------
# Both CSV files carry a trailing column that this notebook does not use, so it
# is removed immediately after reading.
ratings_df = pd.read_csv('ratings.csv')
ratings_df = ratings_df.drop(columns=ratings_df.columns[-1])

tags = pd.read_csv('tags.csv')
tags = tags.drop(columns=tags.columns[-1])

# Surprise view of the ratings (scale 1..5) with an unshuffled 5-fold splitter.
ratings = Dataset.load_from_df(ratings_df, Reader(rating_scale=(1, 5)))
kf_sp = KFold_sp(n_splits=5, shuffle=False)

# Plain ndarray view of the same ratings for the scikit-learn experiments,
# with a matching unshuffled 5-fold splitter.
ratings_mat = ratings_df.to_numpy()
kf_sk = KFold_sk(n_splits=5, shuffle=False)

# Precomputed feature vectors used by the KNN baseline.
# NOTE(review): assumed to hold one row per rating, in ratings.csv order — confirm.
vecs = np.load('encodedText.npy')

In [15]:
# Re-index the raw user / movie ids with LabelEncoder so they can be fed to the
# embedding layers, and keep the ratings as float32 regression targets.
user_enc = LabelEncoder()
item_enc = LabelEncoder()
temp_df = pd.DataFrame({
    'user': user_enc.fit_transform(ratings_df['userId'].values),
    'movie': item_enc.fit_transform(ratings_df['movieId'].values),
    'rating': ratings_df['rating'].values.astype(np.float32),
})

# Vocabulary sizes for the two embedding tables.
n_users = temp_df['user'].nunique()
n_movies = temp_df['movie'].nunique()

# Observed rating range; build_model() rescales its sigmoid output to it.
min_rating, max_rating = min(temp_df['rating']), max(temp_df['rating'])

# ndarray with columns (user, movie, rating) used by the neural-network cells.
temp = temp_df.to_numpy()

# 1. Performance Table

In [16]:
# Cross-validate the three Surprise models (random baseline, NMF, SVD) on the
# shared unshuffled folds, reporting RMSE and MAE for each.
_sp_cv_options = dict(cv=kf_sp, n_jobs=-1, measures=['rmse', 'mae'])

algo_1 = cross_validate(NormalPredictor(), ratings, **_sp_cv_options)
algo_2 = cross_validate(
    NMF(n_factors=17, n_epochs=30, random_state=12345),
    ratings,
    **_sp_cv_options,
)
algo_3 = cross_validate(
    SVD(n_factors=7, n_epochs=550, random_state=12345),
    ratings,
    **_sp_cv_options,
)

In [17]:
# Cross-validated KNN regression on the precomputed vectors in `vecs`,
# collecting the same metrics Surprise reports (test RMSE, test MAE, test time).
algo_4 = {'test_rmse': [], 'test_mae': [], 'test_time': []}

# The fold indices are generated from `ratings_mat` but also used to index
# `vecs`, so both arrays must have exactly one row per rating.
assert len(vecs) == len(ratings_mat), "vecs and ratings_mat must be row-aligned"

for train_index, test_index in kf_sk.split(ratings_mat):
    knn = KNN(n_neighbors=6, n_jobs=-1)
    X_train, X_test = vecs[train_index], vecs[test_index]
    # The last column of ratings_mat is the rating (regression target).
    y_train, y_test = ratings_mat[train_index, -1], ratings_mat[test_index, -1]
    knn.fit(X_train, y_train)
    # Only prediction is timed; perf_counter is the clock intended for
    # measuring short intervals.
    start_time = time.perf_counter()
    preds = knn.predict(X_test)
    algo_4['test_time'].append(round(time.perf_counter() - start_time, 10))
    # RMSE computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    algo_4['test_rmse'].append(np.sqrt(mse(y_test, preds)))
    algo_4['test_mae'].append(mae(y_test, preds))

In [18]:
def build_model():
    """Build and compile the embedding-based rating regressor.

    The network takes a user index and a movie index, looks each up in its own
    32-dimensional embedding table, concatenates the two vectors, and passes
    them through a 16-unit ReLU layer with dropout. The final sigmoid unit is
    rescaled to the [min_rating, max_rating] interval.

    Reads the module-level names ``n_users``, ``n_movies``, ``min_rating`` and
    ``max_rating``. The Keras backend session is cleared on every call.

    Returns:
        A compiled Keras ``Model`` with inputs ``[user, movie]``, trained with
        mean squared error and Adam (learning rate 0.001).
    """
    tf.keras.backend.clear_session()

    def embedding_branch(vocab_size):
        """Return (input tensor, flattened 32-d embedding) for one id column."""
        ids = Input(shape=(1,))
        embedded = Embedding(vocab_size, 32, embeddings_initializer='he_normal',
                             embeddings_regularizer=l2(1e-6))(ids)
        return ids, Flatten()(embedded)

    user, u = embedding_branch(n_users)
    movie, m = embedding_branch(n_movies)

    hidden = Concatenate()([u, m])
    hidden = Dense(16, activation='relu', kernel_initializer='he_normal')(hidden)
    hidden = Dropout(0.5)(hidden)

    unit_score = Dense(1, activation='sigmoid', kernel_initializer='he_normal')(hidden)
    # Map the (0, 1) sigmoid output onto the observed rating range.
    rating = Lambda(lambda t: t * (max_rating - min_rating) + min_rating)(unit_score)

    model = Model([user, movie], rating)
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    return model

# Early-stopping callback shared by every `model.fit` call in this notebook:
# it monitors the validation loss with a patience of 3 epochs and restores the
# best weights seen once training stops.
earlystop = callbacks.EarlyStopping(patience=3,
                                    monitor='val_loss',
                                    restore_best_weights=True)

In [20]:
# Cross-validated neural network on the same unshuffled folds, collecting the
# same metrics Surprise reports (test RMSE, test MAE, test time).
algo_5 = {'test_rmse': [], 'test_mae': [], 'test_time': []}

for train_index, test_index in kf_sk.split(temp):
    # Columns of `temp`: 0 = user index, 1 = movie index, 2 = rating.
    train, test = temp[train_index], temp[test_index]
    # A fresh model per fold so no weights leak between folds.
    model = build_model()
    history = model.fit([train[:, 0], train[:, 1]], train[:, 2], epochs=10, verbose=0, callbacks=[earlystop], batch_size=64, validation_split=0.1)
    # Only prediction is timed; perf_counter is the clock intended for
    # measuring short intervals.
    start_time = time.perf_counter()
    # Flatten the (n, 1) network output so it has the same shape as the targets.
    preds = model.predict([test[:, 0], test[:, 1]]).reshape(-1)
    algo_5['test_time'].append(round(time.perf_counter() - start_time, 10))
    # RMSE computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    algo_5['test_rmse'].append(np.sqrt(mse(test[:, 2], preds)))
    algo_5['test_mae'].append(mae(test[:, 2], preds))

In [26]:
# One line per algorithm: fold-averaged RMSE, MAE and test time.
for scores in (algo_1, algo_2, algo_3, algo_4, algo_5):
    rmse_avg = np.mean(scores['test_rmse'])
    mae_avg = np.mean(scores['test_mae'])
    time_avg = np.mean(scores['test_time'])
    print(f"{rmse_avg:.4f}\t {mae_avg:.4f}\t {time_avg:.4f}\t")

1.4263	 1.1387	 0.1021	
1.0429	 0.8296	 0.0902	
0.9640	 0.7480	 0.0746	
1.4478	 1.2488	 35.6309	
0.9627	 0.7525	 0.6441	


In [None]:
"""
        RMSE     MAE     Time
random  1.4263	 1.1387	 0.1021	
NMF     1.0429	 0.8296	 0.0902	
SVD     0.9640	 0.7480	 0.0746	
KNN     1.4478	 1.2488	 35.6309	
nn      0.9627	 0.7525	 0.6441	

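* nn: neural network; every value is the mean over the 5 cross-validation folds, and Time is the test (prediction) time in seconds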
"""

# 2. Consistency Table

In [27]:
# `preds` collects one prediction vector per algorithm. This cell adds the
# three Surprise models (random, NMF, SVD); the following cells append the KNN
# and neural-network predictions.
preds = []

# Unshuffled 75/25 hold-out split of the Surprise dataset.
trainset, testset = tts_sp(ratings, test_size=.25, shuffle=False)

for algo in (
    NormalPredictor(),
    NMF(n_factors=17, n_epochs=30, random_state=12345),
    SVD(n_factors=7, n_epochs=550, random_state=12345),
):
    fitted = algo.fit(trainset)
    preds.append([p.est for p in fitted.test(testset)])

In [28]:
# KNN predictions on the unshuffled 25% hold-out: features come from `vecs`,
# targets from the last column of `ratings_mat`.
X_train, X_test = tts_sk(vecs, test_size=.25, shuffle=False)
train_data, test_data = tts_sk(ratings_mat, test_size=.25, shuffle=False)

knn = KNN(n_neighbors=6, n_jobs=-1)
knn.fit(X_train, train_data[:, -1])
preds.append(knn.predict(X_test))

In [29]:
# Neural-network predictions on the unshuffled 25% hold-out of `temp`
# (columns: user index, movie index, rating).
train_data, test_data = tts_sk(temp, test_size=.25, shuffle=False)

model = build_model()
history = model.fit(
    [train_data[:, 0], train_data[:, 1]],
    train_data[:, 2],
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[earlystop],
    verbose=0,
)

# The network outputs shape (n, 1); store it flattened like the other entries.
nn_output = model.predict([test_data[:, 0], test_data[:, 1]])
preds.append(nn_output.ravel())

In [34]:
# Pairwise consistency table: the entry in row a, column b is the RMSE between
# the predictions of algorithm a and algorithm b on the hold-out split.
for i in preds:
    # RMSE computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    res = [np.sqrt(mse(i, j)) for j in preds]
    s = ''.join(f"{r:.2f}\t" for r in res)
    print(s)

0.00	0.96	1.06	0.97	1.05	
0.96	0.00	0.44	0.12	0.43	
1.06	0.44	0.00	0.45	0.23	
0.97	0.12	0.45	0.00	0.44	
1.05	0.43	0.23	0.44	0.00	


In [None]:
"""
        random  NMF     SVD     KNN     nn
random  0.00	0.96	1.06	0.97	1.05	
NMF     0.96	0.00	0.44	0.12	0.43	
SVD     1.06	0.44	0.00	0.45	0.23	
KNN     0.97	0.12	0.45	0.00	0.44	
nn      1.05	0.43	0.23	0.44	0.00	

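* nn: neural network; each entry is the RMSE between the row and column algorithms' predictions on the same unshuffled 25% hold-out split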
"""