### Config

In [None]:
# Cross-validation
# Number of buckets the ratings are split into.
CV_NUM_FOLDS = 10
# Share of the buckets used for training in each round
# (n_train = floor(CV_NUM_FOLDS * CV_TRAIN_RATIO)); the rest is the test split.
CV_TRAIN_RATIO = 0.9

# NCF model parameters
NCF_EPOCHS = 100
# Early-stopping patience (epochs) on the validation loss.
NCF_ES_PATIENCE = 20
# NCF_DROPOUT = 0.2
NCF_LR = 0.005
NCF_BATCH_SIZE = 1024

# LightGCN model parameters
LGCN_EPOCHS = 50
LGCN_EVAL_EPOCHS = -1 # -1 to turn off
LGCN_N_LAYERS = 3
# Embedding dimension; also the width of the NeuMF user/movie embedding inputs.
LGCN_EMBED_SIZE = 64
LGCN_LR = 0.005
LGCN_BATCH_SIZE = 1024

# SVD model parameters
SVD_EPOCHS = 10
# Passed to surprise's SVD as n_factors.
SVD_EMBED_SIZE = 64

# For deterministic results
# Used to seed `random`, NumPy, TensorFlow and the LightGCN data/model.
SEED = 42

### Install dependencies and import libraries

In [None]:
# LightGCN
!pip install recommenders[examples]

In [None]:
# ResNet
!pip install img2vec_pytorch

In [None]:
# Import libraries
# Each random number generator is seeded right after its import so that
# everything executed later in the notebook starts from the same state.
import math
import statistics
import random
random.seed(SEED)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy.random import seed
# Seeds NumPy's global RNG.
seed(SEED)

import pandas as pd
import re
import sys
import os
import papermill as pm
import scrapbook as sb
from tensorflow.random import set_seed
# Seeds TensorFlow's global RNG.
set_seed(SEED)
from tensorflow import keras
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

# Record the runtime environment alongside the results.
print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

### Movielens 100k dataset

#### Download datasets

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

In [None]:
!wget https://csukas.org/dipterv1/ml100k-img.zip
!unzip ml100k-img.zip

In [None]:
# from google.colab import drive

# drive.mount('/content/drive')
# !unzip '/content/drive/MyDrive/Colab Notebooks/ml100k-img.zip'

#### Load movies

In [None]:
# !head ml-100k/u.item

In [None]:
def load_movies(path='ml-100k/u.item'):
  """Load the MovieLens item file into a DataFrame indexed by movie id.

  Args:
    path: Location of the pipe-separated, latin-1 encoded item file.
      Defaults to the location used throughout this notebook.

  Returns:
    DataFrame indexed by ``id`` holding ``title``, ``release_date`` and one
    integer column per genre flag. ``video_release_date`` and ``imdb_url``
    are read but dropped.
  """
  genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western',
  ]

  # Column order must follow the file layout; dicts preserve insertion order,
  # so the same mapping provides both the column names and their dtypes.
  movies_dtype = {
    'id': int,
    'title': str,
    'release_date': str,
    'video_release_date': str,
    'imdb_url': str,
  }
  movies_dtype.update({genre: int for genre in genre_columns})

  movies = pd.read_csv(path,
                       sep='|',
                       encoding='latin-1',
                       dtype=movies_dtype,
                       names=list(movies_dtype))

  return movies.drop(columns=['video_release_date', 'imdb_url']).set_index('id')

#### Load users

In [None]:
# !tail ml-100k/u.user

In [None]:
def load_users(path='ml-100k/u.user'):
  """Load the MovieLens user file into a DataFrame indexed by user id.

  Args:
    path: Location of the pipe-separated, latin-1 encoded user file.
      Defaults to the location used throughout this notebook.

  Returns:
    DataFrame indexed by ``id`` with the columns ``age``, ``gender`` and
    ``occupation``; ``zip_code`` is read but dropped.
  """
  # Column order must follow the file layout; the mapping doubles as the
  # list of column names.
  users_dtype = {
    'id': int,
    'age': int,
    'gender': str,
    'occupation': str,
    'zip_code': str,
  }

  users = pd.read_csv(path,
                      sep='|',
                      encoding='latin-1',
                      dtype=users_dtype,
                      names=list(users_dtype))

  return users.drop(columns=['zip_code']).set_index('id')

#### Load ratings

In [None]:
def load_ratings(path='ml-100k/u.data'):
  """Load the MovieLens ratings file.

  Args:
    path: Location of the tab-separated ratings file. Defaults to the
      location used throughout this notebook.

  Returns:
    DataFrame with the columns ``user_id``, ``item_id`` (integers) and
    ``rating`` (float), one row per rating; ``timestamp`` is read but dropped.
  """
  # Column order must follow the file layout; the mapping doubles as the
  # list of column names.
  ratings_dtype = {
    'user_id': int,
    'item_id': int,
    'rating': float,
    'timestamp': int,
  }

  ratings = pd.read_csv(path,
                        sep='\t',
                        encoding='latin-1',
                        dtype=ratings_dtype,
                        names=list(ratings_dtype))

  return ratings.drop(columns=['timestamp'])

#### Feature selection and preparation

In [None]:
def prepare_movie_features(movies):
  """Derive the numeric movie features used by the models.

  Adds a float ``release_year`` column parsed from the last
  `` (<digits>)`` group of the title, fills titles without such a group with
  the mean parsed year, and drops the free-text ``title`` and
  ``release_date`` columns. The input frame is not modified.

  Args:
    movies: DataFrame with at least ``title`` and ``release_date`` columns.

  Returns:
    A new DataFrame with ``release_year`` appended and the two text columns
    removed.
  """
  movies = movies.copy()

  # The greedy `.*` makes the capture bind to the LAST parenthesised number,
  # so titles such as "Shanghai Triad (Yao a yao) (1990)" yield 1990.
  # `\d+` (rather than `\d*`) keeps an empty "()" from producing a bogus year.
  year_text = movies['title'].str.extract(r'.* \((\d+)\)', expand=False)

  # Convert explicitly: the captured group is text, and the column has to be
  # numeric for the mean below to be meaningful.
  release_year = pd.to_numeric(year_text, errors='coerce').astype(float)

  # Replace missing values with the mean of the years that could be parsed.
  movies['release_year'] = release_year.fillna(release_year.mean())

  # Drop the text columns that are not used as features.
  return movies.drop(columns=['title', 'release_date'])

In [None]:
def prepare_movies_avg_ratings(movies, ratings):
  """Add each movie's mean rating as an ``avg_rating`` column.

  Movies that have no rating in ``ratings`` receive the mean of the
  per-movie averages.

  Args:
    movies: DataFrame indexed by movie id. Modified in place.
    ratings: DataFrame with ``item_id`` and ``rating`` columns.

  Returns:
    The same ``movies`` object, with ``avg_rating`` added.
  """
  # One pass over the ratings instead of re-filtering the whole frame for
  # every movie.
  item_means = ratings.groupby('item_id')['rating'].mean()
  # Align to the movie ids (unrated movies become NaN); assign positionally.
  movies['avg_rating'] = item_means.reindex(movies.index).to_numpy()
  movies['avg_rating'] = movies['avg_rating'].fillna(movies['avg_rating'].mean())
  return movies

In [None]:
from sklearn.preprocessing import OneHotEncoder

def prepare_user_features(users):
  """One-hot encode the categorical user columns.

  ``gender`` and ``occupation`` are each replaced by columns named
  ``<column>_<i>``, where ``i`` is the position of the category in the
  encoder's output. The input frame is not modified.

  Args:
    users: DataFrame indexed by user id with ``gender`` and ``occupation``
      columns.

  Returns:
    A new DataFrame with the two categorical columns replaced by their
    one-hot encodings, appended after the remaining columns.
  """
  users = users.copy()
  encoder = OneHotEncoder(handle_unknown='ignore')

  for column in ('gender', 'occupation'):
    encoded = encoder.fit_transform(users[[column]]).toarray()
    # Index the encoded rows by the actual user ids. Assuming ids run
    # 1..N in file order would silently misalign the join for any filtered,
    # reordered or non-contiguous user table.
    onehot_df = pd.DataFrame(
      encoded,
      index=users.index,
      columns=['%s_%s' % (column, i) for i in range(encoded.shape[1])],
    )
    users = users.join(onehot_df).drop([column], axis=1)

  return users

In [None]:
def prepare_users_avg_ratings(users, ratings):
  """Add each user's mean rating as an ``avg_rating`` column.

  Users that have no rating in ``ratings`` receive the mean of the per-user
  averages.

  Args:
    users: DataFrame indexed by user id. Modified in place.
    ratings: DataFrame with ``user_id`` and ``rating`` columns.

  Returns:
    The same ``users`` object, with ``avg_rating`` added.
  """
  # One pass over the ratings instead of re-filtering the whole frame for
  # every user.
  user_means = ratings.groupby('user_id')['rating'].mean()
  # Align to the user ids (users without ratings become NaN); assign positionally.
  users['avg_rating'] = user_means.reindex(users.index).to_numpy()
  users['avg_rating'] = users['avg_rating'].fillna(users['avg_rating'].mean())
  return users

#### Feature normalization

In [None]:
def normalize_df(df):
  """Min-max scale every column of ``df`` to the range [0, 1].

  A column whose values are all equal has no range to scale by; it is mapped
  to 0.0 instead of being turned into NaN by a 0/0 division. Missing values
  stay missing.

  Args:
    df: DataFrame with numeric columns. Modified in place.

  Returns:
    The same ``df`` object, with every column rescaled.
  """
  for column in list(df.columns):
    col_min = df[column].min()
    col_range = df[column].max() - col_min
    shifted = df[column] - col_min
    if col_range == 0:
      # Constant column: every non-missing value equals the minimum.
      df[column] = shifted.astype(float)
    else:
      df[column] = shifted / col_range

  return df

#### Split data for crossvalidation

In [None]:
def create_crossval_buckets(ratings, n_folds):
  """Split the ratings into ``n_folds`` buckets, spreading each user evenly.

  For every user id from 1 up to the largest id, that user's rows are dealt
  round-robin over the buckets in index order. Each bucket is then shuffled
  with ``DataFrame.sample(frac=1)`` (no explicit random state is passed).

  Args:
    ratings: DataFrame with an integer ``user_id`` column.
    n_folds: Number of buckets to create.

  Returns:
    List of ``n_folds`` DataFrames that together contain the dealt rows.
  """
  bucket_labels = [[] for _ in range(n_folds)]

  highest_user_id = ratings['user_id'].max()
  for user_id in range(1, highest_user_id + 1):
    user_rows = ratings.index[ratings.user_id == user_id].tolist()
    for position, row_label in enumerate(user_rows):
      bucket_labels[position % n_folds].append(row_label)

  buckets = []
  for labels in bucket_labels:
    # Select the bucket's rows by index label, then shuffle their order.
    buckets.append(ratings.filter(items=labels, axis=0).sample(frac=1))
  return buckets

#### Create user and item embeddings with LightGCN

In [None]:
class dotdict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Reading an attribute that is not a key yields ``None`` (like ``dict.get``);
    deleting one that is not a key raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when regular attribute lookup fails.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]

In [None]:
# Source https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/lightgcn_deep_dive.ipynb

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

def create_user_item_embeddings(train, test):
  """Train LightGCN and return its embeddings addressed by raw user/movie id.

  Args:
    train: Ratings DataFrame with ``user_id`` and ``item_id`` columns
      (renamed here to ``userID`` / ``itemID`` before being handed to
      ``ImplicitCF``).
    test: Ratings DataFrame of the same layout, passed to ``ImplicitCF`` as
      the test split.

  Returns:
    Tuple ``(user_embeddings, movie_embeddings)``. Each is a list in which
    position ``k`` holds the embedding of raw id ``k``; positions whose id
    the model does not know stay an empty list.
  """
  print('Creating user item embeddings...')

  TOP_K = 10 # top k items to recommend
  hparams = dotdict({
    "model_type" : "lightgcn",
    "embed_size" : LGCN_EMBED_SIZE, # the embedding dimension of users and items
    "n_layers" : LGCN_N_LAYERS, # number of layers of the model
    "batch_size" : LGCN_BATCH_SIZE,
    "decay" : 0.0001, # l2 regularization for embedding parameters
    "epochs" : LGCN_EPOCHS, # number of epochs for training
    "learning_rate" : LGCN_LR,
    "eval_epoch" : LGCN_EVAL_EPOCHS, # if it is not -1, evaluate the model every eval_epoch; -1 means that evaluation will not be performed during training
    "top_k" : TOP_K, # number of items to recommend when calculating evaluation metrics
    "save_model" : False, # whether to save model
    "save_epoch" : 100, # if save_model is set to True, save the model every save_epoch
    "metrics" : ["recall", "ndcg", "precision", "map"], # metrics for evaluation
    "MODEL_DIR" : "./tests/resources/deeprec/lightgcn/model/lightgcn_model/" # directory of saved models
  })

  column_names = {'user_id':'userID', 'item_id':'itemID'}
  train = train.rename(columns = column_names)
  test = test.rename(columns = column_names)

  data = ImplicitCF(train=train, test=test, seed=SEED) # col_user='user_id', col_item='item_id', col_rating='rating',

  model = LightGCN(hparams, data, seed=SEED)
  with Timer() as train_time:
      model.fit()

  print("Took {} seconds for training.".format(train_time.interval))

  # eval_map = map_at_k(test, topk_scores, k=TOP_K)
  # eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
  # eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
  # eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

  # print("MAP:\t%f" % eval_map,
  #       "NDCG:\t%f" % eval_ndcg,
  #       "Precision@K:\t%f" % eval_precision,
  #       "Recall@K:\t%f" % eval_recall, sep='\n')

  def _table_by_raw_id(tensor, n_rows, inner_to_raw, largest_raw_id):
    # Evaluate the embedding tensor in the model's session and place row i
    # at the position of the raw id that the model's internal index i maps to.
    # NOTE(review): presumably these tensors are the final LightGCN
    # embeddings -- confirm against the recommenders implementation.
    rows = list(tensor.eval(session=model.sess))
    table = [[] for _ in range(largest_raw_id + 1)]
    for inner_id in range(n_rows):
      table[inner_to_raw[inner_id]] = rows[inner_id]
    return table

  # user embeddings
  user_embeddings = _table_by_raw_id(
    model.ua_embeddings, model.n_users, model.data.id2user, train['userID'].max())

  # movie embeddings
  movie_embeddings = _table_by_raw_id(
    model.ia_embeddings, model.n_items, model.data.id2item, train['itemID'].max())

  print('User item embeddings are created.')

  return (user_embeddings, movie_embeddings)

#### Create image embeddings with ResNet

In [None]:
from img2vec_pytorch import Img2Vec
from PIL import Image

def create_image_embeddings(img_df):
  """Compute (or load from cache) one embedding vector per movie poster.

  If ``img_embeddings.npy`` exists in the working directory it is loaded and
  returned as is. Otherwise every file listed in ``img_df`` is embedded with
  ``Img2Vec`` and the result is saved to that file.

  Args:
    img_df: DataFrame with an integer ``id`` column and a ``file`` column
      holding the poster path for that id.

  Returns:
    Array in which row ``k`` is the embedding of id ``k``. Ids without a
    poster file (and ids absent from ``img_df``) keep an all-zero row.
  """
  print('Creating image embeddings...')

  cache_file = 'img_embeddings.npy'
  if os.path.isfile(cache_file):
    # NOTE(review): allow_pickle=True is kept so cache files written by
    # earlier runs (which stored an object array) still load. Unpickling can
    # execute arbitrary code, so only load cache files you created yourself.
    with open(cache_file, 'rb') as f:
      embeddings = np.load(f, allow_pickle=True)

    print('Image embeddings are already created.')

    return embeddings

  # Local import: torch is only needed here, to decide whether a GPU is usable.
  import torch

  # Length of one embedding vector; matches the zero placeholder and the
  # `movie_poster_embeddings` model input used elsewhere in this notebook.
  # Assumes Img2Vec's default model emits vectors of this size -- TODO confirm.
  vector_size = 512

  # Pre-allocate a dense matrix. A list with empty placeholder slots becomes a
  # ragged array, which recent NumPy versions refuse to build.
  embeddings = np.zeros((img_df['id'].max() + 1, vector_size), dtype=np.float32)

  # Fall back to the CPU when no CUDA device is available.
  img2vec = Img2Vec(cuda=torch.cuda.is_available())

  for ind, img in img_df.iterrows():
    if not os.path.isfile(img['file']):
      # No poster for this movie: leave its row at zero.
      continue

    print(img['file'])

    # Close the file handle as soon as the pixels have been converted.
    with Image.open(img['file']) as raw_img:
      imgf = raw_img.convert('RGB')
    embeddings[img['id']] = img2vec.get_vec(imgf)

  with open(cache_file, 'wb') as f:
    np.save(f, embeddings)

  print('Image embeddings are created.')

  return embeddings

#### Rotate array by d elements using temp array

In [None]:
# Source https://www.geeksforgeeks.org/python-program-for-program-for-array-rotation-2/
# function to rotate array by d elements using temp array
def rotateArray(arr, n, d):
    """Rotate the first ``n`` elements of ``arr`` left by ``d`` positions, in place.

    Args:
        arr: List to rotate. It is overwritten with the rotated first ``n``
            elements.
        n: Number of leading elements that take part in the rotation
            (normally ``len(arr)``).
        d: Number of positions to rotate by. Values of ``n`` or more wrap
            around instead of indexing past the end of the list.

    Returns:
        The same list object ``arr``.
    """
    if n <= 0:
        # Nothing takes part in the rotation.
        arr[:] = []
        return arr

    d %= n
    # Slice assignment keeps the caller's list object and replaces its content.
    arr[:] = arr[d:n] + arr[:d]
    return arr

### Model

In [None]:
from surprise import SVD

def build_SVD_model():
  """Create an untrained surprise ``SVD`` model from the notebook's SVD settings."""
  svd_params = {
    'verbose': True,
    'n_factors': SVD_EMBED_SIZE,
    'n_epochs': SVD_EPOCHS,
  }
  return SVD(**svd_params)

In [None]:
from keras import layers
from keras.utils.vis_utils import plot_model

def build_NeuMF_model(ncf_layers):
  """Build the (uncompiled) NeuMF-style rating model.

  The model has two branches that are concatenated before the output unit:
  an element-wise product of the movie and user embeddings (``gmf_layer``),
  and a stack of Dense + BatchNormalization layers over all five inputs.

  Args:
    ncf_layers: Sequence with the number of units of each Dense layer in the
      stacked branch, in order.

  Returns:
    A ``keras.Model`` taking the inputs ``movie_features``,
    ``movie_poster_embeddings``, ``movie_embeddings``, ``user_embeddings``
    and ``user_features`` and producing a single value per sample.
  """
  # 21 = 19 genre flags + release_year + avg_rating.
  movie_features_input = keras.Input(shape=(21,), name="movie_features")
  movie_poster_embeddings_input = keras.Input(shape=(512,), name="movie_poster_embeddings")
  movie_embeddings_input = keras.Input(shape=(LGCN_EMBED_SIZE,), name="movie_embeddings")
  user_embeddings_input = keras.Input(shape=(LGCN_EMBED_SIZE,), name="user_embeddings")
  # 25 = age + one-hot gender/occupation columns + avg_rating; the one-hot
  # width depends on the data -- TODO confirm when the dataset changes.
  user_features_input = keras.Input(shape=(25,), name="user_features")

  model_inputs = [
    movie_features_input,
    movie_poster_embeddings_input,
    movie_embeddings_input,
    user_embeddings_input,
    user_features_input,
  ]

  # Element-wise product of the two LightGCN embeddings.
  gmf_layer = keras.layers.Multiply(name="gmf_layer")([movie_embeddings_input, user_embeddings_input])

  # Stacked branch over the concatenation of every input.
  mlp_branch = layers.concatenate(model_inputs)
  for units in ncf_layers:
    mlp_branch = layers.Dense(units, activation="relu")(mlp_branch)
    # Dropout is currently disabled (see NCF_DROPOUT in the config cell).
    mlp_branch = layers.BatchNormalization()(mlp_branch)

  merged = layers.concatenate([gmf_layer, mlp_branch])
  rating_output = layers.Dense(1, activation="relu")(merged)

  return keras.Model(inputs=model_inputs, outputs=rating_output)

### Crossvalidation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def crossvalidate_model(model, train_fn, predict_fn, movies, users, cv_buckets, n_train):
  """Run one train/predict round per bucket and collect RMSE and MAE.

  Before every round the buckets are rotated by one position; the first
  ``n_train`` buckets then form the training set and the remaining ones the
  test set.

  Args:
    model: Object handed unchanged to ``train_fn`` in every round.
    train_fn: Callable ``(model, movies, users, train_df)`` returning a
      trained model.
    predict_fn: Callable ``(trained_model, test_df)`` returning one
      prediction per row of ``test_df``.
    movies: Movies DataFrame, passed through to ``train_fn``.
    users: Users DataFrame, passed through to ``train_fn``.
    cv_buckets: List of rating DataFrames. Not modified.
    n_train: Number of buckets used for training in each round.

  Returns:
    Tuple ``(rmse, mae)`` of lists holding one value per round.
  """
  rmse = []
  mae = []

  # Rotate a private copy so the caller's bucket order is never disturbed,
  # even if a round fails half-way through.
  buckets = list(cv_buckets)

  for round_no in range(1, len(buckets) + 1):
    print('Cross-validation round: %s' % (str(round_no)))
    # sort out data from buckets to splits
    buckets = rotateArray(buckets, len(buckets), 1)
    train_df = pd.concat(buckets[:n_train])
    test_df = pd.concat(buckets[n_train:])

    # train model
    trained_model = train_fn(model, movies, users, train_df)

    # predict
    predictions = predict_fn(trained_model, test_df)

    y_actual = test_df['rating'].to_list()

    # Take the root explicitly: the `squared=False` switch of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    c_rmse = math.sqrt(mean_squared_error(y_actual, predictions))
    rmse.append(c_rmse)
    c_mae = mean_absolute_error(y_actual, predictions)
    mae.append(c_mae)

    print("End of cross-validation round. RMSE=%s, MAE=%s | AVG(RMSE)=%s, AVG(MAE)=%s." % (str(c_rmse), str(c_mae), str(statistics.mean(rmse)), str(statistics.mean(mae))))

  return (rmse, mae)

#### AVG train & predict

In [None]:
from surprise import Dataset
from surprise import Reader

def train_avg(model, movies_df, users_df, train_df):
  """Fit the average-rating baseline.

  Args:
    model: Unused; present to match the ``train_fn`` signature.
    movies_df: Movies DataFrame indexed by movie id. Not modified.
    users_df: Unused; present to match the ``train_fn`` signature.
    train_df: Training ratings.

  Returns:
    Single-column DataFrame (``avg_rating``) indexed by movie id.
  """
  with_averages = prepare_movies_avg_ratings(movies_df.copy(), train_df)
  averages = with_averages['avg_rating']
  # Fill any remaining gap with the overall mean of the per-movie averages.
  return averages.fillna(averages.mean()).to_frame()

In [None]:
def predict_avg(model, test_df):
  """Predict every test rating as the movie's average training rating.

  Args:
    model: DataFrame indexed by movie id with an ``avg_rating`` column (the
      value returned by ``train_avg``).
    test_df: DataFrame with an ``item_id`` column.

  Returns:
    List with one predicted rating per row of ``test_df``, in row order.

  Raises:
    KeyError: If a test movie id is missing from ``model``.
  """
  # One vectorised label lookup instead of a Python-level loop over the rows.
  return model.loc[test_df['item_id'].to_numpy(), 'avg_rating'].tolist()

#### SVD train & predict

In [None]:
from surprise import Dataset
from surprise import Reader

def train_svd(model, movies_df, users_df, train_df):
  """Fit the surprise SVD model on the training ratings.

  Args:
    model: surprise algorithm instance; fitted in place.
    movies_df: Unused; present to match the ``train_fn`` signature.
    users_df: Unused; present to match the ``train_fn`` signature.
    train_df: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.

  Returns:
    The same ``model`` object after fitting.
  """
  rating_triples = train_df[['user_id', 'item_id', 'rating']]
  # Ratings are declared to lie on a 1-5 scale.
  dataset = Dataset.load_from_df(rating_triples, Reader(rating_scale=(1, 5)))

  model.fit(dataset.build_full_trainset())

  return model

In [None]:
def predict_svd(model, test_df):
  """Predict a rating for every row of ``test_df`` with a fitted model.

  Args:
    model: Object whose ``predict(uid=..., iid=...)`` returns a result with
      an ``est`` attribute.
    test_df: DataFrame with ``user_id`` and ``item_id`` columns.

  Returns:
    List with one estimated rating per row, in row order.
  """
  return [
    model.predict(uid=row['user_id'], iid=row['item_id']).est
    for _, row in test_df.iterrows()
  ]

#### NeuMF train & predict

In [None]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

def train_ncf(model, movies_df, users_df, train_df):
  """Prepare all model inputs and train a fresh copy of the NeuMF model.

  Steps: build and normalise the movie/user feature tables, obtain the poster
  embeddings, train LightGCN for the user/movie embeddings, assemble one input
  row per training rating, then fit a clone of ``model`` with early stopping
  and best-checkpoint saving.

  Args:
    model: Uncompiled Keras model from ``build_NeuMF_model``. It is cloned,
      so the object passed in is not trained.
    movies_df: Raw movies DataFrame indexed by movie id.
    users_df: Raw users DataFrame indexed by user id.
    train_df: Training ratings (``user_id``, ``item_id``, ``rating``).

  Returns:
    Tuple ``(nn_model, movies_df, users_df, movie_embeddings,
    movie_poster_embeddings, user_embeddings)`` in the layout expected by
    ``predict_ncf``. ``nn_model`` is reloaded from the ``ncf_model``
    checkpoint; the two frames are the prepared, normalised feature tables.
  """
  movies_df = normalize_df(prepare_movies_avg_ratings(prepare_movie_features(movies_df), train_df))
  users_df = normalize_df(prepare_users_avg_ratings(prepare_user_features(users_df), train_df))

  # create image embeddings for movie posters
  img_df = movies_df.reset_index()[['id']].copy()
  img_df['file'] = './img/' + img_df['id'].apply(str) + '.jpg'

  movie_poster_embeddings = create_image_embeddings(img_df)

  # create user and movie embeddings
  # Movies/users without any training rating get one dummy rating (against a
  # placeholder user/movie id) so that LightGCN produces an embedding for them.
  next_movie_id = movies_df.index.max() + 1
  next_user_id = users_df.index.max() + 1
  avg_rating = round(train_df.rating.mean())

  rated_movie_ids = set(train_df['item_id'])
  rating_user_ids = set(train_df['user_id'])
  dummy_rows = [
    {'user_id': next_user_id, 'item_id': movie_id, 'rating': avg_rating}
    for movie_id in movies_df.index if movie_id not in rated_movie_ids
  ]
  dummy_rows += [
    {'user_id': user_id, 'item_id': next_movie_id, 'rating': avg_rating}
    for user_id in users_df.index if user_id not in rating_user_ids
  ]
  if dummy_rows:
    # A single concat replaces row-by-row DataFrame.append, which copies the
    # whole frame on every call and no longer exists in pandas 2.x.
    train_df = pd.concat([train_df, pd.DataFrame(dummy_rows)], ignore_index=True)

  # create embeddings
  user_embeddings, movie_embeddings = create_user_item_embeddings(train_df, train_df)
  # delete dummy ratings
  train_df = train_df[(train_df.item_id != next_movie_id) & (train_df.user_id != next_user_id)]

  # One input row per training rating, looked up by raw movie / user id.
  item_ids = train_df['item_id'].astype(int).to_numpy()
  user_ids = train_df['user_id'].astype(int).to_numpy()

  x_mf = movies_df.loc[item_ids].to_numpy()
  x_mie = np.array([movie_poster_embeddings[item_id] for item_id in item_ids])
  x_me = np.array([movie_embeddings[item_id] for item_id in item_ids])
  x_ue = np.array([user_embeddings[user_id] for user_id in user_ids])
  x_uf = users_df.loc[user_ids].to_numpy()

  y_train = train_df['rating'].to_numpy()

  # Clone so that every cross-validation round starts from fresh weights.
  model_copy = keras.models.clone_model(model)

  opt = keras.optimizers.Adam(learning_rate=NCF_LR)

  # NOTE(review): 'accuracy' carries little meaning for a regression target;
  # it is only logged and does not drive early stopping or checkpointing.
  model_copy.compile(optimizer=opt,
              loss='mean_squared_error',
              metrics=['accuracy'])

  es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=NCF_ES_PATIENCE)
  mc = ModelCheckpoint('ncf_model', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

  model_copy.fit(
    {"movie_features": x_mf, "movie_poster_embeddings": x_mie, "movie_embeddings": x_me, "user_embeddings": x_ue, "user_features": x_uf},
    y_train,
    validation_split=0.2,
    batch_size=NCF_BATCH_SIZE,
    epochs=NCF_EPOCHS,
    verbose=1,
    callbacks=[es, mc]
  )

  # Return the checkpointed (best validation loss) weights, not the last epoch's.
  return (keras.models.load_model('ncf_model'), movies_df, users_df, movie_embeddings, movie_poster_embeddings, user_embeddings)

In [None]:
def predict_ncf(model, test_df):
  """Predict ratings for ``test_df`` with a trained NeuMF bundle.

  Args:
    model: Tuple ``(nn_model, movies_df, users_df, movie_embeddings,
      movie_poster_embeddings, user_embeddings)`` as returned by
      ``train_ncf``.
    test_df: DataFrame with ``user_id`` and ``item_id`` columns.

  Returns:
    Whatever ``nn_model.predict`` returns for the assembled inputs.
  """
  nn_model, movies_df, users_df, movie_embeddings, movie_poster_embeddings, user_embeddings = model

  # Raw ids of every test row, used as lookup keys for all five inputs.
  item_ids = [int(raw_id) for raw_id in test_df['item_id']]
  user_ids = [int(raw_id) for raw_id in test_df['user_id']]

  inputs = {
    "movie_features": np.array([movies_df.loc[item_id].to_list() for item_id in item_ids]),
    "movie_poster_embeddings": np.array([movie_poster_embeddings[item_id] for item_id in item_ids]),
    "movie_embeddings": np.array([movie_embeddings[item_id] for item_id in item_ids]),
    "user_embeddings": np.array([user_embeddings[user_id] for user_id in user_ids]),
    "user_features": np.array([users_df.loc[user_id].to_list() for user_id in user_ids]),
  }

  return nn_model.predict(inputs)

#### Run crossvalidation

In [None]:
# load datasets
movies_df = load_movies()
users_df = load_users()
ratings_df = load_ratings()

# prepare for crossvalidation
# The same buckets are reused by every model below so their scores are
# computed on identical splits.
cv_buckets = create_crossval_buckets(ratings_df, CV_NUM_FOLDS)

##### AVG

In [None]:
# Baseline: predict each movie's average training rating (no model object needed).
rmse, mae = crossvalidate_model(None, train_avg, predict_avg, movies_df, users_df, cv_buckets, n_train=math.floor(CV_NUM_FOLDS * CV_TRAIN_RATIO))

# Mean scores over all rounds...
print(statistics.mean(rmse))
print(statistics.mean(mae))

# ...followed by the per-round values.
print(rmse)
print(mae)

##### SVD

In [None]:
# SVD: the same model object is passed to train_svd in every round.
model = build_SVD_model()

rmse, mae = crossvalidate_model(model, train_svd, predict_svd, movies_df, users_df, cv_buckets, n_train=math.floor(CV_NUM_FOLDS * CV_TRAIN_RATIO))

# Mean scores over all rounds...
print(statistics.mean(rmse))
print(statistics.mean(mae))

# ...followed by the per-round values.
print(rmse)
print(mae)

##### NCF

In [None]:
# NeuMF: train_ncf clones this template, so every round starts from fresh weights.
model = build_NeuMF_model(ncf_layers=[686, 1024, 512, 128, 32])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

rmse, mae = crossvalidate_model(model, train_ncf, predict_ncf, movies_df, users_df, cv_buckets, n_train=math.floor(CV_NUM_FOLDS * CV_TRAIN_RATIO))

# Mean scores over all rounds...
print(statistics.mean(rmse))
print(statistics.mean(mae))

# ...followed by the per-round values.
print(rmse)
print(mae)