In [None]:
import os
import math
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from keras.utils import plot_model
from scipy.sparse import csr_matrix
from sklearn.metrics import make_scorer

import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import initializers
from tensorflow.python.keras.models import Model
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint

from tensorflow.python.keras.models import model_from_json
from tensorflow.keras import optimizers
from tensorflow.python.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.python.keras.layers import add, concatenate
from tensorflow.python.keras.layers import Input, Dense, Embedding, Flatten
from tensorflow.keras.layers import  Dropout,Activation, BatchNormalization, LeakyReLU
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
%matplotlib inline
warnings.filterwarnings('ignore')

load the data

In [None]:
# Raw inputs. In the two rating tables the sentinel -1 is parsed as NaN.
_MISSING_RATING = -1
db = pd.read_csv("db-ratings.csv", na_values=_MISSING_RATING)
dbmovies = pd.read_csv("db-movies.csv")
ml = pd.read_csv("ml-ratings.csv", na_values=_MISSING_RATING)
links = pd.read_csv("links.csv")

Data preprocessing

In [None]:
# Keep only the movie columns used downstream, then join them onto the ratings by MOVIE_ID.
# The join is 'outer'; rows left with missing fields are removed by the dropna in the next cell.
dbmovies = dbmovies.loc[:, ["MOVIE_ID", "NAME", "IMDB_ID"]]
db = db.merge(dbmovies, how="outer", on="MOVIE_ID")

In [None]:
##clean the data: discard every row that has at least one missing field
db = db.dropna(axis=0, how="any")

In [None]:
# Strip the leading 't' characters of the IMDb identifier and store the remainder as an integer.
db['IMDB_ID_cleaned'] = db['IMDB_ID'].str.strip('tt').astype(int)

In [None]:
# Remove the columns that are no longer needed once the integer IMDb id exists.
db = db.drop(columns=['RATING_ID', 'MOVIE_ID', 'RATING_TIME', 'NAME', 'IMDB_ID'])

In [None]:
##rename the columns so that they match the MovieLens naming (userId / rating / imdbId)
# NOTE(review): this is a positional rename - it assumes exactly three columns remain and that
# they are, in order, the user id, the rating and IMDB_ID_cleaned; confirm against the CSV layout.
db.columns = ['userId', 'rating','imdbId']

In [None]:
##statistics about the db data: % missing per column, unique users, unique movie ids, row count
percent_missing = db.isnull().sum() * 100 / len(db)
dbunique_users = db['userId'].unique()
dbunique_imdbid = db['imdbId'].unique()
print(percent_missing)
print(f"missing values: {round(percent_missing['rating'], 2)}%")
print(f"dbunique users: {len(dbunique_users)}")
print(f"dbunique imdbid: {len(dbunique_imdbid)}")
print(len(db))

In [None]:
##MovieLens: attach the imdbId from the links table to every rating, drop the
##unnecessary columns, then discard rows with any missing field
ml = (
    ml.merge(links, how="outer", on="movieId")
      .drop(columns=['movieId', 'timestamp', 'tmdbId'])
      .dropna()
)

In [None]:
# Same summary as for Douban: % missing ratings, unique users, unique movie ids, row count.
percent_missingml = ml.isnull().sum() * 100 / len(ml)
mlunique_users = ml['userId'].unique()
mlunique_imdb = ml['imdbId'].unique()

print(f"missing values: {round(percent_missingml['rating'], 2)}%")
print(f"mlunique users: {len(mlunique_users)}")
print(f"mlunique imdb: {len(mlunique_imdb)}")
print(len(ml))

In [None]:
##select the sub-dataset: keep only Douban ratings of movies that also occur in MovieLens
db = db[db['imdbId'].isin(list(mlunique_imdb))]

# refresh the unique-id arrays after filtering
dbunique_users = db['userId'].unique()
dbunique_imdbid = db['imdbId'].unique()

print(f"dbunique users: {len(dbunique_users)}")
print(f"dbunique imdbid: {len(dbunique_imdbid)}")
print(len(db))
print(dbunique_imdbid)

In [None]:
##checking the douban dataset: rating counts of the most-rated movies
def plot_frequency(db, k, title='The relative frequency of top-k movies in Douban'):
    """Plot the relative rating frequency of the k most-rated movies.

    Parameters
    ----------
    db : DataFrame
        Ratings table with an 'imdbId' column (one row per rating).
    k : int
        Number of most frequently rated movies to include.
    title : str, optional
        Figure title; defaults to the Douban title used by the original notebook.

    Draws on a new matplotlib figure and returns None.
    """
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(8,5.5))
    # value_counts is sorted descending; resetting the index replaces movie ids by rank 0..n-1
    top_counts = (
        db['imdbId']
        .value_counts(sort=True, ascending=False)
        .reset_index(drop=True)
        .iloc[:k]
    )
    # share of each movie within the top-k ratings only (not within the whole dataset)
    rel_freq = top_counts / top_counts.sum()

    sns.lineplot(x=rel_freq.index, y=rel_freq, ax=ax)
    ax.fill_between(rel_freq.index, rel_freq, alpha=0.5)
    ax.set_ylabel("Relative frequency")
    ax.set_xlabel("top-k movies")
    ax.set_title(title)

plot_frequency(db, k=5000)
#plt.savefig('The relative frequency of top-k items in Douban')

In [None]:
##select the sub-dataset: keep only MovieLens ratings of movies that remain in Douban
ml = ml[ml['imdbId'].isin(list(dbunique_imdbid))]
mlunique_users = ml['userId'].unique()
mlunique_imdb = ml['imdbId'].unique()

print(f"mlunique users: {len(mlunique_users)}")
print(f"mlunique imdb: {len(mlunique_imdb)}")
print(len(ml))

In [None]:
def plot_frequency2(ml, k, title='The relative frequency of top-k movies in IMDb'):
    """Plot the relative rating frequency of the k most-rated movies.

    Parameters
    ----------
    ml : DataFrame
        Ratings table with an 'imdbId' column (one row per rating).
    k : int
        Number of most frequently rated movies to include.
    title : str, optional
        Figure title; defaults to the title used by the original notebook.
        NOTE(review): the default says "IMDb" while the frame passed below is the
        MovieLens table - confirm the intended wording.

    Draws on a new matplotlib figure and returns None.
    """
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(8,5.5))
    # value_counts is sorted descending; resetting the index replaces movie ids by rank 0..n-1
    top_counts = (
        ml['imdbId']
        .value_counts(sort=True, ascending=False)
        .reset_index(drop=True)
        .iloc[:k]
    )
    # share of each movie within the top-k ratings only (not within the whole dataset)
    rel_freq = top_counts / top_counts.sum()

    sns.lineplot(x=rel_freq.index, y=rel_freq, ax=ax)
    ax.fill_between(rel_freq.index, rel_freq, alpha=0.5)
    ax.set_ylabel("relative frequency")
    ax.set_xlabel("top-k movies")
    ax.set_title(title)

plot_frequency2(ml, k=5000)
#plt.savefig('The relative frequency of top-k items in IMDb')

In [None]:
##select sub-datasets
##keep only the rows belonging to the most frequent values of a column
def select(db, k, col):
    """Return the rows of `db` whose `col` value is among the k most frequent ones.

    Parameters
    ----------
    db : DataFrame to filter.
    k : int, number of most frequent values of `col` to keep.
    col : str, name of the column whose value frequencies are ranked.

    Returns a DataFrame with the original index and row order preserved.
    """
    frequency = db[col].value_counts()
    kept_values = frequency.nlargest(k).index
    is_top_k = db[col].isin(kept_values)
    return db.loc[is_top_k]

# Douban ratings restricted to the 1000 most-rated movies.
dbtop = select(db, k=1000, col='imdbId')

In [None]:
# Unique users / movies in the top-1000 Douban subset, followed by the three sizes.
dbtopuser = dbtop['userId'].unique()
dbtopmovie = dbtop['imdbId'].unique()

print(len(dbtopuser), len(dbtopmovie), len(dbtop), sep='\n')

In [None]:
##restrict MovieLens to the movies kept in dbtop (the top-rated Douban movies)
mlsub = ml[ml['imdbId'].isin(list(dbtopmovie))]

In [None]:
# Unique users / movies in the MovieLens subset, followed by the three sizes.
mlsubuser = mlsub['userId'].unique()
mlsubmovie = mlsub['imdbId'].unique()

print(len(mlsubuser), len(mlsubmovie), len(mlsub), sep='\n')

In [None]:
##check the rating distribution of the Douban subset
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=dbtop, x='rating', palette='Set2', ax=ax)
ax.set_xlabel('Movie ratings of Douban')
plt.show()

In [None]:
##round each MovieLens rating up to the next integer (e.g. 3.5 -> 4)
# `assign` builds a new frame instead of writing a column into `mlsub`, which is a filtered
# slice of `ml`; this avoids pandas' SettingWithCopy ambiguity. np.ceil is applied to the
# whole column at once rather than element by element.
mlsub = mlsub.assign(rating=np.ceil(mlsub['rating']))


In [None]:
# Rating distribution of the MovieLens subset (after rounding up).
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=mlsub, x='rating', palette='Set2', ax=ax)
ax.set_xlabel('Movie ratings of ML')
plt.show()

In [None]:
##use dbtop and mlsub from here on
##reshape dbtop into a user x movie table: rows are users, columns are movies, cells are ratings
# Cells without a rating are NaN. DataFrame.pivot raises ValueError if a
# (userId, imdbId) pair occurs more than once.
dbtop_arr = dbtop.pivot(index = "userId",columns = "imdbId",values = "rating")

Train/val/test split

In [None]:
##Train/test split
##Randomly split the users (rows) 90% / 10%; the validation share is taken later via validation_split
train_db, test_db = train_test_split(dbtop_arr, test_size=0.1, random_state=42)

In [None]:
##data into matrix: float array of the training users, unrated cells encoded as 0
# NaNs are filled on the frame instead of being overwritten inside the array returned by
# to_numpy: that array may share memory with train_db (or be read-only under pandas
# copy-on-write), so writing into it can either fail or silently alter the frame.
matrix_train_db = train_db.fillna(0).to_numpy(dtype = 'float')
matrix_train_db

In [None]:
# Float array of the held-out Douban users, unrated cells encoded as 0.
# NaNs are filled on the frame rather than in the to_numpy result, which may be a
# view of (or a read-only buffer shared with) test_db.
matrix_test_db = test_db.fillna(0).to_numpy(dtype = 'float')

In [None]:
# (users, movies) of the Douban train and test matrices.
print(matrix_train_db.shape, matrix_test_db.shape, sep='\n')

In [None]:
##mlsub data processing: user x movie rating table, NaN where a user has not rated a movie
# DataFrame.pivot raises ValueError if a (userId, imdbId) pair occurs more than once.
mlsub_arr = mlsub.pivot(index = "userId",columns = "imdbId",values = "rating")

In [None]:
##Train/test split
##Randomly split the MovieLens users (rows) 90% / 10%
train_ml, test_ml = train_test_split(mlsub_arr, test_size=0.1, random_state=42)

In [None]:
# Float arrays of the MovieLens train / test users, unrated cells encoded as 0.
# NaNs are filled on the frames rather than in the to_numpy results, which may be
# views of (or read-only buffers shared with) the frames.
matrix_train_ml = train_ml.fillna(0).to_numpy(dtype = 'float')
matrix_test_ml = test_ml.fillna(0).to_numpy(dtype = 'float')

In [None]:
# (users, movies) of the MovieLens train and test matrices.
print(matrix_train_ml.shape, matrix_test_ml.shape, sep='\n')

Functions

In [None]:
##plots of loss & rmse
def show_error(history, skip):
    """Plot training and validation loss per epoch, leaving out the first `skip` epochs.

    history : object whose `.history` dict holds the 'loss' and 'val_loss' lists
              (as returned by `model.fit` with validation data).
    skip    : int, number of initial epochs to omit from the plot.
    """
    train_loss = history.history['loss']
    valid_loss = history.history['val_loss']
    # both curves share the x axis derived from the training-loss length
    epochs = np.arange(skip, len(train_loss), 1)
    for curve in (train_loss, valid_loss):
        plt.plot(epochs, curve[skip:])
    plt.title('model train vs validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()

In [None]:
def show_rmse(history, skip):
    """Plot training and validation clipped masked RMSE per epoch, omitting the first `skip` epochs.

    history : object whose `.history` dict holds 'masked_rmse_clip' and
              'val_masked_rmse_clip' lists.
    skip    : int, number of initial epochs to omit from the plot.
    """
    curves = (
        history.history['masked_rmse_clip'],
        history.history['val_masked_rmse_clip'],
    )
    for values in curves:
        # each curve gets an x axis matching its own length
        plt.plot(np.arange(skip, len(values), 1), values[skip:])
    plt.title('model train vs validation masked_rmse')
    plt.ylabel('rmse')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()

In [None]:
##loss functions

In [None]:
def masked_mse(y_true, y_pred):
    """Mean squared error computed only over the non-zero entries of `y_true`.

    Entries where `y_true` equals 0 are treated as unobserved and contribute
    nothing. The reduction runs over the last axis; the divisor is the number of
    observed entries, floored at 1 so that rows without observations yield 0.
    """
    # 1.0 where a rating is present, 0.0 elsewhere
    observed = K.cast(K.not_equal(y_true, 0), K.floatx())
    residual = observed * (y_true - y_pred)
    squared_sum = K.sum(K.square(residual), axis=-1)
    n_observed = K.maximum(K.sum(observed, axis=-1), 1)
    return squared_sum / n_observed

In [None]:
def masked_rmse(y_true, y_pred):
    """Root mean squared error computed only over the non-zero entries of `y_true`.

    Entries where `y_true` equals 0 are treated as unobserved. The reduction runs
    over the last axis; the divisor is the number of observed entries, floored at 1.
    """
    # 1.0 where a rating is present, 0.0 elsewhere
    observed = K.cast(K.not_equal(y_true, 0), K.floatx())
    residual = observed * (y_true - y_pred)
    squared_sum = K.sum(K.square(residual), axis=-1)
    n_observed = K.maximum(K.sum(observed, axis=-1), 1)
    return K.sqrt(squared_sum / n_observed)

In [None]:
def masked_rmse_clip(y_true, y_pred):
    """Masked RMSE with the predictions first clipped to the interval [1, 5].

    Entries where `y_true` equals 0 are treated as unobserved. The reduction runs
    over the last axis; the divisor is the number of observed entries, floored at 1.
    """
    # 1.0 where a rating is present, 0.0 elsewhere
    observed = K.cast(K.not_equal(y_true, 0), K.floatx())
    clipped_pred = K.clip(y_pred, 1, 5)
    residual = observed * (y_true - clipped_pred)
    squared_sum = K.sum(K.square(residual), axis=-1)
    n_observed = K.maximum(K.sum(observed, axis=-1), 1)
    return K.sqrt(squared_sum / n_observed)

build models

In [None]:
####hyperparameter tuning: create the model and check the architecture first

In [None]:
def Deep_AE_model(X, layers, activation, last_activation, dropout, regularizer_encode, regularizer_decode, side_infor_size=0):
    """Assemble (without compiling) a dense autoencoder with three hidden layers.

    Parameters
    ----------
    X : array with a `.shape`; only `X.shape[1]` is read, as the input width.
    layers : sequence of three ints, the hidden layer widths in order.
    activation : activation of the three hidden layers.
    last_activation : activation of the output layer.
    dropout : dropout rate applied after the second hidden layer.
    regularizer_encode : L2 factor for the first two hidden layers.
    regularizer_decode : L2 factor for the third hidden layer and the output layer.
    side_infor_size : int, number of trailing input columns not reconstructed;
        the output width is `X.shape[1] - side_infor_size`.

    Returns
    -------
    Model mapping the input layer to the output layer.
    """
    n_inputs = X.shape[1]

    def dense(units, act, l2_weight):
        # every layer receives its own L2 regularizer instance
        return Dense(units, activation=act, kernel_regularizer=regularizers.l2(l2_weight))

    input_layer = Input(shape=(n_inputs,), name='UserRating')
    hidden = dense(layers[0], activation, regularizer_encode)(input_layer)
    hidden = dense(layers[1], activation, regularizer_encode)(hidden)
    hidden = Dropout(rate=dropout)(hidden)
    hidden = dense(layers[2], activation, regularizer_decode)(hidden)
    output_layer = dense(n_inputs - side_infor_size, last_activation, regularizer_decode)(hidden)
    return Model(input_layer, output_layer)
    

In [None]:
# Settings of the first, manually configured training run.
# Commented-out alternatives kept from the original cell:
#layers = [100, 50, 100]
#layers = [500,100,500]
# activation = 'sigmoid'
#last_activation = 'linear'
layers = [256, 512, 256]
dropout = 0.8
activation = 'selu'
last_activation = 'selu'
regularizer_encode = 0.001
regularizer_decode = 0.001
# `learning_rate` is the supported keyword (`lr` is a deprecated alias) and matches
# the spelling already used inside create_model.
adam = optimizers.Adam(learning_rate=0.001)

In [None]:
# Build the autoencoder sized to the Douban training matrix and compile it with the
# masked MSE loss; the clipped masked RMSE is tracked as a metric.
# NOTE(review): the string 'adam' makes Keras build an Adam optimizer with default
# settings; the `adam` object configured in the hyperparameter cell is not used here.
model = Deep_AE_model(
    matrix_train_db,
    layers,
    activation,
    last_activation,
    dropout,
    regularizer_encode,
    regularizer_decode,
)
model.compile(loss=masked_mse, optimizer='adam', metrics=[masked_rmse_clip])
model.summary()

In [None]:
# Stop when the *validation* clipped RMSE has not improved for 5 epochs. The fit call
# that uses this callback passes validation_split, so the 'val_' metric is available;
# monitoring the training metric would not react to overfitting. mode='min' states
# explicitly that lower RMSE is better.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_masked_rmse_clip', mode='min', patience= 5)

In [None]:
##fit the model as an autoencoder (input == target); 10% of the rows are held out for validation
hist_Deep_AE = model.fit(
    x=matrix_train_db,
    y=matrix_train_db,
    batch_size=128,
    epochs=100,
    validation_split=0.1,
    callbacks=[stop_early],
    verbose=2,
)

In [None]:
##with the architecture confirmed, use grid search to choose the best hyperparameters


In [None]:
def create_model(activation_function, learning_rate):
    """Build and compile the 256-512-256 autoencoder used by the grid search.

    Parameters
    ----------
    activation_function : activation of the three hidden layers.
    learning_rate : float, learning rate of the Adam optimizer.

    Returns
    -------
    Compiled Model (loss: masked_mse, metric: masked_rmse_clip).

    Notes
    -----
    Reads the notebook-level array `X`: its second dimension fixes both the input
    and the output width, so `X` must be defined before this function is called.
    """
    n_items = X.shape[1]
    input_layer = x = Input(shape=(n_items,))
    x = Dense(256, activation=activation_function, kernel_regularizer=regularizers.l2(0.001))(x)
    x = Dense(512, activation=activation_function, kernel_regularizer=regularizers.l2(0.001))(x)
    x = Dropout(rate = 0.8)(x)
    x = Dense(256, activation=activation_function, kernel_regularizer=regularizers.l2(0.001))(x)
    # the output activation is fixed to 'selu' regardless of activation_function
    output_layer = Dense(n_items, activation='selu', kernel_regularizer=regularizers.l2(0.001))(x)
    model = Model(input_layer, output_layer)
    model.compile(optimizer = 'adam', loss=masked_mse, metrics=[masked_rmse_clip])
    # Bug fix: the original created an Adam instance with `learning_rate` but compiled with
    # the string 'adam', so every model trained with the default rate and the argument was
    # ignored. The rate is set on the compiled optimizer rather than passing an instance from
    # `tensorflow.keras.optimizers`, because this notebook builds `Model` from
    # `tensorflow.python.keras`, which may not accept optimizer objects of the other package
    # in some TensorFlow releases (TODO confirm for the installed version).
    model.optimizer.learning_rate = learning_rate
    return model

In [None]:
#scoring function used by the grid search
def rmse(y_true, y_pred):
    """Mean over rows of the masked, clipped RMSE (NumPy counterpart of masked_rmse_clip).

    Entries where `y_true` equals 0 are ignored, predictions are clipped to [1, 5],
    the RMSE is taken along the last axis with the divisor floored at 1, and the
    per-row values are averaged into a single number.
    """
    observed = np.not_equal(y_true, 0).astype(float)
    clipped_pred = np.clip(y_pred, 1, 5)
    residual = observed * (y_true - clipped_pred)
    n_observed = np.maximum(observed.sum(axis=-1), 1)
    per_row = np.sqrt(np.square(residual).sum(axis=-1) / n_observed)
    return per_row.mean()

In [None]:
# Wrap the RMSE function as a scikit-learn scorer. greater_is_better=False makes the scorer
# report the negated RMSE, so that GridSearchCV (which maximises the score) minimises the error.
# NOTE(review): this rebinds the name `rmse` from the function to the scorer object, so this
# cell must be run exactly once after the cell that defines `rmse`; re-running it alone would
# wrap the scorer itself.
rmse = make_scorer(rmse,greater_is_better = False)

In [None]:
##input data: autoencoder setting, the target equals the input
##`X` is also read by create_model to size the input and output layers
X = Y = matrix_train_db

In [None]:
# Candidate values for the grid search. batch_size is consumed by fit(); the other two
# entries are forwarded to create_model.
batch_size = [128,256,512]
activation_function = ['selu','elu']
learning_rate = [0.01,0.001,0.0001]
param_grid = {
    'batch_size': batch_size,
    'activation_function': activation_function,
    'learning_rate': learning_rate,
}

# scikit-learn compatible wrapper that builds a fresh model for each fit
model1 = KerasRegressor(build_fn=create_model, epochs=30, verbose=2)

# 3-fold cross-validated search over all combinations, scored with the negated masked RMSE
grid = GridSearchCV(
    estimator=model1,
    param_grid=param_grid,
    scoring=rmse,
    cv=3,
    n_jobs=2,
)
grid_result = grid.fit(X, Y)

In [None]:
# Summarize results. best_score_ is the negated RMSE (the scorer was created with
# greater_is_better=False), so values closer to 0 are better.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
##grid search graph

sns.set()

def plot_tuning_results(df_val):
    """Draw three heatmaps of RMSE (learning rate x activation), one per batch size.

    df_val : DataFrame with the columns 'batch_size', 'learning_rate',
             'activation_function' and 'RMSE'.

    The first panel holds every row whose batch size is neither 256 nor 512 and is
    labelled "batch_size: 128"; the other two panels hold the rows for 256 and 512.
    All panels share the fixed colour range 0.4 - 0.55.
    """
    sort_keys = ['learning_rate', 'activation_function']
    is_256 = df_val.batch_size == 256
    is_512 = df_val.batch_size == 512
    subsets = (df_val[~is_256 & ~is_512], df_val[is_256], df_val[is_512])
    titles = ("batch_size: 128", "batch_size: 256", "batch_size: 512")

    # build all pivot tables before any figure is created
    tables = [
        subset.sort_values(by=sort_keys).pivot_table(
            index='learning_rate', columns='activation_function', values='RMSE'
        )
        for subset in subsets
    ]

    fig, axes = plt.subplots(1, 3, figsize=(16,4))
    for ax, table in zip(axes, tables):
        sns.heatmap(
            table,
            vmin=0.4,
            vmax=0.55,
            annot=True,
            linewidths=0.4,
            ax=ax,
            cmap=sns.cubehelix_palette(as_cmap=True),
        )
    for ax, title in zip(axes, titles):
        ax.set_title(title)

In [None]:
# One row per parameter combination, plus the absolute mean CV score (the RMSE) rounded to 3 decimals.
df_grid = pd.DataFrame(grid.cv_results_["params"]).assign(
    RMSE=np.abs(grid.cv_results_["mean_test_score"]).round(3)
)
plot_tuning_results(df_grid)
#plt.savefig('output.eps')

In [None]:
###build a new model with the chosen parameters and train it on the whole training set;
###outside the grid search it can be trained for more epochs
# NOTE(review): the values are hard-coded here - presumably the best grid-search result; confirm.
bestmodel_db = create_model(learning_rate=0.01, activation_function='selu')
bestmodel_db.summary()

In [None]:
# Train on Douban for the full 100 epochs (no early stopping), holding out 10% for validation.
hist_best = bestmodel_db.fit(
    x=matrix_train_db,
    y=matrix_train_db,
    batch_size=512,
    epochs=100,
    validation_split=0.1,
    verbose=2,
)

In [None]:
##evaluate the model on the held-out Douban users
# NOTE(review): this rebinds `test_db`, which previously held the test split DataFrame.
test_db = bestmodel_db.evaluate(x=matrix_test_db, y=matrix_test_db)

In [None]:
##cross-test: evaluate the Douban-trained model on the held-out MovieLens users
# NOTE(review): this rebinds `test_ml`, which previously held the test split DataFrame.
test_ml = bestmodel_db.evaluate(x=matrix_test_ml, y=matrix_test_ml)

In [None]:
##save the weights of the Douban-trained model so that another model can load them later
bestmodel_db.save_weights("db_weights.h5")

In [None]:
##retrain the same model on the other dataset (MovieLens), continuing from the Douban weights
hist_db_retrain = bestmodel_db.fit(
    x=matrix_train_ml,
    y=matrix_train_ml,
    batch_size=512,
    epochs=100,
    verbose=2,
)

In [None]:
##evaluate the retrained model on the held-out MovieLens users
test_ml2 = bestmodel_db.evaluate(x=matrix_test_ml, y=matrix_test_ml)

In [None]:
##additional experiment
##create a new model with different parameters (activation = relu, learning rate = 0.001; batch size 128 is set in the fit call below),
##load the weights of the best model into it, and check whether it performs well on the other dataset
# NOTE(review): the original comment stated lr = 0.01, but the call below passes 0.001 - confirm which was intended.
test_model_db = create_model(activation_function = 'relu',learning_rate=0.001)
test_model_db.summary()

In [None]:
##load the first model's weights from the file written by save_weights above
test_model_db.load_weights("db_weights.h5")

In [None]:
##fit the weight-initialised test model on the MovieLens training matrix
hist_test2 = test_model_db.fit(
    x=matrix_train_ml,
    y=matrix_train_ml,
    batch_size=128,
    epochs=100,
    verbose=2,
)

In [None]:
##evaluate the test model on the held-out MovieLens users
test_db2 = test_model_db.evaluate(x=matrix_test_ml, y=matrix_test_ml)

In [None]:
##the same procedure on the other dataset