In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random

# Put the notebook's parent directory on sys.path so the sibling `load` and
# `evals` modules can be imported below.
# A notebook cell has no real source file: inspect.getfile() on the current
# frame returns whatever pseudo-filename the running IPython/ipykernel version
# assigns to the cell (on recent kernels a path inside a temp directory), so the
# directory is taken from the working directory instead. The relative data path
# used further down ('../data/...') relies on the same working directory.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from load import *
from evals import *

import warnings
# Blanket filter: silences every Python warning for the rest of the kernel
# session, including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply, Conv1D
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical

SEED = 42

# Seed every random number generator the notebook touches so that a fresh
# "Restart & Run All" reproduces the same split and the same initial weights.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so assigning it here
# does not change hashing in this kernel; it is only inherited by child processes.
os.environ['PYTHONHASHSEED']=str(SEED)

gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so enable it on all of them rather than only on the first device.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set at program start, before the GPUs are initialised.
        print(e)
        
def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""
    soft = tf.math.softplus(x)
    gate = tf.math.tanh(soft)
    return x * gate

def leakyrelu(x, factor=0.2):
    """Leaky ReLU expressed as the element-wise maximum of ``x`` and ``factor * x``."""
    scaled = factor * x
    return tf.maximum(x, scaled)

## Load

In [3]:
# Read the MovieLens-100k interaction file through the project loader.
# NOTE(review): `threshold=3` presumably binarises the ratings into the 0/1
# `rating` label used for training below -- confirm against load.py.
df = load_data('../data/ml-100k/u.data', threshold=3)

# Distinct user ids and distinct movie ids; their counts size the model's fields.
uuid, uiid = (df[column].unique() for column in ('userId', 'movieId'))


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the interactions for evaluation. Stratifying on userId keeps
# every user's share of rows the same in the train and test parts.
train, test = train_test_split(
    df,
    test_size=0.15,
    random_state=SEED,
    stratify=df['userId'].values,
)

In [5]:
def _id_matrix(frame):
    """Return an int32 array of shape (len(frame), 2) holding (userId, movieId) per row."""
    id_columns = [frame[name].values.astype(np.int32) for name in ('userId', 'movieId')]
    return np.stack(id_columns, 1)


tr_X = _id_matrix(train)
test_X = _id_matrix(test)

tr_X.shape, test_X.shape

((85000, 2), (15000, 2))

## Model

In [6]:
class CIN(layers.Layer):
    """Compressed Interaction Network over a stack of field embeddings.

    Every cross layer takes the pairwise element-wise products between the
    previous feature maps and the raw field embeddings, and compresses them into
    ``cross_layer_sizes[k]`` new feature maps with a kernel-size-1 convolution.

    Args:
        cross_layer_sizes: number of feature maps produced by each cross layer.
        activation: optional Keras activation (name or callable) applied after
            every cross layer; ``None`` keeps the layers linear.
        include_input: when True (default, the historical behaviour of this
            layer) the raw field embeddings are sum-pooled together with the
            cross-layer feature maps. Set to False to pool only the cross-layer
            outputs, which is how the xDeepFM paper defines the CIN output.

    Call input:  tensor of shape (batch, n_fields, emb_dim) with static
                 ``n_fields`` and ``emb_dim``.
    Call output: tensor of shape (batch, n_maps), where ``n_maps`` is
                 ``sum(cross_layer_sizes)`` plus ``n_fields`` if ``include_input``.
    """

    def __init__(self, cross_layer_sizes, activation=None, include_input=True):
        super(CIN, self).__init__()
        if not include_input and len(cross_layer_sizes) == 0:
            raise ValueError('CIN needs at least one cross layer when include_input=False')

        self.cross_layer_sizes = cross_layer_sizes
        self.n_layers = len(cross_layer_sizes)
        self.include_input = include_input
        self.activation = Activation(activation) if activation else None

        # channels_first: the flattened (previous map x field) axis is treated as
        # the channel axis and the embedding axis as the sequence axis, so a
        # kernel of width 1 learns one weighted sum of the pairwise products that
        # is shared across all embedding positions.
        self.cross_layers = [
            Conv1D(cross_layer_size, 1, data_format='channels_first')
            for cross_layer_size in cross_layer_sizes
        ]

    def call(self, inputs): # embedding is input
        _, field_size, emb_size = inputs.shape
        xs = [inputs]

        for layer in self.cross_layers:
            prev = xs[-1]
            # Outer product over the map/field axes, kept separate per embedding
            # position: (batch, h_prev, n_fields, emb_dim).
            x = tf.einsum('nie,nje->nije', prev, inputs)
            x = tf.reshape(x, (-1, field_size * prev.shape[1], emb_size))

            x = layer(x)
            if self.activation is not None:
                x = self.activation(x)

            xs.append(x)

        pooled = xs if self.include_input else xs[1:]
        # Sum pooling over the embedding axis leaves one scalar per feature map.
        return tf.reduce_sum(tf.concat(pooled, axis=1), -1)

In [7]:
class xDFM(Model):
    """xDeepFM-style scorer for rows of categorical ids.

    The output is the sum of three single-unit branches and is a raw logit (no
    sigmoid is applied here):
      * linear: a Dense(1) over the concatenated one-hot encoding of every field,
      * dnn:    an MLP over the flattened field embeddings,
      * cin:    a CIN over the stacked field embeddings followed by a Dense(1).

    Args:
        x_dims: cardinality of each categorical field, in input-column order.
        latent_dim: embedding size shared by all fields.
        cin_layers: feature-map counts of the CIN cross layers.
        dnn_layers: hidden-unit counts of the MLP branch.
        activation: activation used in the MLP and CIN branches.
        l2_emb: L2 regularisation factor applied to the embedding table.

    Call input:  integer tensor of shape (batch, len(x_dims)).
    Call output: tensor of shape (batch, 1).
    """

    def __init__(self, x_dims, latent_dim, cin_layers, dnn_layers, activation=None, l2_emb=1e-4):
        super(xDFM, self).__init__()
        self.x_dims = x_dims

        # Start index of each field inside the shared embedding table:
        # (0, x_dims[0], x_dims[0] + x_dims[1], ...). Built once and pinned to
        # int32 so the addition in call() does not depend on the platform's
        # default NumPy integer width.
        self.field_offsets = np.concatenate(([0], np.cumsum(x_dims)[:-1])).astype(np.int32)

        # One table for all fields; the extra row leaves room for ids that start at 1.
        # `input_length` is intentionally not passed: each row carries
        # len(x_dims) ids, not 1, and the argument is deprecated in newer Keras.
        self.embedding = Embedding(sum(x_dims)+1, latent_dim, embeddings_regularizer=l2(l2_emb))
        self.flatten = Flatten()

        self.linear = Dense(1)

        self.dnn_layers = [Dense(n, activation=activation) for n in dnn_layers]
        self.dnn_final = Dense(1)

        self.cin_layers = CIN(cin_layers, activation=activation)
        self.cin_final = Dense(1)

    def call(self, inputs):
        # Work on int32 ids regardless of the dtype the caller passed in.
        ids = tf.cast(inputs, tf.int32)

        # only apply ohe for categorical
        # NOTE(review): tf.one_hot returns an all-zero row for an index outside
        # [0, depth). If the ids are 1-based, the largest id of each field gets
        # no linear weight -- confirm that load_data re-indexes ids from 0.
        n_feat = ids.shape[-1]
        sparse = tf.concat(
            [tf.one_hot(ids[:, i], self.x_dims[i]) for i in range(n_feat)], 1)

        # Shift every column into its own slice of the shared embedding table.
        emb = self.embedding(ids + tf.constant(self.field_offsets, dtype=tf.int32))

        linear_out = self.linear(sparse)

        dnn_out = self.flatten(emb)
        for dnn_layer in self.dnn_layers:
            dnn_out = dnn_layer(dnn_out)
        dnn_out = self.dnn_final(dnn_out)

        cin_out = self.cin_layers(emb)
        cin_out = self.cin_final(cin_out)

        out = linear_out + dnn_out + cin_out

        return out
        

## Train

In [8]:
# Two categorical fields (users, movies), 8-dimensional embeddings, two CIN
# layers of 32 feature maps and a 128-64 MLP, all with ReLU.
xdfm = xDFM(
    x_dims=(len(uuid), len(uiid)),
    latent_dim=8,
    cin_layers=[32, 32],
    dnn_layers=[128, 64],
    activation='relu',
)

In [9]:
# easily overfitting, reduce epochs
# The model emits raw logits, hence from_logits=True on the loss.
xdfm.compile(
    optimizer=optimizers.Adam(),
    loss=losses.BinaryCrossentropy(from_logits=True),
)

# Last expression of the cell: the returned History object is displayed.
xdfm.fit(
    x=tr_X,
    y=train['rating'].values,
    epochs=5,
    shuffle=True,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x155ab9c9508>

## Eval

In [10]:
# Raw logits for the held-out rows (the model has no output sigmoid, see the
# from_logits=True loss used for training).
pred = xdfm.predict(x=test_X)


In [11]:
# Accuracy on the test split: a logit above 0 counts as a positive prediction.
np.mean((pred.flatten() > 0.).astype(int) == test['rating'].values)

0.7232

In [12]:
from sklearn.metrics import precision_score, recall_score,  roc_auc_score, precision_recall_fscore_support

# ROC-AUC is computed on the raw logits; precision and recall use the labels
# obtained by thresholding the logits at 0.
y_true = test['rating'].values
y_label = np.where(pred>0., 1, 0)

print(roc_auc_score(y_true, pred))
print(precision_score(y_true, y_label))
print(recall_score(y_true, y_label))

0.7906122962149786
0.7255305326981377
0.8056023082471748
