In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random
import heapq

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

from load import *
def eval_NDCG(true, pred):
    top_k = pred

    for i, item in enumerate(top_k, 1):
        if item == true:
            return 1 / np.log2(i+1)
    return 0

import warnings
warnings.filterwarnings('ignore')

In [2]:
from tensorflow import keras as K
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
        print(e)
        
def mish(x):
    return x*tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor*x)

In [3]:
df = load_data('../data/ml-100k/u.data', threshold=3)
df = df[df['rating']==1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)

# 10개 이상 평가한 유저만 포함 => 0이 나오는 문제가 발생하여
cnt = tdf.sum(1)
df = df[df['userId'].isin(np.where(cnt >= 10)[0])].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:,:] = 0

In [4]:
test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))
    
train = df.iloc[list(set(df.index)-set(test_idx)),:]
test = df.iloc[test_idx, :]

for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train =  tdf.copy().astype(np.float32)

loader = tf.data.Dataset.from_tensor_slices(train.values.astype(np.float32))
loader = loader.batch(8, drop_remainder=True).shuffle(len(train))

In [5]:
def log_norm_pdf(x, mu, logvar):
    return -0.5*(logvar + tf.math.log(2 * np.pi) + tf.pow((x - mu), 2) / tf.exp(logvar))

def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.random.normal(shape=(batch, dim), stddev=0.01)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [6]:
class CompositePrior(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim, mixture_weights = [3/20, 15/20, 2/20]):
        super().__init__()
        self.encoder_old = Encoder(x_dim, latent_dim, dropout_rate=0)
        self.latent_dim = latent_dim
        self.mixture_weights = mixture_weights
        
        self.mu_prior = self.add_weight(shape=(self.latent_dim, ), initializer = tf.zeros_initializer(), trainable=False)
        self.logvar_prior  = self.add_weight(shape=(self.latent_dim, ), initializer = tf.zeros_initializer(), trainable=False)
        self.logvar_unif_prior = self.add_weight(shape=(self.latent_dim, ), initializer = tf.constant_initializer(10), trainable=False)
        
    def call(self, x, z):
        post_mu, post_logvar = self.encoder_old(x)
        
        stnd_prior = log_norm_pdf(z, self.mu_prior, self.logvar_prior)
        post_prior = log_norm_pdf(z, post_mu, post_logvar)
        unif_prior = log_norm_pdf(z, self.mu_prior, self.logvar_unif_prior)
        
        gaussians = [stnd_prior, post_prior, unif_prior]
        gaussians = [g+tf.math.log(w) for g, w in zip(gaussians, self.mixture_weights)]
        
        density = tf.stack(gaussians, -1)
        return tf.math.log(tf.reduce_sum(tf.exp(density), -1)) # logsumexp

In [7]:
class Encoder(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim, dropout_rate = 0.1):
        super().__init__()
        self.latent_dim = latent_dim
        self.x_dim = x_dim
        self.dropout_rate = dropout_rate
        self.model = self.build_model()
        
    def build_model(self): # now just shallow net
        x_in = Input(shape=(self.x_dim, ))
        
        h = Dense(1024, activation='relu')(x_in)
        mu = Dense(self.latent_dim)(h)
        logvar = Dense(self.latent_dim)(h)
        
        return Model(x_in, [mu, logvar])
        
    def call(self, x):
        norm = tf.sqrt(tf.reduce_sum(tf.pow(x, 2), -1, keepdims=True))
        x = x/norm
        if self.dropout_rate>0:
            x = Dropout(self.dropout_rate)(x)
        
        return self.model(x)

class RecVAE(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim):
        super().__init__()
        
        self.encoder = Encoder(x_dim, latent_dim)
        self.decoder = Dense(x_dim)
        self.prior = CompositePrior(x_dim, latent_dim)
        
    def call(self, data):
        mu, logvar = self.encoder(data)
        z = sampling([mu, logvar])
        recon = self.decoder(z)
        
        return mu, logvar, z, recon
    
    def predict(self, data):
        mu, logvar = self.encoder(data)
        z = sampling([mu, logvar])
        recon = self.decoder(z)
        
        return recon
    
    def update_prior(self):
        self.prior.encoder_old.set_weights(self.encoder.get_weights())

In [8]:
def tf_train(model, loader, optimizer, target, gamma=1.):
    total_loss = 0.
    for x in loader:
        norm = tf.reduce_sum(x, -1, keepdims=True)
        kl_weight = gamma*norm
        
        with tf.GradientTape() as tape:
            mu, logvar, z, pred = model(x)
            
#             kl_loss = tf.reduce_mean(tf.reduce_sum(0.5*(logvar + tf.exp(logvar) + tf.pow(mu, 2)-1), 1, keepdims=True))
            kl_loss = tf.reduce_mean(log_norm_pdf(z, mu, logvar) - tf.multiply(model.prior(x, z), kl_weight))
            ce_loss = -tf.reduce_mean(tf.reduce_sum(tf.nn.log_softmax(pred) * x, -1))
            
            loss = ce_loss + kl_loss*kl_weight
            
        if target == 'encoder':
            grads = tape.gradient(loss, model.encoder.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.encoder.trainable_weights))
        else:
            grads = tape.gradient(loss, model.decoder.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.decoder.trainable_weights))
            
        total_loss += tf.reduce_sum(loss)
    return total_loss

In [9]:
epochs = 25

model = RecVAE(train.shape[1], 200)
enc_opt = optimizers.Adam()
dec_opt = optimizers.Adam()

for e in range(epochs):
    # alternating 
    ## train step
    tf_train(model, loader, enc_opt, 'encoder')
    model.update_prior()
    tf_train(model, loader, dec_opt, 'decoder')
    ## eval step

In [10]:
top_k = 10

scores = []
for idx, i in tqdm(enumerate(train.index)):
    item_to_pred = {item: pred.numpy() for item, pred in zip(train.columns, model.predict(train.values)[idx])}
    test_ = test[(test['userId']==i) & (test['rating']==1)]['movieId'].values
    items = list(np.random.choice(list(filter(lambda x: x not in np.argwhere(train.values[idx]).flatten(), item_to_pred.keys())), 100)) + list(test_)
    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)
    
    score = eval_NDCG(test_, top_k_items)
    scores.append(score)
#     break
np.mean(scores)

896it [05:59,  2.49it/s]


0.41734477604306824