In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
from tensorflow import keras
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply, Dot
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.utils import to_categorical

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
        print(e)
        
def mish(x):
    return x*tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor*x)

## Load
- make triplet dataset
    - [user_id, positive_item_id, negative_item_id]
    - randomly select just one pair
    

In [3]:
def load_data(filname):
    f = open(filname, 'r')
    fs = f.readlines()
    f.close()

    df = pd.DataFrame(list(map(lambda x: x.split('\t'), fs)), columns=['userId', 'movieId', 'rating', 'time'])
    df = df.drop('time', axis=1)
    df['userId'] = df['userId'].astype(int)
    df['movieId'] = df['movieId'].astype(int)
    df['rating'] = df['rating'].astype(float)
    
    df = df[['userId', 'movieId', 'rating']]
    df['rating'] = 1.
    m_codes = df['movieId'].astype('category').cat.codes
    u_codes = df['userId'].astype('category').cat.codes
    df['movieId'] = m_codes
    df['userId'] = u_codes
    
    return df

def add_negative(df, times=4):
    df_ = df.copy()
    user_id = df['userId'].unique()
    item_id = df['movieId'].unique()
    
    for i in tqdm(user_id):
        cnt = 0
        n = len(df_[df_['userId']==i])
        n_negative = min(n*times, len(item_id)-n)
        available_negative = list(set(uiid) - set(df[df['userId']==i]['movieId'].values))
        
        new = np.random.choice(available_negative, n_negative, replace=False)
        new = [[i, j, 0] for j in new]
        df_ = df_.append(pd.DataFrame(new, columns=df.columns), ignore_index=True)
    
    return df_

def extract_from_df(df, n_positive, n_negative):
    df_ = df.copy()
    rtd = []
    
    user_id = df['userId'].unique()
    
    for i in tqdm(user_id):
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==1]['movieId'].index, n_positive, replace=False))
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==0]['movieId'].index, n_negative, replace=False))
        
    return rtd

def make_triplet(df):
    df_ = df.copy()
    user_id = df['userId'].unique()
    item_id = df['movieId'].unique()
    
    negs = np.zeros(len(df), dtype=int)
    for u in tqdm(user_id):
        user_idx = list(df[df['userId']==u].index)
        n_choose = len(user_idx)
        available_negative = list(set(uiid) - set(df[df['userId']==u]['movieId'].values))
        new = np.random.choice(available_negative, n_choose, replace=True)
        
        negs[user_idx] = new
    df_['negative'] = negs
    
    return df_

In [4]:
df = load_data('../data/ml-100k/u.data')
uuid = df['userId'].unique()
uiid = df['movieId'].unique()


In [5]:
df = make_triplet(df)

100%|██████████| 943/943 [00:01<00:00, 741.99it/s]


In [6]:
rtd = extract_from_df(df, 1, 0)

100%|██████████| 943/943 [00:02<00:00, 410.93it/s]


In [7]:
train = df.drop(rtd)
test = df.loc[rtd]

In [8]:
tr_X = [
    train['userId'].values, 
    train['movieId'].values,
    train['negative'].values
]

## Model
- no additional layer after embedding

In [9]:
class BPR_Triplet(keras.Model):
    def __init__(self, u_dim, i_dim, latent_dim):
        super(BPR_Triplet, self).__init__()
        
        self.u_dim = u_dim
        self.i_dim = i_dim
        self.latent_dim = latent_dim
        
        self.model = self.build_model()

    def compile(self, optim):
        super(BPR_Triplet, self).compile()
        self.optim = optim
    
    def build_model(self):
        u_input = Input(shape=(1, ))
        i_input = Input(shape=(1, ))

        u_emb = Flatten()(Embedding(self.u_dim, self.latent_dim, input_length=u_input.shape[1])(u_input))
        i_emb = Flatten()(Embedding(self.i_dim, self.latent_dim, input_length=i_input.shape[1])(i_input))

        mul = Dot(1)([u_emb, i_emb])

#         out = Dense(1)(mul)
        
        return Model([u_input, i_input], mul)
    
    def train_step(self, data):
        user, pos, neg = data[0]

        with tf.GradientTape() as tape:
            pos_d = self.model([user, pos])
            neg_d = self.model([user, neg])
            
            loss = -tf.reduce_mean(tf.math.log(tf.sigmoid(pos_d - neg_d)))

        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optim.apply_gradients(zip(grads, self.model.trainable_weights))
        
        return {'loss': loss}
    
    def call(self, data):
        user, item = data
        return self.model([user, item])


In [10]:
bpr = BPR_Triplet(len(uuid), len(uiid), 32)
bpr.compile(optim=optimizers.Adam())
bpr.fit(tr_X,
         epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a2a96c43c8>

## Evaluate

In [11]:
import heapq    

def eval_hit(model, test, user_id, item_ids, top_k):
    # TODO(maybe): remove negative used in train
    items = list(set(uiid) - set(df[df['userId']==user_id][df['rating']==1]['movieId'].values) - set(df[df['userId']==user_id]['negative'].values))
    np.random.shuffle(items)
    items = items[:99]
    items.append(test[test['userId']==user_id]['movieId'].values[0])

    items = np.array(items).reshape(-1, 1)

    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = model.predict([user, items]).flatten()
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_k = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)
    
    if items[-1][0] in top_k:
            return 1
    return 0

def eval_hit_wrapper(model, test, item_ids, top_k):
    def f(user_id):
        return eval_hit(model, test, user_id, item_ids, top_k)
    return f

def eval_NDCG(model, test,user_id, item_ids, top_k):
    items = list(set(uiid) - set(df[df['userId']==user_id][df['rating']==1]['movieId'].values) - set(df[df['userId']==user_id]['negative'].values))
    np.random.shuffle(items)
    items = items[:99]
    items.append(test[test['userId']==user_id]['movieId'].values[0])

    items = np.array(items).reshape(-1, 1)

    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = model.predict([user, items]).flatten()
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_k = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)
    
    for i, item in enumerate(top_k, 1):
        if item == test[test['userId']==user_id]['movieId'].values:
            return np.log(i) / np.log(i+2)
    return 0

def eval_NDCG_wrapper(model, test, item_ids, top_k):
    def f(user_id):
        return eval_NDCG(model, test, user_id, item_ids, top_k)
    return f

In [12]:
hr10 = list(map(eval_hit_wrapper(bpr, test, uiid, 10), uuid))
sum(hr10)/len(hr10)

0.7391304347826086

In [13]:
ndcg10 = list(map(eval_NDCG_wrapper(bpr, test, uiid, 10), uuid))
sum(ndcg10)/len(ndcg10)

0.3800978232646535