In [1]:
# ref: https://github.com/MogicianXD/CML_torch/tree/997690984989d41cef21fde0731b8bf0f8d96064

In [None]:
# TODO
## multiple neg items ~ ranking loss

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random
import heapq

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

from load import *
def eval_NDCG(true, pred):
    top_k = pred

    for i, item in enumerate(top_k, 1):
        if item == true:
            return 1 / np.log2(i+1)
    return 0

import warnings
warnings.filterwarnings('ignore')

In [3]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses, models
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
        print(e)
        
def mish(x):
    return x*tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor*x)

In [4]:
df = load_data('../data/ml-100k/u.data', threshold=3)
df = df[df['rating']==1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)

cnt = tdf.sum(1)
df = df[df['userId'].isin(np.where(cnt >= 10)[0])].reset_index(drop=True)

In [5]:
test_idx = []
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))
    
train = df.iloc[list(set(df.index)-set(test_idx)),:]
test = df.iloc[test_idx, :]

In [6]:
def make_trpilet(df, uiid):
    uid_map = {}
    for user in df['userId'].unique():
        uid_map[user] = list(set(uiid) - set(df[df['userId']==user]['movieId'].unique()))

    negs = []
    for i in tqdm(range(len(df))):
        user = df.values[i][0]
        valid_negs = uid_map[user]
        negs.append(np.random.choice(list(valid_negs)))
        
    df['neg'] = negs
    return df

In [7]:
class CML(models.Model):
    def __init__(self, n_users, n_items, emb_dim, feature_shape=None):
        super().__init__()
        self.emb_dim = emb_dim
        self.feature_shape=feature_shape
        self.margin = 1.
#         self.use_rank_weight = True
        self.use_cov_loss = False
        
        # reg weights
        self.feature_l2_reg = 0.1
        self.feature_projection_scaling_factor = 0.5
        self.cov_loss_weight = 0.1
        
        self.clip_norm = 1.
        
        self.user_embedding = Embedding(n_users, emb_dim)
        self.item_embedding = Embedding(n_items, emb_dim)
        
        if self.feature_shape is not None:
            self.mlp = Sequential([
                Dense(self.feature_shape[0], activation='relu'),
                Dense(emb_dim)
            ])
            
    def call(self, inputs):
        user = inputs[:,0]
        item = inputs[:,1]
        
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        
        return -tf.reduce_sum(
            tf.square(user_emb-item_emb), 1
        )
        
        
    def train_step(self, inputs):
        with tf.GradientTape() as tape:
            loss = self.get_loss(inputs)
        
        grads = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {'loss': loss}
            
            
    def get_loss(self, inputs):
        X = inputs
        loss = self._embedding_loss(X)
        if self.use_cov_loss:
            loss += self._covariance_loss()
        return loss
    
    def _embedding_loss(self, inputs):
        X = inputs
        users = self.user_embedding(X[:, 0])

        pos_items = self.item_embedding(X[:, 1])
        neg_items = self.item_embedding(X[:, 2])
        
        pos_distances = tf.reduce_sum((users - pos_items) ** 2, 1)
        distance_to_neg_items = tf.reduce_sum((users - neg_items) ** 2, 1)

        # best negative item (among W negative samples) their distance to the user embedding (N)
        closest_negative_item_distances = tf.reduce_min(distance_to_neg_items) #distance_to_neg_items.min(1)[0]

        # compute hinge loss (N)
        distance = pos_distances - closest_negative_item_distances + self.margin
        loss_per_pair = tf.nn.relu(distance) #[]+

#         if self.use_rank_weight:
#             # indicator matrix for impostors (N x W)
#             impostors = (pos_distances - distance_to_neg_items + self.margin) > 0
#             # approximate the rank of positive item by (number of impostor / W per user-positive pair)
#             rank = impostors.float().mean(1) * self.n_items
#             # apply rank weight
#             loss_per_pair *= torch.log(rank + 1)

        # the embedding loss
        loss = tf.reduce_sum(loss_per_pair)

        return loss
    
    def _feature_projection(self):
        if self.features is not None:
            output = self.mlp(self.features) * self.feature_projection_scaling_factor
            # projection to the embedding
            return tf.clip_by_norm(output, self.clip_norm)

    def _feature_loss(self):
        loss = 0
        if feature_projection is not None:
            feature_projection = self._feature_projection()
            loss = tf.reduce_sum((self.item_embedding.weights - feature_projection) ** 2) * self.feature_l2_reg
        return loss

    def _covariance_loss(self):
        X = tf.concat([self.item_embedding.weights[0], self.user_embedding.weights[0]], 0)
        n_rows = X.shape[0]
        X -= tf.reduce_mean(X, 0)
        cov = tf.matmul(X, X, transpose_a=True) / n_rows
        loss = tf.reduce_sum(cov) - tf.linalg.trace(cov)
        return loss * self.cov_loss_weight

In [8]:
train = make_trpilet(train, df['movieId'].unique())
train.head()

100%|██████████████████████████████████| 53378/53378 [00:08<00:00, 6095.17it/s]


Unnamed: 0,userId,movieId,rating,neg
0,297,473,1,405
1,252,464,1,915
2,285,1013,1,1462
3,199,221,1,943
4,121,386,1,964


In [9]:
n_user = df['userId'].unique().max()+1
n_item = df['movieId'].unique().max()+1

model = CML(n_user, n_item, 16)

In [10]:
model.compile(loss='mse', optimizer='adam')
model.fit(train.values,
         epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x169de6ac288>

In [12]:
uiid = df['movieId'].unique()
top_k = 10

scores = []
for user in tqdm(df['userId'].unique()):
    user_in = np.full((len(uiid)), user)
    inputs = np.dstack([user_in, uiid])[0]
    preds = model.predict(inputs)
    
    item_to_pred = dict(zip(uiid, preds))
    test_ = test[(test['userId']==i) & (test['rating']==1)]['movieId'].values
    used = train[train['userId']==user]['movieId'].values
    items = list(np.random.choice(list(filter(lambda x: x not in used, item_to_pred.keys())), 100)) + list(test_)
    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)
    
    score = eval_NDCG(test_, top_k_items)
    scores.append(score)

np.mean(scores)

100%|████████████████████████████████████████| 896/896 [00:59<00:00, 15.09it/s]


0.031343146472542605