# Import necessary libraries

In [1]:
from ipywidgets import FloatProgress, IntProgress
from IPython.display import display
from tqdm import tqdm

# Load Python File

In [2]:
# %load ../src/layers.py
import tensorflow as tf
from abc import abstractmethod

# Per-prefix counters used to give unnamed layers unique names.
LAYER_IDS = {}


def get_layer_id(layer_name=''):
    """Return the next sequential id for `layer_name`, starting at 0.

    The counter is kept in the module-level `LAYER_IDS` dict, so ids are
    unique per name for the lifetime of the process.
    """
    next_id = LAYER_IDS[layer_name] + 1 if layer_name in LAYER_IDS else 0
    LAYER_IDS[layer_name] = next_id
    return next_id


class Layer(object):
    """Base class for layers: holds a name and a list of variables.

    Calling an instance forwards the inputs to `_call`, which subclasses
    are expected to override.
    """

    def __init__(self, name):
        # A falsy name is replaced by "<lowercased class name>_<running id>".
        if not name:
            prefix = type(self).__name__.lower()
            name = '{}_{}'.format(prefix, get_layer_id(prefix))
        self.name = name
        self.vars = []

    def __call__(self, inputs):
        return self._call(inputs)

    @abstractmethod
    def _call(self, inputs):
        """Compute the layer output; the base implementation returns None."""


class Dense(Layer):
    """Fully connected layer computing ``act(dropout(inputs) @ weight + bias)``.

    Args:
        input_dim: size of the last axis of the input.
        output_dim: size of the last axis of the output.
        dropout: fraction of input units to drop; 0.0 disables dropout.
        act: activation applied to the affine output.
        name: variable-scope name; auto-generated by `Layer` when falsy.
    """

    def __init__(self, input_dim, output_dim, dropout=0.0, act=tf.nn.relu, name=None):
        super(Dense, self).__init__(name)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.act = act
        with tf.variable_scope(self.name):
            self.weight = tf.get_variable(name='weight', shape=(input_dim, output_dim), dtype=tf.float32)
            self.bias = tf.get_variable(name='bias', shape=output_dim, initializer=tf.zeros_initializer())
        # Only the weight is exposed through `vars`; the bias is left out.
        self.vars = [self.weight]

    def _call(self, inputs):
        # `rate` is the fraction dropped, i.e. the old keep_prob was 1 - rate.
        # Passing keep_prob positionally is deprecated in TF 1.x.
        # NOTE: dropout is applied unconditionally; there is no training flag.
        x = tf.nn.dropout(inputs, rate=self.dropout)
        output = tf.matmul(x, self.weight) + self.bias
        return self.act(output)


class CrossCompressUnit(Layer):
    """Cross&compress unit mixing two `[batch_size, dim]` feature tensors.

    For each sample the outer product ``C = v e^T`` is formed, and both
    ``C`` and its transpose are projected back to `dim` features:

        v_out = C w_vv + C^T w_ev + b_v
        e_out = C w_ve + C^T w_ee + b_e

    Args:
        dim: feature dimension of both inputs and both outputs.
        name: variable-scope name; auto-generated by `Layer` when falsy.
    """

    def __init__(self, dim, name=None):
        super(CrossCompressUnit, self).__init__(name)
        self.dim = dim
        with tf.variable_scope(self.name):
            self.weight_vv = tf.get_variable(name='weight_vv', shape=(dim, 1), dtype=tf.float32)
            self.weight_ev = tf.get_variable(name='weight_ev', shape=(dim, 1), dtype=tf.float32)
            self.weight_ve = tf.get_variable(name='weight_ve', shape=(dim, 1), dtype=tf.float32)
            self.weight_ee = tf.get_variable(name='weight_ee', shape=(dim, 1), dtype=tf.float32)
            self.bias_v = tf.get_variable(name='bias_v', shape=dim, initializer=tf.zeros_initializer())
            self.bias_e = tf.get_variable(name='bias_e', shape=dim, initializer=tf.zeros_initializer())
        # Only the four projection weights are exposed through `vars`; biases are left out.
        self.vars = [self.weight_vv, self.weight_ev, self.weight_ve, self.weight_ee]

    def _call(self, inputs):
        # [batch_size, dim]
        v, e = inputs

        # [batch_size, dim, 1], [batch_size, 1, dim]
        # `axis` replaces the deprecated `dim` keyword of tf.expand_dims.
        v = tf.expand_dims(v, axis=2)
        e = tf.expand_dims(e, axis=1)

        # [batch_size, dim, dim]
        c_matrix = tf.matmul(v, e)
        c_matrix_transpose = tf.transpose(c_matrix, perm=[0, 2, 1])

        # [batch_size * dim, dim]
        c_matrix = tf.reshape(c_matrix, [-1, self.dim])
        c_matrix_transpose = tf.reshape(c_matrix_transpose, [-1, self.dim])

        # [batch_size, dim]
        v_output = tf.reshape(tf.matmul(c_matrix, self.weight_vv) + tf.matmul(c_matrix_transpose, self.weight_ev),
                              [-1, self.dim]) + self.bias_v
        e_output = tf.reshape(tf.matmul(c_matrix, self.weight_ve) + tf.matmul(c_matrix_transpose, self.weight_ee),
                              [-1, self.dim]) + self.bias_e

        return v_output, e_output


In [3]:
# %load ../src/model.py
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score


class MKR(object):
    """Two-branch TensorFlow graph: a recommendation (RS) branch and a
    knowledge-graph-embedding (KGE) branch.

    The low layers connect the item embeddings (RS) and the head-entity
    embeddings (KGE) through shared `CrossCompressUnit`s; the high layers
    produce a click score for RS and a predicted tail embedding for KGE.
    Each branch has its own loss and its own Adam optimizer.

    `args` must provide: dim, L, H, l2_weight, lr_rs, lr_kge.
    """

    def __init__(self, args, n_users, n_items, n_entities, n_relations):
        # Construction order matters: every step reads tensors created by the previous one.
        self._parse_args(n_users, n_items, n_entities, n_relations)
        self._build_inputs()
        self._build_model(args)
        self._build_loss(args)
        self._build_train(args)

    def _parse_args(self, n_users, n_items, n_entities, n_relations):
        """Store the vocabulary sizes and create the lists of L2-regularised variables."""
        self.n_user = n_users
        self.n_item = n_items
        self.n_entity = n_entities
        self.n_relation = n_relations

        # for computing l2 loss
        self.vars_rs = []
        self.vars_kge = []

    def _build_inputs(self):
        """Create the 1-D placeholders fed by the RS and KGE feed dicts."""
        self.user_indices = tf.placeholder(tf.int32, [None], 'user_indices')
        self.item_indices = tf.placeholder(tf.int32, [None], 'item_indices')
        self.labels = tf.placeholder(tf.float32, [None], 'labels')
        self.head_indices = tf.placeholder(tf.int32, [None], 'head_indices')
        self.tail_indices = tf.placeholder(tf.int32, [None], 'tail_indices')
        self.relation_indices = tf.placeholder(tf.int32, [None], 'relation_indices')

    def _build_model(self, args):
        """Build the low (embedding / cross&compress) and high (prediction) layers."""
        self._build_low_layers(args)
        self._build_high_layers(args)

    def _build_low_layers(self, args):
        """Create the embedding tables and stack `args.L` low-level layers."""
        self.user_emb_matrix = tf.get_variable('user_emb_matrix', [self.n_user, args.dim])
        self.item_emb_matrix = tf.get_variable('item_emb_matrix', [self.n_item, args.dim])
        self.entity_emb_matrix = tf.get_variable('entity_emb_matrix', [self.n_entity, args.dim])
        self.relation_emb_matrix = tf.get_variable('relation_emb_matrix', [self.n_relation, args.dim])

        # [batch_size, dim]
        self.user_embeddings = tf.nn.embedding_lookup(self.user_emb_matrix, self.user_indices)
        self.item_embeddings = tf.nn.embedding_lookup(self.item_emb_matrix, self.item_indices)
        self.head_embeddings = tf.nn.embedding_lookup(self.entity_emb_matrix, self.head_indices)
        self.relation_embeddings = tf.nn.embedding_lookup(self.relation_emb_matrix, self.relation_indices)
        self.tail_embeddings = tf.nn.embedding_lookup(self.entity_emb_matrix, self.tail_indices)

        # Each iteration: users and tails go through their own Dense layer,
        # items and heads are mixed by one shared cross&compress unit.
        for _ in range(args.L):
            user_mlp = Dense(input_dim=args.dim, output_dim=args.dim)
            tail_mlp = Dense(input_dim=args.dim, output_dim=args.dim)
            cc_unit = CrossCompressUnit(args.dim)
            self.user_embeddings = user_mlp(self.user_embeddings)
            self.item_embeddings, self.head_embeddings = cc_unit([self.item_embeddings, self.head_embeddings])
            self.tail_embeddings = tail_mlp(self.tail_embeddings)

            # The cross&compress weights are regularised by both branches.
            self.vars_rs.extend(user_mlp.vars)
            self.vars_rs.extend(cc_unit.vars)
            self.vars_kge.extend(tail_mlp.vars)
            self.vars_kge.extend(cc_unit.vars)

    def _build_high_layers(self, args):
        """Build the RS score and the KGE tail prediction, plus the `rmse` metric."""
        # RS
        # Hard-coded switch: with True, the MLP branch below is never built.
        use_inner_product = True
        if use_inner_product:
            # [batch_size]
            self.scores = tf.reduce_sum(self.user_embeddings * self.item_embeddings, axis=1)
        else:
            # [batch_size, dim * 2]
            self.user_item_concat = tf.concat([self.user_embeddings, self.item_embeddings], axis=1)
            for _ in range(args.H - 1):
                rs_mlp = Dense(input_dim=args.dim * 2, output_dim=args.dim * 2)
                # [batch_size, dim * 2]
                self.user_item_concat = rs_mlp(self.user_item_concat)
                self.vars_rs.extend(rs_mlp.vars)

            rs_pred_mlp = Dense(input_dim=args.dim * 2, output_dim=1)
            # [batch_size]
            self.scores = tf.squeeze(rs_pred_mlp(self.user_item_concat))
            self.vars_rs.extend(rs_pred_mlp.vars)
        self.scores_normalized = tf.nn.sigmoid(self.scores)

        # KGE
        # [batch_size, dim * 2]
        self.head_relation_concat = tf.concat([self.head_embeddings, self.relation_embeddings], axis=1)
        for _ in range(args.H - 1):
            kge_mlp = Dense(input_dim=args.dim * 2, output_dim=args.dim * 2)
            # [batch_size, dim * 2]
            self.head_relation_concat = kge_mlp(self.head_relation_concat)
            self.vars_kge.extend(kge_mlp.vars)

        kge_pred_mlp = Dense(input_dim=args.dim * 2, output_dim=args.dim)
        # [batch_size, dim]
        self.tail_pred = kge_pred_mlp(self.head_relation_concat)
        self.vars_kge.extend(kge_pred_mlp.vars)
        self.tail_pred = tf.nn.sigmoid(self.tail_pred)

        # [batch_size]: sigmoid of the inner product between true and predicted tail
        self.scores_kge = tf.nn.sigmoid(tf.reduce_sum(self.tail_embeddings * self.tail_pred, axis=1))
        # Scalar: batch mean of the per-sample root-mean-square error over the dim axis
        self.rmse = tf.reduce_mean(
            tf.sqrt(tf.reduce_sum(tf.square(self.tail_embeddings - self.tail_pred), axis=1) / args.dim))

    def _build_loss(self, args):
        """Build `loss_rs` and `loss_kge`, each a base loss plus a weighted L2 term."""
        # RS
        self.base_loss_rs = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.scores))
        self.l2_loss_rs = tf.nn.l2_loss(self.user_embeddings) + tf.nn.l2_loss(self.item_embeddings)
        for var in self.vars_rs:
            self.l2_loss_rs += tf.nn.l2_loss(var)
        self.loss_rs = self.base_loss_rs + self.l2_loss_rs * args.l2_weight

        # KGE
        # NOTE(review): `scores_kge` is [batch_size] and is not reduced here, so
        # `loss_kge` is a per-sample vector rather than a scalar - confirm this is intended.
        self.base_loss_kge = -self.scores_kge
        self.l2_loss_kge = tf.nn.l2_loss(self.head_embeddings) + tf.nn.l2_loss(self.tail_embeddings)
        for var in self.vars_kge:
            self.l2_loss_kge += tf.nn.l2_loss(var)
        self.loss_kge = self.base_loss_kge + self.l2_loss_kge * args.l2_weight

    def _build_train(self, args):
        """Create one Adam training op per branch, each with its own learning rate."""
        self.optimizer_rs = tf.train.AdamOptimizer(args.lr_rs).minimize(self.loss_rs)
        self.optimizer_kge = tf.train.AdamOptimizer(args.lr_kge).minimize(self.loss_kge)

    def train_rs(self, sess, feed_dict):
        """Run one RS optimisation step; returns `[train_op_result, loss_rs]`."""
        return sess.run([self.optimizer_rs, self.loss_rs], feed_dict)

    def train_kge(self, sess, feed_dict):
        """Run one KGE optimisation step; returns `[train_op_result, rmse]`."""
        return sess.run([self.optimizer_kge, self.rmse], feed_dict)

    def eval(self, sess, feed_dict):
        """Return `(auc, acc)` of the sigmoid scores against the fed labels.

        Accuracy thresholds the normalized score at 0.5.
        """
        labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict)
        auc = roc_auc_score(y_true=labels, y_score=scores)
        predictions = [1 if i >= 0.5 else 0 for i in scores]
        acc = np.mean(np.equal(predictions, labels))
        return auc, acc

    def get_scores(self, sess, feed_dict):
        """Return the fed item indices together with their sigmoid scores."""
        return sess.run([self.item_indices, self.scores_normalized], feed_dict)

In [4]:
# %load ../src/train.py
import pickle
import tensorflow as tf
import numpy as np
from datetime import datetime

# POSIX timestamp of "now", kept as a string.
timestamp = str(datetime.now().timestamp())

# logger = Logger()
# session_log_path = "../log/{}/".format(timestamp)
# logger.create_session_folder(session_log_path)
# logger.set_default_filename(session_log_path + "log.txt")


def train(args, data, show_loss, show_topk):
    """Train an MKR model, alternating RS and KGE updates, and evaluate every epoch.

    Args:
        args: hyper-parameter object; must provide the fields read by `MKR`
            plus `batch_size`, `kge_interval` and `n_epochs` (the spelling
            `n_epoch` is accepted as a fallback).
        data: sequence `(n_user, n_item, n_entity, n_relation, train_data,
            eval_data, test_data, kg)`.
        show_loss: print the RS loss / KGE rmse of every batch.
        show_topk: also run the top-K evaluation after every epoch.

    Messages are forwarded to a module-level `logger` and checkpoints are
    written under a module-level `session_log_path` when those names exist.
    Their setup is commented out in this file, so both are looked up lazily
    and skipped when missing instead of raising NameError.
    """
    log_sink = globals().get('logger')
    checkpoint_root = globals().get('session_log_path')

    def log(message):
        if log_sink is not None:
            log_sink.log(message)

    log(str(args))
    n_user, n_item, n_entity, n_relation = data[0], data[1], data[2], data[3]
    train_data, eval_data, test_data = data[4], data[5], data[6]
    kg = data[7]

    # Accept both attribute spellings for the epoch count.
    n_epochs = args.n_epochs if hasattr(args, 'n_epochs') else args.n_epoch

    model = MKR(args, n_user, n_item, n_entity, n_relation)

    print("n_user : ", n_user, "\n")
    print("n_item : ", n_item, "\n")

    # top-K evaluation settings
    user_num = 100
    k_list = [1, 2, 5, 10, 20, 50, 100]
    train_record = get_user_record(train_data, True)
    test_record = get_user_record(test_data, False)
    user_list = list(set(train_record.keys()) & set(test_record.keys()))
    if len(user_list) > user_num:
        user_list = np.random.choice(user_list, size=user_num, replace=False)

    # Candidate items are those that actually occur in the ratings,
    # not the full range(n_item).
    item_set = set()
    for split in (train_data, eval_data, test_data):
        for row in split:
            item_set.add(int(row[1]))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=None)

        for step in range(n_epochs):
            # RS training
            np.random.shuffle(train_data)
            start = 0
            while start < train_data.shape[0]:
                _, loss = model.train_rs(sess, get_feed_dict_for_rs(model, train_data, start, start + args.batch_size))
                start += args.batch_size
                if show_loss:
                    print(loss)

            # KGE training (only every `kge_interval` epochs)
            if step % args.kge_interval == 0:
                np.random.shuffle(kg)
                start = 0
                while start < kg.shape[0]:
                    _, rmse = model.train_kge(sess, get_feed_dict_for_kge(model, kg, start, start + args.batch_size))
                    start += args.batch_size
                    if show_loss:
                        print(rmse)

            # CTR evaluation
            train_auc, train_acc = model.eval(sess, get_feed_dict_for_rs(model, train_data, 0, train_data.shape[0]))
            eval_auc, eval_acc = model.eval(sess, get_feed_dict_for_rs(model, eval_data, 0, eval_data.shape[0]))
            test_auc, test_acc = model.eval(sess, get_feed_dict_for_rs(model, test_data, 0, test_data.shape[0]))

            summary = ('epoch %d    train auc: %.4f  acc: %.4f    eval auc: %.4f  acc: %.4f    test auc: %.4f  acc: %.4f'
                       % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc))
            print(summary)
            log(summary)

            # top-K evaluation
            if show_topk:
                precision, recall, f1 = topk_eval(
                    sess, model, user_list, train_record, test_record, item_set, k_list)
                # The last row is followed by an extra blank line, as before.
                for title, values, terminator in (('precision', precision, ''),
                                                  ('recall', recall, ''),
                                                  ('f1', f1, '\n')):
                    print(title + ': ', end='')
                    log(title + ': ')
                    for value in values:
                        print('%.4f\t' % value, end='')
                        log('%.4f\t' % value)
                    print(terminator)

            if checkpoint_root is not None:
                saver.save(sess, checkpoint_root + "models/epoch_{}".format(step))


def get_feed_dict_for_rs(model, data, start, end):
    """Build the RS feed dict from rows `start:end` of `data`.

    Column 0 feeds the user indices, column 1 feeds both the item and the
    head indices, and column 2 feeds the labels.
    """
    batch = data[start:end]
    users, items, labels = batch[:, 0], batch[:, 1], batch[:, 2]
    return {
        model.user_indices: users,
        model.item_indices: items,
        model.labels: labels,
        model.head_indices: items,
    }


def get_feed_dict_for_kge(model, kg, start, end):
    """Build the KGE feed dict from rows `start:end` of `kg`.

    Column 0 feeds both the item and the head indices, column 1 the
    relation indices and column 2 the tail indices.
    """
    triples = kg[start:end]
    heads, relations, tails = triples[:, 0], triples[:, 1], triples[:, 2]
    return {
        model.item_indices: heads,
        model.head_indices: heads,
        model.relation_indices: relations,
        model.tail_indices: tails,
    }


def topk_eval(sess, model, user_list, train_record, test_record, item_set, k_list):
    """Compute mean precision@k, recall@k and their F1 over `user_list`.

    For every user, all items of `item_set` not in the user's training
    record are scored with `model.get_scores` and ranked by descending score.

    Args:
        sess: session object handed through to `model.get_scores`.
        model: object exposing `user_indices`, `item_indices`,
            `head_indices` and `get_scores(sess, feed_dict)`.
        user_list: users to evaluate.
        train_record: dict user -> set of items to exclude from ranking.
        test_record: dict user -> non-empty set of relevant items.
        item_set: set of all candidate items.
        k_list: cut-offs to evaluate.

    Returns:
        Three lists `(precision, recall, f1)` aligned with `k_list`. F1 is
        0.0 whenever precision or recall is 0 (no division by zero), and all
        metrics are 0.0 when `user_list` is empty.
    """
    precision_list = {k: [] for k in k_list}
    recall_list = {k: [] for k in k_list}

    for user in user_list:
        test_item_list = list(item_set - train_record[user])
        items, scores = model.get_scores(sess, {model.user_indices: [user] * len(test_item_list),
                                                model.item_indices: test_item_list,
                                                model.head_indices: test_item_list})
        item_score_map = dict(zip(items, scores))
        item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x: x[1], reverse=True)
        item_sorted = [i[0] for i in item_score_pair_sorted]

        for k in k_list:
            hit_num = len(set(item_sorted[:k]) & test_record[user])
            precision_list[k].append(hit_num / k)
            recall_list[k].append(hit_num / len(test_record[user]))

    # np.mean([]) would warn and return NaN; report 0.0 for an empty user list.
    precision = [np.mean(precision_list[k]) if precision_list[k] else 0.0 for k in k_list]
    recall = [np.mean(recall_list[k]) if recall_list[k] else 0.0 for k in k_list]
    # Harmonic mean, guarded so a zero precision/recall yields 0.0 instead of dividing by zero.
    f1 = [2 / (1 / p + 1 / r) if p > 0 and r > 0 else 0.0
          for p, r in zip(precision, recall)]

    return precision, recall, f1


def get_user_record(data, is_train):
    """Group item ids by user.

    Each row of `data` is `(user, item, label, ...)`. With `is_train` true
    every row is recorded; otherwise only rows whose label equals 1 are.
    Users without any recorded row do not appear in the result.
    """
    history = {}
    for row in data:
        user, item, label = row[0], row[1], row[2]
        if not is_train and label != 1:
            continue
        history.setdefault(user, set()).add(item)
    return history


In [5]:
# %load ../src/data_loader.py
import numpy as np
import os


def load_data(args):
    """Load ratings and knowledge graph and return them as one flat 8-tuple.

    Returns:
        (n_user, n_item, n_entity, n_relation,
         train_data, eval_data, test_data, kg)
    """
    rating_parts = load_rating(args)
    kg_parts = load_kg(args)
    print('data loaded.')

    n_user, n_item, train_data, eval_data, test_data = rating_parts
    n_entity, n_relation, kg = kg_parts
    return n_user, n_item, n_entity, n_relation, train_data, eval_data, test_data, kg


def load_rating(args):
    """Load the rating matrix and split it into train / eval / test parts.

    The text file is parsed once and cached next to it as `.npy`; later
    calls load the cache.

    Args:
        args: currently unused. The dataset folder is pinned to
            'intersect-14m' and `args.dataset` is not consulted.

    Returns:
        (n_user, n_item, train_data, eval_data, test_data), where the counts
        are the largest index found in column 0 / column 1 plus one.
    """
    print('reading rating file ...')

    # NOTE: hard-coded dataset folder (was: '../data/' + args.dataset + '/ratings_final')
    rating_file = '../data/' + 'intersect-14m' + '/ratings_final'
    if os.path.exists(rating_file + '.npy'):
        rating_np = np.load(rating_file + '.npy')
    else:
        rating_np = np.loadtxt(rating_file + '.txt', dtype=np.int32)
        np.save(rating_file + '.npy', rating_np)

    # Vectorised max; avoids building a Python set over every rating row.
    n_user = rating_np[:, 0].max() + 1
    n_item = rating_np[:, 1].max() + 1
    train_data, eval_data, test_data = dataset_split(rating_np)

    return n_user, n_item, train_data, eval_data, test_data


def dataset_split(rating_np):
    """Randomly split the rows of `rating_np` into train / eval / test parts.

    Eval and test each receive `int(n_rows * 0.2)` rows drawn without
    replacement via `np.random.choice`; the remaining rows form the
    training part.
    """
    print('splitting dataset ...')

    # train:eval:test = 6:2:2
    eval_ratio, test_ratio = 0.2, 0.2
    n_ratings = rating_np.shape[0]
    all_indices = list(range(n_ratings))

    eval_indices = np.random.choice(all_indices, size=int(n_ratings * eval_ratio), replace=False)
    remaining = set(all_indices) - set(eval_indices)
    test_indices = np.random.choice(list(remaining), size=int(n_ratings * test_ratio), replace=False)
    train_indices = list(remaining - set(test_indices))

    return rating_np[train_indices], rating_np[eval_indices], rating_np[test_indices]


def load_kg(args):
    """Load the knowledge-graph triples (columns 0/1/2 are read as head/relation/tail).

    The text file is parsed once and cached next to it as `.npy`; later
    calls load the cache.

    Args:
        args: currently unused. The dataset folder is pinned to
            'intersect-14m' and `args.dataset` is not consulted.

    Returns:
        (n_entity, n_relation, kg), where `n_entity` is the largest index in
        columns 0 and 2 plus one and `n_relation` the largest index in
        column 1 plus one.
    """
    print('reading KG file ...')

    # NOTE: hard-coded dataset folder (was: '../data/' + args.dataset + '/kg_final')
    kg_file = '../data/' + 'intersect-14m' + '/kg_final'
    if os.path.exists(kg_file + '.npy'):
        kg = np.load(kg_file + '.npy')
    else:
        kg = np.loadtxt(kg_file + '.txt', dtype=np.int32)
        np.save(kg_file + '.npy', kg)

    # Vectorised max; avoids building Python sets over every triple.
    n_entity = max(kg[:, 0].max(), kg[:, 2].max()) + 1
    n_relation = kg[:, 1].max() + 1

    return n_entity, n_relation, kg


# Create Args Object to Run Python

In [6]:
class Args:
    """Hyper-parameters standing in for the command-line arguments of the scripts."""

    def __init__(self):
        self.dataset = 'movie'
        self.n_epoch = 20
        # `train` reads `args.n_epochs`; expose that spelling too and keep
        # `n_epoch` for existing users of this object.
        self.n_epochs = self.n_epoch
        self.dim = 8            # embedding dimension
        self.L = 1              # number of low (cross&compress) layers
        self.H = 1              # the high-layer loops run H - 1 times
        self.batch_size = 4095
        self.l2_weight = 1e-6
        self.lr_rs = 0.000125
        self.lr_kge = 0.000125
        self.kge_interval = 3   # KGE training runs when epoch % kge_interval == 0


args = Args()

# Import dataset

## 1. Load data

In [7]:
# Load ratings and KG triples as an 8-tuple:
# (n_user, n_item, n_entity, n_relation, train_data, eval_data, test_data, kg).
data_info = load_data(args)

reading rating file ...
splitting dataset ...
reading KG file ...
data loaded.


## 2. Separate the preprocessed data

In [8]:
# Unpack the tuple returned by load_data into individual names.
(n_user, n_item, n_entity, n_relation,
 train_data, eval_data, test_data, kg) = data_info[:8]

In [13]:
# Display the number of users (largest user index + 1).
n_user

138493

In [14]:
# Display the number of items (largest item index + 1).
n_item

15527

# Load the Model

## 1. Define path to choose which model and epoch

In [9]:
# Name of the run folder under ../log (presumably the training-run timestamp - confirm).
TEST_CODE = "1563359800.887134"
# Epoch whose checkpoint is restored.
CHOSEN_EPOCH = 19

# Checkpoint prefix and log file of the chosen run.
MODEL_PATH = "../log/{}/models/epoch_{}".format(TEST_CODE, CHOSEN_EPOCH)
LOG_PATH = "../log/{}/log.txt".format(TEST_CODE)

## 2. Initiate Model

In [10]:
# Build the MKR graph with sizes taken from the loaded data; the embedding
# tables are shaped [n_user|n_item|n_entity|n_relation, args.dim].
model = MKR(args, n_user, n_item, n_entity, n_relation)

W0729 08:15:16.480361 140126775260992 deprecation.py:506] From /home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0729 08:15:16.596993 140126775260992 deprecation.py:506] From <ipython-input-2-a1149e15351b>:47: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0729 08:15:16.601031 140126775260992 deprecation.py:506] From /home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:180: calling expand_dims (from tensorflow.python.ops.array_ops) with dim is deprecated and will be removed 

## 3. Import Model

In [11]:
# Limit GPU usage
# Session config: let TensorFlow grow its GPU memory allocation on demand
# instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True

In [12]:
# Add ops to save and restore all the variables.
# The graph has already been built in Python by `MKR(...)`, so a plain Saver
# over the current graph is all that is needed. Importing the .meta file as
# well would load a second copy of the graph and replace this Saver.
saver = tf.train.Saver()

sess = tf.Session(config=config)
# NOTE: restoring requires every variable in the checkpoint to have the same
# shape as in the current graph, i.e. the checkpoint must come from a run on
# the same dataset (same n_user / n_item / n_entity / n_relation) and args.dim.
saver.restore(sess, MODEL_PATH)

W0729 08:15:18.116761 140126775260992 deprecation.py:323] From /home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


InvalidArgumentError: Restoring from checkpoint failed. This is most likely due to a mismatch between the current graph and the graph from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

2 root error(s) found.
  (0) Invalid argument: Assign requires shapes of both tensors to match. lhs shape= [138493,8] rhs shape= [6036,8]
	 [[node save/Assign_63 (defined at <ipython-input-12-91c373de4425>:2) ]]
	 [[save/RestoreV2/_36]]
  (1) Invalid argument: Assign requires shapes of both tensors to match. lhs shape= [138493,8] rhs shape= [6036,8]
	 [[node save/Assign_63 (defined at <ipython-input-12-91c373de4425>:2) ]]
0 successful operations.
0 derived errors ignored.

Errors may have originated from an input operation.
Input Source operations connected to node save/Assign_63:
 user_emb_matrix/Adam_1 (defined at <ipython-input-3-b42cb5c6bbd5>:120)

Input Source operations connected to node save/Assign_63:
 user_emb_matrix/Adam_1 (defined at <ipython-input-3-b42cb5c6bbd5>:120)

Original stack trace for 'save/Assign_63':
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 378, in dispatch_queue
    yield self.process_one()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 225, in wrapper
    runner = Runner(result, future, yielded)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 714, in __init__
    self.run()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 272, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 542, in execute_request
    user_expressions, allow_stdin,
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3049, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3214, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-91c373de4425>", line 2, in <module>
    saver = tf.train.Saver()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 825, in __init__
    self.build()
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 837, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 875, in _build
    build_restore=build_restore)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 508, in _build_internal
    restore_sequentially, reshape)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 350, in _AddRestoreOps
    assign_ops.append(saveable.restore(saveable_tensors, shapes))
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saving/saveable_object_util.py", line 72, in restore
    self.op.get_shape().is_fully_defined())
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 227, in assign
    validate_shape=validate_shape)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 66, in assign
    use_locking=use_locking, name=name)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3616, in create_op
    op_def=op_def)
  File "/home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
    self._traceback = tf_stack.extract_stack()


# Prepare Data for Evaluation

## 1. Create positive data dictionary for each user

In [None]:
# Map every user seen in any split to the list of movies rated with score 1.
# Users with no positive rating still get an (empty) entry.
truth_dict = {}
for dataset in (train_data, test_data, eval_data):
    for rating in tqdm(dataset):
        user_id, movie_id, score = rating
        liked = truth_dict.setdefault(user_id, [])
        if score == 1:
            liked.append(movie_id)

## 2. Check how many users liked n movies

In [None]:
from collections import Counter

# Histogram: number of liked movies -> number of users with that many likes.
ns = Counter(len(liked_movies) for liked_movies in truth_dict.values())

In [None]:
# Display the histogram {number of liked movies: number of users}.
ns

In [None]:
# Cumulative user counts, keyed by number of liked movies in ascending order.
nscum = {}
last = 0
for k in sorted(ns):
    last += ns[k]
    nscum[k] = last

In [None]:
import matplotlib.pyplot as plt

# Cumulative plot restricted to the first 50 entries of `nscum`.
plt.figure(figsize=(20, 14))
first_keys = list(nscum.keys())[:50]
first_totals = list(nscum.values())[:50]
plt.plot(first_keys, first_totals)

## 3. Create set of user and movies

In [None]:
# All users that appear in truth_dict.
user_set = set(truth_dict)

In [None]:
# All movies liked by at least one user.
movie_set = set()
for user in tqdm(user_set):
    movie_set.update(truth_dict[user])

## 4. Create samples of n users to be tested

In [None]:
import random

random.seed(1234)

# random.sample() on a set is deprecated since Python 3.9 and raises
# TypeError from 3.11. Sampling from a sorted list also makes the draw
# independent of set iteration order, so the seed is reproducible.
user_pool = sorted(user_set)

user_sample_500 = random.sample(user_pool, 500)
user_sample_1000 = random.sample(user_pool, 1000)
user_sample_3000 = random.sample(user_pool, 3000)
user_sample_5000 = random.sample(user_pool, 5000)


# Evaluation

## 1. Define function to predict k recommendation

In [None]:
def get_top_suggestion(user, k):
    """Return the `k` best-scored movies for `user` as `(item, score)` pairs.

    Every movie in the module-level `movie_set` is scored with the
    module-level `model` and `sess`; pairs are sorted by descending score.
    """
    # Materialise the candidates once so both placeholders get the same order.
    candidates = list(movie_set)
    items, scores = model.get_scores(sess, {model.user_indices: [user] * len(candidates),
                                            model.item_indices: candidates,
                                            model.head_indices: candidates})
    item_score_map = dict(zip(items, scores))
    item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x: x[1], reverse=True)

    return item_score_pair_sorted[:k]

In [None]:
def get_top_truth(user, k):
    """Return all movies liked by `user` (empty list for unknown users).

    `k` is accepted for symmetry with `get_top_suggestion` but the list is
    not truncated.
    """
    return truth_dict[user] if user in truth_dict else []

## 2. Define function to evaluate  prec@k

In [None]:
def get_intersect_pred_truth(pred, truth, k):
    """Return the set of items present in both `pred` and `truth`.

    `pred` is an iterable of `(item, score)` pairs, `truth` an iterable of
    items. `k` is not used.
    """
    predicted_items = set()
    for pair in pred:
        predicted_items.add(pair[0])
    return predicted_items & set(truth)

def check_precision_at_k(sample_user, k):
    """Return `(hits, precision@k)` for one user.

    `hits` is the set of items that are both among the top-`k` suggestions
    and in the user's liked movies; precision@k is `len(hits) / k`.
    Dividing by the number of liked movies instead would measure recall.
    Users without any liked movie, or a non-positive `k`, yield `(set(), 0)`.
    """
    pred = get_top_suggestion(sample_user, k)
    truth = get_top_truth(sample_user, k)

    if len(truth) == 0 or k <= 0:
        return set(), 0

    intersect = get_intersect_pred_truth(pred, truth, k)
    return intersect, len(intersect) / k

## 3. Calculate the prec@k for every n user sample

### 500 user

In [None]:
# Per-user score and hit set at k=10 for the 500-user sample.
prec = []
intersect = []

for i in tqdm(user_sample_500):

    try:
        isec, p = check_precision_at_k(i, 10)
    except Exception as exc:
        # Not a bare `except`: a kernel interrupt must still stop the loop.
        p = 0
        isec = {}
        print("error occur for {}".format(i), repr(exc))

    prec.append(p)
    intersect.append(isec)

In [None]:
import numpy as np

# Unweighted mean of the per-user values collected in `prec`.
np.mean(prec)

### 1000 users

In [None]:
# Per-user precision and matched items for the 1000-user sample.
prec = []
intersect = []

for i in tqdm(user_sample_1000):
    try:
        isec, p = check_precision_at_k(i, 10)
    except Exception:
        # Best effort: a user that cannot be evaluated counts as a miss.
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt still stop this long-running loop.
        isec, p = set(), 0
        print("error occur for {}".format(i))

    prec.append(p)
    intersect.append(isec)

In [None]:
import numpy as np

# Unweighted mean of the per-user values collected in `prec`.
np.mean(prec)

### 3000 users

In [None]:
# Per-user precision and matched items for the 3000-user sample.
prec = []
intersect = []

for i in tqdm(user_sample_3000):
    try:
        isec, p = check_precision_at_k(i, 10)
    except Exception:
        # Best effort: a user that cannot be evaluated counts as a miss.
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt still stop this long-running loop.
        isec, p = set(), 0
        print("error occur for {}".format(i))

    prec.append(p)
    intersect.append(isec)

In [None]:
import numpy as np

# Unweighted mean of the per-user values collected in `prec`.
np.mean(prec)

### 5000 users

In [None]:
# Per-user precision and matched items for the 5000-user sample.
prec = []
intersect = []

for i in tqdm(user_sample_5000):
    try:
        isec, p = check_precision_at_k(i, 10)
    except Exception:
        # Best effort: a user that cannot be evaluated counts as a miss.
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt still stop this long-running loop.
        isec, p = set(), 0
        print("error occur for {}".format(i))

    prec.append(p)
    intersect.append(isec)

In [None]:
import numpy as np

# Unweighted mean of the per-user values collected in `prec`.
np.mean(prec)

## 4. Calculate the distinct rate for each user sample

### 500 sampled users

In [None]:
offset = 190  # number of top-ranked suggestions discarded for every user
k = 10  # suggestions kept per user after discarding `offset`
n_users = 10  # only the first n_users of the sample are compared

sample_user = user_sample_500

# get_top_suggestion returns (item, score) pairs.  Users must be compared on
# the item (x[0]); comparing on the score (x[1]) would measure how many
# distinct score values occur, not how many distinct items are recommended.
intersect = {x[0] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = set(intersect)
for i in range(1, n_users):
    s = {x[0] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    intersect = intersect.intersection(s)  # items suggested to every user so far
    uni = uni.union(s)  # items suggested to at least one user so far

# Distinct rate: distinct items over the total number of suggestion slots.
print("\ndistinct rate")
print(len(uni) / (n_users * k))

### 1000 sampled users

In [None]:
offset = 190  # number of top-ranked suggestions discarded for every user
k = 10  # suggestions kept per user after discarding `offset`
n_users = 10  # only the first n_users of the sample are compared

sample_user = user_sample_1000

# get_top_suggestion returns (item, score) pairs.  Users must be compared on
# the item (x[0]); comparing on the score (x[1]) would measure how many
# distinct score values occur, not how many distinct items are recommended.
intersect = {x[0] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = set(intersect)
for i in tqdm(range(1, n_users)):
    s = {x[0] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    intersect = intersect.intersection(s)  # items suggested to every user so far
    uni = uni.union(s)  # items suggested to at least one user so far

# Distinct rate: distinct items over the total number of suggestion slots.
print("\ndistinct rate")
print(len(uni) / (n_users * k))

### 3000 sampled users

In [None]:
offset = 190  # number of top-ranked suggestions discarded for every user
k = 10  # suggestions kept per user after discarding `offset`
n_users = 10  # only the first n_users of the sample are compared

sample_user = user_sample_3000

# get_top_suggestion returns (item, score) pairs.  Users must be compared on
# the item (x[0]); comparing on the score (x[1]) would measure how many
# distinct score values occur, not how many distinct items are recommended.
intersect = {x[0] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = set(intersect)
for i in tqdm(range(1, n_users)):
    s = {x[0] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    intersect = intersect.intersection(s)  # items suggested to every user so far
    uni = uni.union(s)  # items suggested to at least one user so far

# Distinct rate: distinct items over the total number of suggestion slots.
print("\ndistinct rate")
print(len(uni) / (n_users * k))

### 5000 sampled users

In [None]:
offset = 190  # number of top-ranked suggestions discarded for every user
k = 10  # suggestions kept per user after discarding `offset`
n_users = 10  # only the first n_users of the sample are compared

sample_user = user_sample_5000

# get_top_suggestion returns (item, score) pairs.  Users must be compared on
# the item (x[0]); comparing on the score (x[1]) would measure how many
# distinct score values occur, not how many distinct items are recommended.
intersect = {x[0] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = set(intersect)
for i in tqdm(range(1, n_users)):
    s = {x[0] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    intersect = intersect.intersection(s)  # items suggested to every user so far
    uni = uni.union(s)  # items suggested to at least one user so far

# Distinct rate: distinct items over the total number of suggestion slots.
print("\ndistinct rate")
print(len(uni) / (n_users * k))

## 5. Count the unique movies recommended to each user sample

### 500 sampled users

In [None]:
# Gather the items appearing in any user's 10 best suggestions; the set keeps
# each item once, so its size is the number of unique recommended items.
unique_movies = set()

for user in tqdm(user_sample_500):
    pred = get_top_suggestion(user, 10)
    unique_movies.update(pair[0] for pair in pred)

print(len(unique_movies))

### 1000 sampled users

In [None]:
# Gather the items appearing in any user's 10 best suggestions; the set keeps
# each item once, so its size is the number of unique recommended items.
unique_movies = set()

for user in tqdm(user_sample_1000):
    pred = get_top_suggestion(user, 10)
    unique_movies.update(pair[0] for pair in pred)

print(len(unique_movies))

### 3000 sampled users

In [None]:
# Gather the items appearing in any user's 10 best suggestions; the set keeps
# each item once, so its size is the number of unique recommended items.
unique_movies = set()

for user in tqdm(user_sample_3000):
    pred = get_top_suggestion(user, 10)
    unique_movies.update(pair[0] for pair in pred)

print(len(unique_movies))

### 5000 sampled users

In [None]:
# Gather the items appearing in any user's 10 best suggestions; the set keeps
# each item once, so its size is the number of unique recommended items.
unique_movies = set()

for user in tqdm(user_sample_5000):
    pred = get_top_suggestion(user, 10)
    unique_movies.update(pair[0] for pair in pred)

print(len(unique_movies))

# Generate top-k recommendations for 10 selected users

## 1. Define the 10 selected users

In [None]:
# Hand-picked user ids whose recommendations are exported below.
user_test_list = [
    20169,
    66966,
    82374,
    4296,
    10204,
    123623,
    115870,
    128970,
    83750,
    97239,
]

## 2. Predict 10 recommendations for each user

In [None]:
# Map each listed user id to the item ids of its 10 best suggestions.
# The model is queried with `user - 1`; presumably the listed ids are 1-based
# while the model indices are 0-based -- TODO confirm.
topk_pred_dict = {
    user: [pair[0] for pair in get_top_suggestion(user - 1, 10)]
    for user in user_test_list
}

In [None]:
# Show the user -> recommended item ids mapping as the cell output.
topk_pred_dict

## 3. Convert Movie ID to Title

In [None]:
# Read every line of the movie index file; the context manager closes the
# file handle as soon as the lines are loaded instead of leaking it.
with open("../data/intersect-14m/moviesIdx2.txt") as movies_file:
    movies = movies_file.readlines()

In [None]:
# Build the movie id -> name lookup from the lines loaded into `movies`.
# Each line is split on whitespace: field 0 is the integer id, field 1 the
# name.  Any further fields are ignored (assumes names contain no spaces --
# TODO confirm against the file format).
dict_entities = {}
for line in movies:
    fields = line.split()
    dict_entities[int(fields[0])] = fields[1]

In [None]:
def get_list_movie(id_movie_list, truth_list) :
    """Turn movie ids into ``"<name> > watched"`` / ``"<name> > nope"`` labels.

    Args:
        id_movie_list: iterable of movie ids; each one is converted with
            ``int()`` and looked up in the global ``dict_entities``.
        truth_list: container tested with ``in`` to decide whether an id
            counts as watched.

    Returns:
        list of label strings, in the order of ``id_movie_list``.
    """
    return [
        dict_entities[int(movie_id)] + " > " + ("watched" if movie_id in truth_list else "nope")
        for movie_id in id_movie_list
    ]

In [None]:
# Replace every recommended movie id by its labelled title, per user.
# NOTE(review): the predictions were produced for index `user - 1`, while the
# ground truth is looked up under `user` -- confirm both refer to the same user.
topk_pred_dict_title = {}

for user, recommended_ids in topk_pred_dict.items():
    topk_pred_dict_title[user] = get_list_movie(recommended_ids, truth_dict[user])

In [None]:
import pandas as pd

# One column per user, one row per recommendation rank.
topk_pred_df = pd.DataFrame(data=topk_pred_dict_title)

In [None]:
# Output path ../log/<TEST_CODE>/topk_pred.csv; TEST_CODE must already be defined.
topk_pred_path =  "../log/{}/topk_pred.csv".format(TEST_CODE)

In [None]:
# Write the labelled recommendation table to topk_pred_path as CSV.
topk_pred_df.to_csv(topk_pred_path)