# Import necessary lib

In [1]:
from ipywidgets import FloatProgress, IntProgress
from IPython.display import display
from tqdm import tqdm

# Load python file

In [2]:
# %load ../src/logger.py
import pickle
import os

class Logger:

    def set_default_filename(self, filename):
        self.default_filename = filename

    def create_session_folder(self, path):
        try:  
            os.makedirs(path)
        except OSError:  
            print ("Creation of the directory %s failed" % path)
        else:  
            print ("\n ===> Successfully created the directory %s \n" % path)

    def log(self, text):
        with open(self.default_filename, 'a') as f:
            f.writelines(text)
            f.write("\n")

    def save_model(self, model, filename):
        pickle.dump(model, open(filename, 'wb'))

In [3]:
# %load ../src/layers.py
import tensorflow as tf
from abc import abstractmethod

LAYER_IDS = {}


def get_layer_id(layer_name=''):
    if layer_name not in LAYER_IDS:
        LAYER_IDS[layer_name] = 0
        return 0
    else:
        LAYER_IDS[layer_name] += 1
        return LAYER_IDS[layer_name]


class Layer(object):
    def __init__(self, name):
        if not name:
            layer = self.__class__.__name__.lower()
            name = layer + '_' + str(get_layer_id(layer))
        self.name = name
        self.vars = []

    def __call__(self, inputs):
        outputs = self._call(inputs)
        return outputs

    @abstractmethod
    def _call(self, inputs):
        pass


class Dense(Layer):
    def __init__(self, input_dim, output_dim, dropout=0.0, act=tf.nn.relu, name=None):
        super(Dense, self).__init__(name)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.act = act
        with tf.variable_scope(self.name):
            self.weight = tf.get_variable(name='weight', shape=(input_dim, output_dim), dtype=tf.float32)
            self.bias = tf.get_variable(name='bias', shape=output_dim, initializer=tf.zeros_initializer())
        self.vars = [self.weight]

    def _call(self, inputs):
        x = tf.nn.dropout(inputs, 1-self.dropout)
        output = tf.matmul(x, self.weight) + self.bias
        return self.act(output)


class CrossCompressUnit(Layer):
    def __init__(self, dim, name=None):
        super(CrossCompressUnit, self).__init__(name)
        self.dim = dim
        with tf.variable_scope(self.name):
            self.weight_vv = tf.get_variable(name='weight_vv', shape=(dim, 1), dtype=tf.float32)
            self.weight_ev = tf.get_variable(name='weight_ev', shape=(dim, 1), dtype=tf.float32)
            self.weight_ve = tf.get_variable(name='weight_ve', shape=(dim, 1), dtype=tf.float32)
            self.weight_ee = tf.get_variable(name='weight_ee', shape=(dim, 1), dtype=tf.float32)
            self.bias_v = tf.get_variable(name='bias_v', shape=dim, initializer=tf.zeros_initializer())
            self.bias_e = tf.get_variable(name='bias_e', shape=dim, initializer=tf.zeros_initializer())
        self.vars = [self.weight_vv, self.weight_ev, self.weight_ve, self.weight_ee]

    def _call(self, inputs):
        # [batch_size, dim]
        v, e = inputs

        # [batch_size, dim, 1], [batch_size, 1, dim]
        v = tf.expand_dims(v, dim=2)
        e = tf.expand_dims(e, dim=1)

        # [batch_size, dim, dim]
        c_matrix = tf.matmul(v, e)
        c_matrix_transpose = tf.transpose(c_matrix, perm=[0, 2, 1])

        # [batch_size * dim, dim]
        c_matrix = tf.reshape(c_matrix, [-1, self.dim])
        c_matrix_transpose = tf.reshape(c_matrix_transpose, [-1, self.dim])

        # [batch_size, dim]
        v_output = tf.reshape(tf.matmul(c_matrix, self.weight_vv) + tf.matmul(c_matrix_transpose, self.weight_ev),
                              [-1, self.dim]) + self.bias_v
        e_output = tf.reshape(tf.matmul(c_matrix, self.weight_ve) + tf.matmul(c_matrix_transpose, self.weight_ee),
                              [-1, self.dim]) + self.bias_e

        return v_output, e_output


In [4]:
# %load ../src/model.py
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score


class MKR(object):
    def __init__(self, args, n_users, n_items, n_entities, n_relations):
        self._parse_args(n_users, n_items, n_entities, n_relations)
        self._build_inputs()
        self._build_model(args)
        self._build_loss(args)
        self._build_train(args)

    def _parse_args(self, n_users, n_items, n_entities, n_relations):
        self.n_user = n_users
        self.n_item = n_items
        self.n_entity = n_entities
        self.n_relation = n_relations

        # for computing l2 loss
        self.vars_rs = []
        self.vars_kge = []

    def _build_inputs(self):
        self.user_indices = tf.placeholder(tf.int32, [None], 'user_indices')
        self.item_indices = tf.placeholder(tf.int32, [None], 'item_indices')
        self.labels = tf.placeholder(tf.float32, [None], 'labels')
        self.head_indices = tf.placeholder(tf.int32, [None], 'head_indices')
        self.tail_indices = tf.placeholder(tf.int32, [None], 'tail_indices')
        self.relation_indices = tf.placeholder(tf.int32, [None], 'relation_indices')

    def _build_model(self, args):
        self._build_low_layers(args)
        self._build_high_layers(args)

    def _build_low_layers(self, args):
        self.user_emb_matrix = tf.get_variable('user_emb_matrix', [self.n_user, args.dim])
        self.item_emb_matrix = tf.get_variable('item_emb_matrix', [self.n_item, args.dim])
        self.entity_emb_matrix = tf.get_variable('entity_emb_matrix', [self.n_entity, args.dim])
        self.relation_emb_matrix = tf.get_variable('relation_emb_matrix', [self.n_relation, args.dim])

        # [batch_size, dim]
        self.user_embeddings = tf.nn.embedding_lookup(self.user_emb_matrix, self.user_indices)
        self.item_embeddings = tf.nn.embedding_lookup(self.item_emb_matrix, self.item_indices)
        self.head_embeddings = tf.nn.embedding_lookup(self.entity_emb_matrix, self.head_indices)
        self.relation_embeddings = tf.nn.embedding_lookup(self.relation_emb_matrix, self.relation_indices)
        self.tail_embeddings = tf.nn.embedding_lookup(self.entity_emb_matrix, self.tail_indices)

        for _ in range(args.L):
            user_mlp = Dense(input_dim=args.dim, output_dim=args.dim)
            tail_mlp = Dense(input_dim=args.dim, output_dim=args.dim)
            cc_unit = CrossCompressUnit(args.dim)
            self.user_embeddings = user_mlp(self.user_embeddings)
            self.item_embeddings, self.head_embeddings = cc_unit([self.item_embeddings, self.head_embeddings])
            self.tail_embeddings = tail_mlp(self.tail_embeddings)

            self.vars_rs.extend(user_mlp.vars)
            self.vars_rs.extend(cc_unit.vars)
            self.vars_kge.extend(tail_mlp.vars)
            self.vars_kge.extend(cc_unit.vars)

    def _build_high_layers(self, args):
        # RS
        use_inner_product = True
        if use_inner_product:
            # [batch_size]
            self.scores = tf.reduce_sum(self.user_embeddings * self.item_embeddings, axis=1)
        else:
            # [batch_size, dim * 2]
            self.user_item_concat = tf.concat([self.user_embeddings, self.item_embeddings], axis=1)
            for _ in range(args.H - 1):
                rs_mlp = Dense(input_dim=args.dim * 2, output_dim=args.dim * 2)
                # [batch_size, dim * 2]
                self.user_item_concat = rs_mlp(self.user_item_concat)
                self.vars_rs.extend(rs_mlp.vars)

            rs_pred_mlp = Dense(input_dim=args.dim * 2, output_dim=1)
            # [batch_size]
            self.scores = tf.squeeze(rs_pred_mlp(self.user_item_concat))
            self.vars_rs.extend(rs_pred_mlp.vars)
        self.scores_normalized = tf.nn.sigmoid(self.scores)

        # KGE
        # [batch_size, dim * 2]
        self.head_relation_concat = tf.concat([self.head_embeddings, self.relation_embeddings], axis=1)
        for _ in range(args.H - 1):
            kge_mlp = Dense(input_dim=args.dim * 2, output_dim=args.dim * 2)
            # [batch_size, dim]
            self.head_relation_concat = kge_mlp(self.head_relation_concat)
            self.vars_kge.extend(kge_mlp.vars)

        kge_pred_mlp = Dense(input_dim=args.dim * 2, output_dim=args.dim)
        # [batch_size, 1]
        self.tail_pred = kge_pred_mlp(self.head_relation_concat)
        self.vars_kge.extend(kge_pred_mlp.vars)
        self.tail_pred = tf.nn.sigmoid(self.tail_pred)

        self.scores_kge = tf.nn.sigmoid(tf.reduce_sum(self.tail_embeddings * self.tail_pred, axis=1))
        self.rmse = tf.reduce_mean(
            tf.sqrt(tf.reduce_sum(tf.square(self.tail_embeddings - self.tail_pred), axis=1) / args.dim))

    def _build_loss(self, args):
        # RS
        self.base_loss_rs = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.scores))
        self.l2_loss_rs = tf.nn.l2_loss(self.user_embeddings) + tf.nn.l2_loss(self.item_embeddings)
        for var in self.vars_rs:
            self.l2_loss_rs += tf.nn.l2_loss(var)
        self.loss_rs = self.base_loss_rs + self.l2_loss_rs * args.l2_weight

        # KGE
        self.base_loss_kge = -self.scores_kge
        self.l2_loss_kge = tf.nn.l2_loss(self.head_embeddings) + tf.nn.l2_loss(self.tail_embeddings)
        for var in self.vars_kge:
            self.l2_loss_kge += tf.nn.l2_loss(var)
        self.loss_kge = self.base_loss_kge + self.l2_loss_kge * args.l2_weight

    def _build_train(self, args):
        self.optimizer_rs = tf.train.AdamOptimizer(args.lr_rs).minimize(self.loss_rs)
        self.optimizer_kge = tf.train.AdamOptimizer(args.lr_kge).minimize(self.loss_kge)

    def train_rs(self, sess, feed_dict):
        return sess.run([self.optimizer_rs, self.loss_rs], feed_dict)

    def train_kge(self, sess, feed_dict):
        return sess.run([self.optimizer_kge, self.rmse], feed_dict)

    def eval(self, sess, feed_dict):
        labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict)
        auc = roc_auc_score(y_true=labels, y_score=scores)
        predictions = [1 if i >= 0.5 else 0 for i in scores]
        acc = np.mean(np.equal(predictions, labels))
        return auc, acc

    def get_scores(self, sess, feed_dict):
        return sess.run([self.item_indices, self.scores_normalized], feed_dict)

In [5]:
# %load ../src/train.py
import pickle
import tensorflow as tf
import numpy as np
from datetime import datetime

timestamp = str(datetime.timestamp(datetime.now()))

# logger = Logger()
# session_log_path = "../log/{}/".format(timestamp)
# logger.create_session_folder(session_log_path)
# logger.set_default_filename(session_log_path + "log.txt")


def train(args, data, show_loss, show_topk):
    logger.log(str(args))
    n_user, n_item, n_entity, n_relation = data[0], data[1], data[2], data[3]
    train_data, eval_data, test_data = data[4], data[5], data[6]
    kg = data[7]

    n_item = n_item

    model = MKR(args, n_user, n_item, n_entity, n_relation)

    print("n_user : " , n_user, "\n")
    print("n_item : " , n_item, "\n")

    # top-K evaluation settings
    user_num = 100
    k_list = [1, 2, 5, 10, 20, 50, 100]
    train_record = get_user_record(train_data, True)
    test_record = get_user_record(test_data, False)
    user_list = list(set(train_record.keys()) & set(test_record.keys()))
    if len(user_list) > user_num:
        user_list = np.random.choice(user_list, size=user_num, replace=False)
    
    # item_set = set(list(range(n_item)))
    item_set = set()

    for data in train_data :
        item_set.add(int(data[1]))

    for data in eval_data :
        item_set.add(int(data[1]))

    for data in test_data :
        item_set.add(int(data[1]))


    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=None)

        for step in range(args.n_epochs):
            # RS training
            np.random.shuffle(train_data)
            start = 0
            while start < train_data.shape[0]:
                _, loss = model.train_rs(sess, get_feed_dict_for_rs(model, train_data, start, start + args.batch_size))
                start += args.batch_size
                if show_loss:
                    print(loss)

            # KGE training
            if step % args.kge_interval == 0:
                np.random.shuffle(kg)
                start = 0
                while start < kg.shape[0]:
                    _, rmse = model.train_kge(sess, get_feed_dict_for_kge(model, kg, start, start + args.batch_size))
                    start += args.batch_size
                    if show_loss:
                        print(rmse)

            # CTR evaluation
            train_auc, train_acc = model.eval(sess, get_feed_dict_for_rs(model, train_data, 0, train_data.shape[0]))
            eval_auc, eval_acc = model.eval(sess, get_feed_dict_for_rs(model, eval_data, 0, eval_data.shape[0]))
            test_auc, test_acc = model.eval(sess, get_feed_dict_for_rs(model, test_data, 0, test_data.shape[0]))

            print('epoch %d    train auc: %.4f  acc: %.4f    eval auc: %.4f  acc: %.4f    test auc: %.4f  acc: %.4f'
                  % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc))
            logger.log('epoch %d    train auc: %.4f  acc: %.4f    eval auc: %.4f  acc: %.4f    test auc: %.4f  acc: %.4f'
                  % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc))

            # top-K evaluation
            if show_topk:
                precision, recall, f1 = topk_eval(
                    sess, model, user_list, train_record, test_record, item_set, k_list)
                print('precision: ', end='')
                logger.log('precision: ')
                for i in precision:
                    print('%.4f\t' % i, end='')
                    logger.log('%.4f\t' % i)
                print()
                print('recall: ', end='')
                logger.log('recall: ')
                for i in recall:
                    print('%.4f\t' % i, end='')
                    logger.log('%.4f\t' % i)
                print()
                print('f1: ', end='')
                logger.log('f1: ')
                for i in f1:
                    print('%.4f\t' % i, end='')
                    logger.log('%.4f\t' % i)
                print('\n')
            
            saver.save(sess, session_log_path + "models/epoch_{}".format(step))


def get_feed_dict_for_rs(model, data, start, end):
    feed_dict = {model.user_indices: data[start:end, 0],
                 model.item_indices: data[start:end, 1],
                 model.labels: data[start:end, 2],
                 model.head_indices: data[start:end, 1]}
    return feed_dict


def get_feed_dict_for_kge(model, kg, start, end):
    feed_dict = {model.item_indices: kg[start:end, 0],
                 model.head_indices: kg[start:end, 0],
                 model.relation_indices: kg[start:end, 1],
                 model.tail_indices: kg[start:end, 2]}
    return feed_dict


def topk_eval(sess, model, user_list, train_record, test_record, item_set, k_list):
    precision_list = {k: [] for k in k_list}
    recall_list = {k: [] for k in k_list}

    for user in user_list:
        test_item_list = list(item_set - train_record[user])
        item_score_map = dict()
        items, scores = model.get_scores(sess, {model.user_indices: [user] * len(test_item_list),
                                                model.item_indices: test_item_list,
                                                model.head_indices: test_item_list})
        for item, score in zip(items, scores):
            item_score_map[item] = score
        item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x: x[1], reverse=True)
        item_sorted = [i[0] for i in item_score_pair_sorted]

        for k in k_list:
            hit_num = len(set(item_sorted[:k]) & test_record[user])
            precision_list[k].append(hit_num / k)
            recall_list[k].append(hit_num / len(test_record[user]))

    precision = [np.mean(precision_list[k]) for k in k_list]
    recall = [np.mean(recall_list[k]) for k in k_list]
    f1 = [2 / (1 / precision[i] + 1 / recall[i]) for i in range(len(k_list))]

    return precision, recall, f1


def get_user_record(data, is_train):
    user_history_dict = dict()
    for interaction in data:
        user = interaction[0]
        item = interaction[1]
        label = interaction[2]
        if is_train or label == 1:
            if user not in user_history_dict:
                user_history_dict[user] = set()
            user_history_dict[user].add(item)
    return user_history_dict


In [6]:
# %load ../src/data_loader.py
import numpy as np
import os


def load_data(args):
    n_user, n_item, train_data, eval_data, test_data = load_rating(args)
    n_entity, n_relation, kg = load_kg(args)
    print('data loaded.')

    return n_user, n_item, n_entity, n_relation, train_data, eval_data, test_data, kg


def load_rating(args):
    print('reading rating file ...')

    # reading rating file
    rating_file = '../data/' + args.dataset + '/ratings_final'
    #rating_file = '../data/' + 'intersect-14m' + '/ratings_final'
    if os.path.exists(rating_file + '.npy'):
        rating_np = np.load(rating_file + '.npy')
    else:
        rating_np = np.loadtxt(rating_file + '.txt', dtype=np.int32)
        np.save(rating_file + '.npy', rating_np)

    n_user = max(set(rating_np[:, 0])) + 1
    n_item = max(set(rating_np[:, 1])) + 1
    train_data, eval_data, test_data = dataset_split(rating_np)

    return n_user, n_item, train_data, eval_data, test_data


def dataset_split(rating_np):
    print('splitting dataset ...')

    # train:eval:test = 6:2:2
    eval_ratio = 0.2
    test_ratio = 0.2
    n_ratings = rating_np.shape[0]

    eval_indices = np.random.choice(list(range(n_ratings)), size=int(n_ratings * eval_ratio), replace=False)
    left = set(range(n_ratings)) - set(eval_indices)
    test_indices = np.random.choice(list(left), size=int(n_ratings * test_ratio), replace=False)
    train_indices = list(left - set(test_indices))

    train_data = rating_np[train_indices]
    eval_data = rating_np[eval_indices]
    test_data = rating_np[test_indices]

    return train_data, eval_data, test_data


def load_kg(args):
    print('reading KG file ...')

    # reading kg file
    kg_file = '../data/' + args.dataset + '/kg_final'
    #kg_file = '../data/' + 'intersect-14m' + '/kg_final'
    if os.path.exists(kg_file + '.npy'):
        kg = np.load(kg_file + '.npy')
    else:
        kg = np.loadtxt(kg_file + '.txt', dtype=np.int32)
        np.save(kg_file + '.npy', kg)

    n_entity = max(set(kg[:, 0]) | set(kg[:, 2]))+1
    n_relation = max(set(kg[:, 1]))+1

    return n_entity, n_relation, kg


# Create Args

In [7]:
class Args:
    
    def __init__(self):
        self.dataset = 'movie'
        self.n_epoch = 20
        self.dim = 8
        self.L = 1
        self.H = 1
        self.batch_size = 4095
        self.l2_weight = 1e-6
        self.lr_rs = 0.000125
        self.lr_kge = 0.000125
        self.kge_interval = 3

args=Args()

# Load dataset

In [8]:
data_info = load_data(args)

reading rating file ...
splitting dataset ...
reading KG file ...
data loaded.


# Separate the preprocesssing data

In [9]:
n_user = data_info[0]
n_item = data_info[1]
n_entity = data_info[2]
n_relation = data_info[3]
train_data = data_info[4]
eval_data = data_info[5]
test_data = data_info[6]
kg = data_info[7]

In [10]:
n_user

6036

# Load Model

In [11]:
TEST_CODE = "1564108661.9731"
CHOSEN_EPOCH = 19

MODEL_PATH = "../log/{}/models/epoch_{}".format(TEST_CODE, CHOSEN_EPOCH)
LOG_PATH = "../log/{}/log.txt".format(TEST_CODE)

In [12]:
model = MKR(args, n_user, n_item, n_entity, n_relation)

W0729 07:42:14.484763 139874676356928 deprecation.py:506] From /home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0729 07:42:14.603126 139874676356928 deprecation.py:506] From <ipython-input-3-a1149e15351b>:47: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0729 07:42:14.607710 139874676356928 deprecation.py:506] From /home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:180: calling expand_dims (from tensorflow.python.ops.array_ops) with dim is deprecated and will be removed 

In [13]:
# Limit GPU usage
config = tf.ConfigProto()
config.gpu_options.allow_growth=True

In [14]:
# Add ops to save and restore all the variables.
saver = tf.train.Saver()

sess = tf.Session(config=config)
saver = tf.train.import_meta_graph(MODEL_PATH + ".meta")
saver.restore(sess, MODEL_PATH)

W0729 07:42:16.139000 139874676356928 deprecation.py:323] From /home/syahbimaa/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


# Create dictionary of positive ground truth

In [15]:
truth_dict = {}
for rating in tqdm(train_data):
    user_id, movie_id, score = rating
    
    if user_id not in truth_dict:
        truth_dict[user_id] = []
    
    if score == 1:
        truth_dict[user_id].append(movie_id)
        
for rating in tqdm(test_data):
    user_id, movie_id, score = rating
    
    if user_id not in truth_dict:
        truth_dict[user_id] = []
    
    if score == 1:
        truth_dict[user_id].append(movie_id)
        
for rating in tqdm(eval_data):
    user_id, movie_id, score = rating
    
    if user_id not in truth_dict:
        truth_dict[user_id] = []
    
    if score == 1:
        truth_dict[user_id].append(movie_id)

100%|██████████| 452264/452264 [00:01<00:00, 303895.79it/s]
100%|██████████| 150754/150754 [00:00<00:00, 281757.02it/s]
100%|██████████| 150754/150754 [00:00<00:00, 278494.86it/s]


# Check the frequency of user who liked n movies

In [16]:
from collections import Counter

ns = []
for key in truth_dict:
    n = len(truth_dict[key])
    ns.append(n)

ns = Counter(ns)

In [17]:
ns

Counter({25: 76,
         52: 48,
         26: 92,
         14: 125,
         57: 53,
         17: 123,
         53: 28,
         198: 7,
         50: 38,
         11: 132,
         30: 63,
         68: 45,
         108: 18,
         114: 15,
         90: 16,
         4: 22,
         80: 32,
         83: 19,
         75: 30,
         37: 50,
         63: 36,
         34: 55,
         44: 40,
         36: 71,
         18: 88,
         39: 56,
         116: 14,
         77: 31,
         197: 4,
         21: 98,
         20: 104,
         35: 57,
         8: 89,
         99: 16,
         15: 133,
         81: 20,
         60: 19,
         12: 147,
         97: 20,
         10: 140,
         22: 82,
         332: 2,
         208: 4,
         65: 34,
         5: 29,
         156: 5,
         69: 32,
         43: 49,
         13: 127,
         88: 14,
         92: 21,
         49: 46,
         9: 99,
         56: 36,
         41: 43,
         16: 134,
         28: 68,
         29: 74,
      

In [18]:
nscum = {}
last = 0
for k in sorted(ns):
    nscum[k] = ns[k] + last
    last = nscum[k]

In [19]:
import matplotlib.pyplot as plt

# cummulative plot
plt.figure(figsize=(20,14))
plt.plot(list(nscum.keys())[:50], list(nscum.values())[:50])

[<matplotlib.lines.Line2D at 0x7f35f1d9dcf8>]

# Evaluation

## 1. Create set of user and movies

In [20]:
user_set = set(truth_dict.keys())

In [21]:
movie_set = set()
for user in tqdm(user_set) :
    for movie in truth_dict[user] :
        movie_set.add(movie)

100%|██████████| 6036/6036 [00:00<00:00, 77491.37it/s]


In [22]:
movie_set

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 73,
 74,
 75,
 76,
 77,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190

In [23]:
item_score_map = dict()
items, scores = model.get_scores(sess, {model.user_indices: [1] * len(movie_set),
                                        model.item_indices: list(movie_set),
                                        model.head_indices: list(movie_set)})
for item, score in zip(items, scores):
        item_score_map[item] = score

item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x: x[1], reverse=True)
item_sorted = [i[0] for i in item_score_pair_sorted]

In [24]:
item_sorted

[694,
 1381,
 333,
 759,
 514,
 1159,
 371,
 162,
 636,
 0,
 704,
 921,
 1595,
 734,
 726,
 696,
 710,
 184,
 1483,
 201,
 736,
 721,
 658,
 762,
 695,
 1962,
 374,
 7,
 31,
 1139,
 564,
 1613,
 301,
 740,
 750,
 693,
 341,
 974,
 2035,
 226,
 372,
 1780,
 1998,
 1845,
 542,
 684,
 88,
 2001,
 807,
 1994,
 1349,
 724,
 1389,
 2216,
 1617,
 1808,
 701,
 16,
 2093,
 227,
 1117,
 544,
 26,
 1512,
 536,
 801,
 339,
 716,
 1261,
 623,
 709,
 2164,
 1732,
 1700,
 1701,
 2087,
 408,
 548,
 732,
 723,
 1684,
 2033,
 1775,
 1111,
 596,
 1891,
 717,
 2094,
 664,
 1622,
 1429,
 863,
 1702,
 12,
 1892,
 2075,
 1546,
 1698,
 1113,
 698,
 782,
 2153,
 2023,
 808,
 1739,
 1825,
 537,
 221,
 751,
 983,
 1826,
 754,
 375,
 1102,
 593,
 2296,
 97,
 629,
 1799,
 1108,
 240,
 1961,
 738,
 1214,
 1445,
 718,
 1889,
 1964,
 934,
 559,
 1654,
 757,
 543,
 463,
 733,
 1795,
 1846,
 713,
 1145,
 1960,
 1294,
 793,
 1251,
 539,
 1346,
 1854,
 1347,
 2190,
 1967,
 1105,
 2074,
 2060,
 1320,
 702,
 2189,
 232,
 4

## 2. Function to get scores for evrey user

In [25]:
def get_top_suggestion(user,k) : 
    item_score_map = dict()
    items, scores = model.get_scores(sess, {model.user_indices: [user] * len(movie_set),
                                            model.item_indices: list(movie_set),
                                            model.head_indices: list(movie_set)})
    for item, score in zip(items, scores):
            item_score_map[item] = score

    item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x: x[1], reverse=True)
    item_sorted = [i[0] for i in item_score_pair_sorted]
    
    return item_score_pair_sorted[:k]

In [26]:
def get_top_truth(user, k):
    if user not in truth_dict:
        return []
    #ERASE [:k]
    return truth_dict[user]

In [27]:
def get_intersect_pred_truth(pred, truth, k):
    pred_item_set = {x[0] for x in pred}
    truth_item_set = set(truth)
    
    return pred_item_set.intersection(truth_item_set)

def check_precision_at_k(sample_user, k):
    
    pred = get_top_suggestion(sample_user, k)
    truth = get_top_truth(sample_user, k)
    
    intersect = get_intersect_pred_truth(pred, truth, k)
    
    if len(truth) > 0 :
        return intersect, len(intersect) / len(truth)
    else:
        return {}, 0

# Create n user of sample

In [28]:
import random

random.seed(1234)

user_sample_500 = random.sample(user_set,500)
user_sample_1000 = random.sample(user_set,1000)
user_sample_3000 = random.sample(user_set,3000)
user_sample_5000 = random.sample(user_set,5000)


# Check the precision ot K

## 500 user

In [29]:
prec = []
intersect = []

for i in tqdm(user_sample_500):
    
    try:
        isec, p = check_precision_at_k(i, 10)
    except:
        p = 0
        isec = {}
        print("error occur for {}".format(i))
        
    prec.append(p)
    intersect.append(isec)

100%|██████████| 500/500 [00:02<00:00, 243.88it/s]


In [30]:
import numpy as np

np.average(prec)

0.08280191963741572

## 1000 

In [31]:
prec = []
intersect = []

for i in tqdm(user_sample_1000):
    
    try:
        isec, p = check_precision_at_k(i, 10)
    except:
        p = 0
        isec = {}
        print("error occur for {}".format(i))
        
    prec.append(p)
    intersect.append(isec)

100%|██████████| 1000/1000 [00:04<00:00, 243.88it/s]


In [32]:
import numpy as np

np.average(prec)

0.0838251287102731

## 3000

In [33]:
prec = []
intersect = []

for i in tqdm(user_sample_3000):
    
    try:
        isec, p = check_precision_at_k(i, 10)
    except:
        p = 0
        isec = {}
        print("error occur for {}".format(i))
        
    prec.append(p)
    intersect.append(isec)

100%|██████████| 3000/3000 [00:12<00:00, 242.15it/s]


In [34]:
import numpy as np

np.average(prec)

0.08095399514205776

## 5000

In [35]:
prec = []
intersect = []

for i in tqdm(user_sample_5000):
    
    try:
        isec, p = check_precision_at_k(i, 10)
    except:
        p = 0
        isec = {}
        print("error occur for {}".format(i))
        
    prec.append(p)
    intersect.append(isec)

100%|██████████| 5000/5000 [00:20<00:00, 241.18it/s]


In [36]:
import numpy as np

np.average(prec)

0.08243254951731796

# Check Diversity

## 500 sample user

In [37]:
offset = 190 # discard top n suggestion
k = 10

sample_user = user_sample_500

intersect = {x[1] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = intersect
for i in range(1, 10):
    s = {x[1] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    #print(sorted(s))
    intersect = intersect.intersection(s)
    uni = uni.union(s)
    
#print("\nintersect")
#print(intersect, len(intersect))
#print("\nunion")
#print(uni, len(uni))
print("\ndistinct rate")
print((len(uni)) / (10*k))


distinct rate
1.0


## 1000 sample user

In [38]:
offset = 190 # discard top n suggestion
k = 10

sample_user = user_sample_1000

intersect = {x[1] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = intersect
for i in tqdm(range(1, 10)):
    s = {x[1] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    #print(sorted(s))
    intersect = intersect.intersection(s)
    uni = uni.union(s)
    
#print("\nintersect")
#print(intersect, len(intersect))
#print("\nunion")
#print(uni, len(uni))
print("\ndistinct rate")
print((len(uni)) / (10*k))

100%|██████████| 9/9 [00:00<00:00, 239.43it/s]


distinct rate
1.0





## 3000 sample user

In [39]:
offset = 190 # discard top n suggestion
k = 10

sample_user = user_sample_3000

intersect = {x[1] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = intersect
for i in tqdm(range(1, 10)):
    s = {x[1] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    #print(sorted(s))
    intersect = intersect.intersection(s)
    uni = uni.union(s)
    
#print("\nintersect")
#print(intersect, len(intersect))
#print("\nunion")
#print(uni, len(uni))
print("\ndistinct rate")
print((len(uni)) / (10*k))

100%|██████████| 9/9 [00:00<00:00, 243.57it/s]


distinct rate
1.0





## 5000 sample user

In [40]:
offset = 190 # discard top n suggestion
k = 10

sample_user = user_sample_5000

intersect = {x[1] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = intersect
for i in tqdm(range(1, 10)):
    s = {x[1] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
    #print(sorted(s))
    intersect = intersect.intersection(s)
    uni = uni.union(s)
    
#print("\nintersect")
#print(intersect, len(intersect))
#print("\nunion")
#print(uni, len(uni))
print("\ndistinct rate")
print((len(uni)) / (10*k))

100%|██████████| 9/9 [00:00<00:00, 231.30it/s]


distinct rate
1.0





# Check Unique Movies

## 500 sample user

In [41]:
unique_movies = set()

for user in tqdm(user_sample_500) :    
    pred = get_top_suggestion(user,10)
    for x in pred :
        unique_movies.add(x[0])
        
print(len(unique_movies))    

100%|██████████| 500/500 [00:02<00:00, 245.86it/s]

157





## 1000 sample user

In [42]:
unique_movies = set()

for user in tqdm(user_sample_1000) :    
    pred = get_top_suggestion(user,10)
    for x in pred :
        unique_movies.add(x[0])
        
print(len(unique_movies))    

100%|██████████| 1000/1000 [00:04<00:00, 245.27it/s]

159





## 3000 sample user

In [43]:
unique_movies = set()

for user in tqdm(user_sample_3000) :    
    pred = get_top_suggestion(user,10)
    for x in pred :
        unique_movies.add(x[0])
        
print(len(unique_movies))    

100%|██████████| 3000/3000 [00:12<00:00, 243.55it/s]

168





## 5000 sample user

In [44]:
unique_movies = set()

for user in tqdm(user_sample_5000) :    
    pred = get_top_suggestion(user,10)
    for x in pred :
        unique_movies.add(x[0])
        
print(len(unique_movies))    

100%|██████████| 5000/5000 [00:20<00:00, 242.90it/s]

175





# Generate top k recommendation for 10 selected user

In [45]:
pred = get_top_suggestion(125,10)
pred_set = [x[0] for x in pred]

In [46]:
pred_set

[333, 31, 1381, 1680, 184, 201, 1595, 1548, 1846, 1739]

In [47]:
set(truth_dict[125]).intersection(pred)

set()

In [48]:
#user_test_list = [20169,66966,82374,4296,10204,123623,115870,128970,83750,97239]
user_test_list = random.sample(user_set,10)
user_test_list

[2164, 4106, 5516, 4583, 1637, 236, 779, 5651, 4920, 1252]

In [49]:
topk_pred_dict = {}

for user in user_test_list :
    pred_tuple = get_top_suggestion(user-1,10)
    pred = [x[0] for x in pred_tuple]
    topk_pred_dict[user] = pred    

In [50]:
topk_pred_dict

{2164: [694, 162, 2075, 371, 736, 704, 651, 301, 1483, 793],
 4106: [694, 162, 736, 371, 704, 301, 1613, 1483, 651, 2075],
 5516: [694, 736, 162, 1613, 371, 704, 808, 734, 341, 636],
 4583: [2075, 371, 1483, 162, 1512, 2243, 301, 704, 781, 694],
 1637: [2075, 700, 371, 731, 162, 1483, 694, 1208, 704, 341],
 236: [333, 1381, 1159, 1595, 371, 694, 734, 1483, 0, 184],
 779: [700, 694, 1483, 371, 333, 31, 184, 1595, 162, 1551],
 5651: [333, 31, 184, 1680, 734, 1595, 201, 1381, 1846, 374],
 4920: [333, 31, 1680, 1381, 201, 1548, 184, 1846, 1739, 1595],
 1252: [333, 1381, 31, 201, 12, 184, 684, 227, 724, 807]}

# Convert ID to Object

In [51]:
movies = open("../data/intersect-14m/moviesIdx2.txt").readlines()

In [52]:
dict_entities = {}
for movie in movies:
    x = movie.strip().split()
    movie_id = int(x[0])
    movie_name = x[1]
    dict_entities[movie_id] = movie_name

In [53]:
def get_list_movie(id_movie_list, truth_list) :
    res = []
    for id_movie in id_movie_list :
        is_watched = "watched" if id_movie in truth_list else "nope"
        content = dict_entities[int(id_movie)] + " > " + is_watched 
        res.append(content)
    return res

In [54]:
topk_pred_dict_title = {}

for user in topk_pred_dict.keys() :
    topk_pred_dict_title[user] = get_list_movie(topk_pred_dict[user],truth_dict[user])

In [55]:
topk_pred_dict_title

{2164: ['http://dbpedia.org/resource/American_Heart > watched',
  'http://dbpedia.org/resource/3_Bad_Men > watched',
  'http://dbpedia.org/resource/Branded_(1920_film) > watched',
  'http://dbpedia.org/resource/Adam_Resurrected > watched',
  'http://dbpedia.org/resource/American_Wedding > nope',
  'http://dbpedia.org/resource/American_Mullet > watched',
  'http://dbpedia.org/resource/Always_Leave_Them_Laughing > watched',
  'http://dbpedia.org/resource/Abduction_(2011_film) > watched',
  'http://dbpedia.org/resource/Belly_(film) > watched',
  'http://dbpedia.org/resource/And_When_Did_You_Last_See_Your_Father%3F > nope'],
 4106: ['http://dbpedia.org/resource/American_Heart > nope',
  'http://dbpedia.org/resource/3_Bad_Men > nope',
  'http://dbpedia.org/resource/American_Wedding > nope',
  'http://dbpedia.org/resource/Adam_Resurrected > nope',
  'http://dbpedia.org/resource/American_Mullet > nope',
  'http://dbpedia.org/resource/Abduction_(2011_film) > nope',
  'http://dbpedia.org/resour

In [56]:
import pandas as pd

topk_pred_df = pd.DataFrame(topk_pred_dict_title)

In [57]:
topk_pred_df.to_csv('topk_pred2.csv')