In [None]:
import tensorflow as tf
import numpy as np
from model import RippleNet

def train(args, data_info, show_loss):
    train_data = data_info[0]
    eval_data = data_info[1]
    test_data = data_info[2]
    n_entity = data_info[3]
    n_relation = data_info[4]
    ripple_set = data_info[5]

    model = RippleNet(args, n_entity, n_relation)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    for step in range(args.n_epoch):
        print("checker ---------------->")
        # training
        np.random.shuffle(train_data)
        start = 0
        while start < train_data.shape[0]:
            _, loss = model.train(
                sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size))
            start += args.batch_size
            if show_loss:
                print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss))

        # evaluation
        train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size)
        eval_auc, eval_acc = evaluation(sess, args, model, eval_data, ripple_set, args.batch_size)
        test_auc, test_acc = evaluation(sess, args, model, test_data, ripple_set, args.batch_size)

        print('epoch %d    train auc: %.4f  acc: %.4f    eval auc: %.4f  acc: %.4f    test auc: %.4f  acc: %.4f'
            % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc))

    return model, train_data, eval_data, test_data, n_entity, n_relation, ripple_set, sess, args


def get_feed_dict(args, model, data, ripple_set, start, end):
    feed_dict = dict()
    feed_dict[model.items] = data[start:end, 1]
    feed_dict[model.labels] = data[start:end, 2]
    for i in range(args.n_hop):
        feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]]
        feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]]
        feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]]
    return feed_dict


def evaluation(sess, args, model, data, ripple_set, batch_size):
    start = 0
    auc_list = []
    acc_list = []
    while start < data.shape[0]:
        auc, acc = model.eval(sess, get_feed_dict(args, model, data, ripple_set, start, start + batch_size))
        auc_list.append(auc)
        acc_list.append(acc)
        start += batch_size
    return float(np.mean(auc_list)), float(np.mean(acc_list))


In [2]:
import argparse
import numpy as np
import sys
import argparse

RATING_FILE_NAME = dict({'movie': 'ratings.dat', 'book': 'BX-Book-Ratings.csv', 'news': 'ratings.txt', 'tender': 'train_data.txt'})
SEP = dict({'movie': '::', 'book': ';', 'news': '\t', 'tender': '::'})
THRESHOLD = dict({'movie': 4, 'book': 0, 'news': 0, 'tender': 0})


def read_item_index_to_entity_id_file():
    file = '../data/' + DATASET + '/item_index2entity_id_rehashed.txt'
    print('reading item index to entity id file: ' + file + ' ...')
    i = 0
    for line in open(file, encoding='utf-8').readlines():
        item_index = line.strip().split('\t')[0]
        satori_id = line.strip().split('\t')[1]
        item_index_old2new[item_index] = i
        entity_id2index[satori_id] = i
        i += 1


def convert_rating():
    file = '../data/' + DATASET + '/' + RATING_FILE_NAME[DATASET]

    print('reading rating file ...')
    item_set = set(item_index_old2new.values())
    user_pos_ratings = dict()
    user_neg_ratings = dict()

    for line in open(file, encoding='utf-8').readlines()[1:]:

        array = line.strip().split(SEP[DATASET])

        # remove prefix and suffix quotation marks for BX dataset
        if DATASET == 'book':
            array = list(map(lambda x: x[1:-1], array))

        item_index_old = array[1]
        if item_index_old not in item_index_old2new:  # the item is not in the final item set
            continue
        item_index = item_index_old2new[item_index_old]

        user_index_old = int(array[0])

        rating = float(array[2])
        if rating >= THRESHOLD[DATASET]:
            if user_index_old not in user_pos_ratings:
                user_pos_ratings[user_index_old] = set()
            user_pos_ratings[user_index_old].add(item_index)
        else:
            if user_index_old not in user_neg_ratings:
                user_neg_ratings[user_index_old] = set()
            user_neg_ratings[user_index_old].add(item_index)

    print('converting rating file ...')
    writer = open('../data/' + DATASET + '/ratings_final.txt', 'w', encoding='utf-8')
    user_cnt = 0
    user_index_old2new = dict()
    for user_index_old, pos_item_set in user_pos_ratings.items():
        if user_index_old not in user_index_old2new:
            user_index_old2new[user_index_old] = user_cnt
            user_cnt += 1
        user_index = user_index_old2new[user_index_old]

        for item in pos_item_set:
            writer.write('%d\t%d\t1\n' % (user_index, item))
        unwatched_set = item_set - pos_item_set
        if user_index_old in user_neg_ratings:
            unwatched_set -= user_neg_ratings[user_index_old]
        for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False):
            writer.write('%d\t%d\t0\n' % (user_index, item))
    writer.close()
    print('number of users: %d' % user_cnt)
    print('number of items: %d' % len(item_set))


def convert_kg():
    print('converting kg file ...')
    entity_cnt = len(entity_id2index)
    relation_cnt = 0

    writer = open('../data/' + DATASET + '/kg_final.txt', 'w', encoding='utf-8')

    files = []
    if DATASET == 'movie':
        files.append(open('../data/' + DATASET + '/kg_part1_rehashed.txt', encoding='utf-8'))
        files.append(open('../data/' + DATASET + '/kg_part2_rehashed.txt', encoding='utf-8'))
    elif DATASET == 'tender':
        files.append(open('../data/' + DATASET + '/KG.csv', encoding='utf-8'))

    for file in files:
        for line in file:
            array = line.strip().split('\t')
            head_old = array[0]
            relation_old = array[1]
            tail_old = array[2]

            if head_old not in entity_id2index:
                entity_id2index[head_old] = entity_cnt
                entity_cnt += 1
            head = entity_id2index[head_old]

            if tail_old not in entity_id2index:
                entity_id2index[tail_old] = entity_cnt
                entity_cnt += 1
            tail = entity_id2index[tail_old]

            if relation_old not in relation_id2index:
                relation_id2index[relation_old] = relation_cnt
                relation_cnt += 1
            relation = relation_id2index[relation_old]

            writer.write('%d\t%d\t%d\n' % (head, relation, tail))

    writer.close()
    print('number of entities (containing items): %d' % entity_cnt)
    print('number of relations: %d' % relation_cnt)


if __name__ == '__main__':
    np.random.seed(555)

    sys.argv = ['preprocess.py', '--dataset', 'tender']
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str, default='movie', help='which dataset to preprocess')
    args = parser.parse_args()
    DATASET = args.dataset

    entity_id2index = dict()
    relation_id2index = dict()
    item_index_old2new = dict()

    read_item_index_to_entity_id_file()
    convert_rating()
    convert_kg()

    print('done')

reading item index to entity id file: ../data/tender/item_index2entity_id_rehashed.txt ...
reading rating file ...


KeyboardInterrupt: 

In [2]:
import argparse
import numpy as np
from data_loader import load_data
import sys
import argparse

np.random.seed(555)

sys.argv = ['main.py', '--dataset', 'tender']

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='movie', help='which dataset to use')
parser.add_argument('--dim', type=int, default=16, help='dimension of entity and relation embeddings')
parser.add_argument('--n_hop', type=int, default=2, help='maximum hops')
parser.add_argument('--kge_weight', type=float, default=0.01, help='weight of the KGE term')
parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of the l2 regularization term')
parser.add_argument('--lr', type=float, default=0.02, help='learning rate')
parser.add_argument('--batch_size', type=int, default=1024, help='batch size')
parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs')
parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop')
parser.add_argument('--item_update_mode', type=str, default='plus_transform',
                    help='how to update item at the end of each hop')
parser.add_argument('--using_all_hops', type=bool, default=True,
                    help='whether using outputs of all hops or just the last hop when making prediction')

'''
# default settings for Book-Crossing
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='book', help='which dataset to use')
parser.add_argument('--dim', type=int, default=4, help='dimension of entity and relation embeddings')
parser.add_argument('--n_hop', type=int, default=2, help='maximum hops')
parser.add_argument('--kge_weight', type=float, default=1e-2, help='weight of the KGE term')
parser.add_argument('--l2_weight', type=float, default=1e-5, help='weight of the l2 regularization term')
parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
parser.add_argument('--batch_size', type=int, default=1024, help='batch size')
parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs')
parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop')
parser.add_argument('--item_update_mode', type=str, default='plus_transform',
                    help='how to update item at the end of each hop')
parser.add_argument('--using_all_hops', type=bool, default=True,
                    help='whether using outputs of all hops or just the last hop when making prediction')
'''

args = parser.parse_args()
print('args', args)
show_loss = False
data_info = load_data(args)
print('data', data_info)
result = train(args, data_info, show_loss)
print('result')


args Namespace(batch_size=1024, dataset='tender', dim=16, item_update_mode='plus_transform', kge_weight=0.01, l2_weight=1e-07, lr=0.02, n_epoch=10, n_hop=2, n_memory=32, using_all_hops=True)
reading rating file ...
splitting dataset ...
reading KG file ...
constructing knowledge graph ...
constructing ripple set ...
data (array([[     0, 108944,      1],
       [     0, 103443,      1],
       [     0,  68982,      0],
       ...,
       [  8108,  83549,      1],
       [  8108,  96566,      1],
       [  8108,  98834,      0]]), array([[  7266, 101660,      0],
       [  1560, 126991,      1],
       [  7877, 128264,      0],
       ...,
       [  6193,  39916,      0],
       [  7804,  89509,      1],
       [  4423, 117341,      1]]), array([[  5999, 119681,      1],
       [  8072,  84271,      1],
       [  5411,  60089,      0],
       ...,
       [  5146, 100414,      0],
       [  5782, 120775,      1],
       [  5636,  36408,      0]]), 116346, 8, defaultdict(<class 'list'>, {

InvalidArgumentError: indices[1009,0] = 125763 is not in [0, 116346)
	 [[node embedding_lookup_1 (defined at d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\framework\ops.py:1748) ]]

Original stack trace for 'embedding_lookup_1':
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\traitlets\config\application.py", line 1043, in launch_instance
    app.start()
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
    self.io_loop.start()
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 523, in run_forever
    self._run_once()
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 1758, in _run_once
    handle._run()
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python37\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
    await self.process_one()
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
    await dispatch(*args)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
    await result
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
    reply_content = await reply_content
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel\ipkernel.py", line 387, in do_execute
    cell_id=cell_id,
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
    return super().run_cell(*args, **kwargs)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\IPython\core\interactiveshell.py", line 2976, in run_cell
    raw_cell, store_history, silent, shell_futures, cell_id
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\IPython\core\interactiveshell.py", line 3030, in _run_cell
    return runner(coro)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\IPython\core\async_helpers.py", line 78, in _pseudo_sync_runner
    coro.send(None)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\IPython\core\interactiveshell.py", line 3258, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\IPython\core\interactiveshell.py", line 3473, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Dell\AppData\Local\Temp\ipykernel_25320\2368556571.py", line 49, in <module>
    result = train(args, data_info, show_loss)
  File "C:\Users\Dell\AppData\Local\Temp\ipykernel_25320\129870554.py", line 13, in train
    model = RippleNet(args, n_entity, n_relation)
  File "d:\Github\Diplom\RippleNet\src\model.py", line 11, in __init__
    self._build_model()
  File "d:\Github\Diplom\RippleNet\src\model.py", line 63, in _build_model
    self.h_emb_list.append(tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_h[i]))
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\ops\embedding_ops.py", line 317, in embedding_lookup
    transform_fn=None)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\ops\embedding_ops.py", line 135, in _embedding_lookup_and_transform
    array_ops.gather(params[0], ids, name=name), ids, max_norm)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\util\dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 3956, in gather
    params, indices, axis, name=name)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\ops\gen_array_ops.py", line 4081, in gather_v2
    batch_dims=batch_dims, name=name)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "d:\Github\Diplom\RippleNet\env\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()


In [None]:
model, train_data, eval_data, test_data, n_entity, n_relation, ripple_set, sess, args = result
start = 0

model.eval(sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size))

model.predict(sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size))

(0.9574268201867445, 0.9072265625)