In [36]:
def update_kg_final(train, test, kg):
    """
    Function to update kg_final by checking the input train.txt and test.txt, to identify the largest item index. 
    Then, on kg_final, reduce all entities with index > that identified earlier.
    E.g. max(itemID) reduced from 100 to 50, then to kg_final:
        all triplets with index 51 to 100 are removed
        all indexes with values 101 and above are reduced by 50 (i.e. 101 -> 51, 102 -> 52)
    Argument
        train: train.txt:
        test: test.txt:
        kg: kg_final.txt
    Return:
        kg_final_chg.txt: Same as kg_final.txt, after removing all itemID not found in train/test.txt
    """
    import numpy as np
    
    # Train.txt
    user_dict = dict()
    inter_mat = list()
    lines = open(train, 'r').readlines()
    for l in lines:
        tmps = l.strip()
        inters = [int(i) for i in tmps.split(' ')]
        u_id, pos_ids = inters[0], inters[1:]
        pos_ids = list(set(pos_ids))
        for i_id in pos_ids:
            inter_mat.append([u_id, i_id])
        if len(pos_ids) > 0:
            user_dict[u_id] = pos_ids
    train_data = np.array(inter_mat)
    max_item_train = max(train_data[:,1])
    
    # Text.txt
    user_dict = dict()
    inter_mat = list()
    lines = open(test, 'r').readlines()
    for l in lines:
        tmps = l.strip()
        inters = [int(i) for i in tmps.split(' ')]
        u_id, pos_ids = inters[0], inters[1:]
        pos_ids = list(set(pos_ids))
        for i_id in pos_ids:
            inter_mat.append([u_id, i_id])
        if len(pos_ids) > 0:
            user_dict[u_id] = pos_ids
    test_data = np.array(inter_mat)
    max_item_test = max(test_data[:,1])
    
    # Check all indexes are valid
    has_all_index_train = np.isin(np.arange(max_item_train), train_data[:,1])
    miss_train_index = train_data[np.logical_not(has_all_index_train), 1]
    
    has_all_index_test = np.isin(np.arange(max_item_test), test_data[:,1])
    miss_test_index = test_data[np.logical_not(has_all_index_test), 1]
    
    return max_item_train, max_item_test, has_all_index_train.all(), has_all_index_test.all(), miss_train_index, miss_test_index

In [37]:
update_kg_final('train.txt', 'test.txt', 'kg_final.txt')

IndexError: boolean index did not match indexed array along dimension 0; dimension is 5023 but corresponding boolean dimension is 3490

In [133]:
import numpy as np
train = 'train.txt'
test = 'test.txt'
    
# Train.txt
user_dict = dict()
inter_mat = list()
lines = open(train, 'r').readlines()
for l in lines:
    tmps = l.strip()
    inters = [int(i) for i in tmps.split(' ')]
    u_id, pos_ids = inters[0], inters[1:]
    pos_ids = list(set(pos_ids))
    for i_id in pos_ids:
        inter_mat.append([u_id, i_id])
    if len(pos_ids) > 0:
        user_dict[u_id] = pos_ids
train_data = np.array(inter_mat)
max_item_train = max(train_data[:,1])

# Text.txt
user_dict = dict()
inter_mat = list()
lines = open(test, 'r').readlines()
for l in lines:
    tmps = l.strip()
    inters = [int(i) for i in tmps.split(' ')]
    u_id, pos_ids = inters[0], inters[1:]
    pos_ids = list(set(pos_ids))
    for i_id in pos_ids:
        inter_mat.append([u_id, i_id])
    if len(pos_ids) > 0:
        user_dict[u_id] = pos_ids
test_data = np.array(inter_mat)
max_item_test = max(test_data[:,1])

# Check all indexes are valid
has_all_index_train = np.isin(np.arange(max_item_train), train_data[:,1])
miss_train_index = np.arange(max_item_train)[np.logical_not(has_all_index_train)]
has_all_index_test = np.isin(np.arange(max_item_test), test_data[:,1])
miss_test_index = np.arange(max_item_test)[np.logical_not(has_all_index_test)]
print('Train\n', max_item_train, miss_train_index)
print('\nTest\n',  max_item_test, miss_test_index)

Train
 42153 []

Test
 45534 [    1     3    15 ... 45518 45525 45526]


In [115]:
DEFAULT_MAX_TRAIN_TEST = 45537
max_train_test = max(max_item_train, max_item_test)
entity_offset = DEFAULT_MAX_TRAIN_TEST - max_train_test

# Read kg_final
kg_np = np.loadtxt('kg_final.txt', dtype=np.int32)
kg_np = np.unique(kg_np, axis=0)

# Create new_kg_final
new_kg_np = np.copy(kg_np)
new_kg_np[:,0][kg_np[:,0] > DEFAULT_MAX_TRAIN_TEST] = kg_np[:,0][kg_np[:,0] > DEFAULT_MAX_TRAIN_TEST]-entity_offset
new_kg_np[:,2][kg_np[:,2] > DEFAULT_MAX_TRAIN_TEST] = kg_np[:,2][kg_np[:,2] > DEFAULT_MAX_TRAIN_TEST]-entity_offset
print('Saving to new text, where original :\n',
      'kg_np.shape = ', kg_np.shape, 
      '\nmaximum index at 0, 2 are {} and {}'.format(max(kg_np[:,0]), max(kg_np[:,2])),
      '\n\nand new :\n',
      'new_kg_np.shape = ', new_kg_np.shape, 
      '\nmaximum index at 0, 2 are {} and {}'.format(max(new_kg_np[:,0]), max(new_kg_np[:,2])))
#       new_kg_np.shape, kg_np.shape, max(new_kg_np[:,0]), max(new_kg_np[:,2]), max(kg_np[:,0]), max(kg_np[:,2]))
np.savetxt('new_kg_final.txt', new_kg_np, delimiter=' ', fmt='%i')

Saving to new text, where original :
 kg_np.shape =  (2005, 3) 
maximum index at 0, 2 are 46956 and 46932 

and new :
 new_kg_np.shape =  (2005, 3) 
maximum index at 0, 2 are 46955 and 46931


In [118]:
print(kg_np[:,0][kg_np[:,0] > DEFAULT_MAX_TRAIN_TEST-30][:20])
print(new_kg_np[:,0][kg_np[:,0] > DEFAULT_MAX_TRAIN_TEST-30][:20])
print(new_kg_np[:,2][kg_np[:,0] > DEFAULT_MAX_TRAIN_TEST-30][:20])

[45525 45538 45541 45543 45546 45548 45550 45553 45555 45556 45559 45562
 45563 45568 45570 45573 45576 45578 45580 45582]
[45525 45537 45540 45542 45545 45547 45549 45552 45554 45555 45558 45561
 45562 45567 45569 45572 45575 45577 45579 45581]
[45703 45538 45541 45543 45546 45548 45550 45553 45544 45556 45559 45544
 45563 45568 45564 45573 45576 45578 45544 45582]


In [73]:
print(kg_np[:100])

[[    5    16 45601]
 [   13     2 45567]
 [   23     2 45762]
 [   94     6 45549]
 [   96     9 45623]
 [   97    14 45683]
 [  101    22 45615]
 [  108    16 45601]
 [  138    16 45601]
 [  205    20 45626]
 [  254    16 45601]
 [  281    20 45609]
 [  285    10 45560]
 [  330    22 45615]
 [  371    13 45572]
 [  388    13 45572]
 [  493     2 45795]
 [  525    28 45687]
 [  531     0 45786]
 [  551    23 45618]
 [  650    16 45726]
 [  669     9 45554]
 [  699     2 45994]
 [  704    18 45592]
 [  707     5 45596]
 [  765    14 45648]
 [  783     0 46568]
 [  948     9 45554]
 [  973     1 45540]
 [ 1014     5 45547]
 [ 1079     2 45565]
 [ 1086     0 45772]
 [ 1086    14 45898]
 [ 1103     5 45547]
 [ 1194    18 45592]
 [ 1199    16 46093]
 [ 1244     2 46329]
 [ 1443    16 45587]
 [ 1468     0 46081]
 [ 1498     9 45554]
 [ 1564     0 45838]
 [ 1575    36 46206]
 [ 1606     4 45545]
 [ 1632     2 45567]
 [ 1656    23 45618]
 [ 1667    10 45560]
 [ 1671    18 45592]
 [ 1678     2

In [70]:
print(new_kg_np[:5])

[[    5    16 45600]
 [   13     2 45566]
 [   23     2 45761]
 [   94     6 45548]
 [   96     9 45622]]


In [12]:

# self.n_relations = len(set(kg_np[:, 1]))
# self.n_entities = len(set(kg_np[:, 0]) | set(kg_np[:, 2]))
n_relations = max(kg_np[:, 1]) + 1
n_entities = max(max(kg_np[:, 0]), max(kg_np[:, 2])) + 1
n_triples = len(kg_np)

kg_dict, relation_dict = _construct_kg(kg_np)

In [40]:
max_kg_0 = max(kg_np[:,0])
max_kg_2 = max(kg_np[:,2])
print(max_kg_0, max_kg_2)
has_all_index_kg = np.isin(np.arange(max_kg_0), kg_np[:,0])
has_all_index_kg_2 = np.isin(np.arange(max_kg_0), kg_np[:,2])
kg_entity_index = has_all_index_kg[max_item_train:]
kg_entity_index_2 = has_all_index_kg_2[max_item_train:]
print(max_kg_0, max_item_train, '\n', kg_entity_index[0:50], kg_entity_index[-10:])
print(max_kg_0, max_item_train, '\n', kg_entity_index_2[0:50], kg_entity_index_2[-10:])
np.logical_or(kg_entity_index[1:], kg_entity_index_2[1:]).all()

46956 46932
46956 45537 
 [False  True False False  True False  True False False  True False  True
 False  True False False  True False  True  True False False  True False
 False  True  True False False False False  True False  True False False
  True False False  True False  True False  True False  True False  True
 False  True] [ True  True  True  True  True  True  True  True  True  True]
46956 45537 
 [False False  True  True False  True False  True  True False  True False
  True False  True  True False  True False False  True  True False  True
  True False False  True  True  True  True False  True False  True  True
 False  True  True False  True False  True False  True False  True False
  True False] [False False False False False False False False False False]


True

In [27]:
print(kg_np.shape, '\n', kg_np)

(2005, 3) 
 [[    5    16 45601]
 [   13     2 45567]
 [   23     2 45762]
 ...
 [46954     2 45567]
 [46955    18 46151]
 [46956     0 45564]]


In [6]:
np.isin(np.arange(max_item_train), train_data[:,1])

array([ True,  True,  True, False,  True])

In [22]:
a = range(1,10)
b = list(list())
print(type(a), type(b), a, b, a==b)

<class 'range'> <class 'list'> range(1, 10) [] False


In [2]:
train_data

array([[0, 0],
       [1, 1],
       [2, 2],
       [3, 3],
       [3, 4],
       [4, 4]])

In [5]:
import numpy as np
user_dict = dict()
inter_mat = list()

lines = open('train.txt', 'r').readlines()
for l in lines:
    tmps = l.strip()
    inters = [int(i) for i in tmps.split(' ')]

    u_id, pos_ids = inters[0], inters[1:]
    pos_ids = list(set(pos_ids))

    for i_id in pos_ids:
        inter_mat.append([u_id, i_id])

    if len(pos_ids) > 0:
        user_dict[u_id] = pos_ids

train_data = np.array(inter_mat)

In [8]:
max(train_data[:,1])

45537

In [5]:
def load_ratings(file_name):
    import numpy as np
    user_dict = dict()
    inter_mat = list()

    lines = open(file_name, 'r').readlines()
    for l in lines:
        tmps = l.strip()
        inters = [int(i) for i in tmps.split(' ')]

        u_id, pos_ids = inters[0], inters[1:]
        pos_ids = list(set(pos_ids))

        for i_id in pos_ids:
            inter_mat.append([u_id, i_id])

        if len(pos_ids) > 0:
            user_dict[u_id] = pos_ids
    return np.array(inter_mat), user_dict

In [14]:
def load_kg(file_name):
    import collections
    import numpy as np
    def _construct_kg(kg_np):
        kg = collections.defaultdict(list)
        rd = collections.defaultdict(list)

        for head, relation, tail in kg_np:
            kg[head].append((tail, relation))
            rd[relation].append((head, tail))
        return kg, rd

    kg_np = np.loadtxt(file_name, dtype=np.int32)
    kg_np = np.unique(kg_np, axis=0)

    # self.n_relations = len(set(kg_np[:, 1]))
    # self.n_entities = len(set(kg_np[:, 0]) | set(kg_np[:, 2]))
    n_relations = max(kg_np[:, 1]) + 1
    n_entities = max(max(kg_np[:, 0]), max(kg_np[:, 2])) + 1
    n_triples = len(kg_np)

    kg_dict, relation_dict = _construct_kg(kg_np)

    return kg_np, kg_dict, relation_dict

In [20]:
inter_mat, user_dict = load_ratings('E:/GitHub/knowledge_graph_attention_network/Data/own_data/own_train.txt')
print(inter_mat.shape)
print(inter_mat)
print(user_dict)

(10, 2)
[[0 0]
 [0 1]
 [0 2]
 [0 3]
 [1 1]
 [2 2]
 [3 3]
 [4 1]
 [4 2]
 [4 3]]
{0: [0, 1, 2, 3], 1: [1], 2: [2], 3: [3], 4: [1, 2, 3]}


In [27]:
kg_np, kg_dict, relation_dict = load_kg('E:/GitHub/knowledge_graph_attention_network/Data/own_data/own_kg.txt')

In [28]:
print(kg_np)
print(kg_dict)
print(relation_dict)

[[0 3 1]
 [0 4 2]]
defaultdict(<class 'list'>, {0: [(1, 3), (2, 4)]})
defaultdict(<class 'list'>, {3: [(0, 1)], 4: [(0, 2)]})


In [29]:
45537+45918

91455