In [36]:
def update_kg_final(train, test, kg):
    """
    Function to update kg_final by checking the input train.txt and test.txt, to identify the largest item index. 
    Then, on kg_final, reduce all entities with index > that identified earlier.
    E.g. max(itemID) reduced from 100 to 50, then to kg_final:
        all triplets with index 51 to 100 are removed
        all indexes with values 101 and above are reduced by 50 (i.e. 101 -> 51, 102 -> 52)
    Argument
        train: train.txt:
        test: test.txt:
        kg: kg_final.txt
    Return:
        kg_final_chg.txt: Same as kg_final.txt, after removing all itemID not found in train/test.txt
    """
    import numpy as np
    
    # Train.txt
    user_dict = dict()
    inter_mat = list()
    lines = open(train, 'r').readlines()
    for l in lines:
        tmps = l.strip()
        inters = [int(i) for i in tmps.split(' ')]
        u_id, pos_ids = inters[0], inters[1:]
        pos_ids = list(set(pos_ids))
        for i_id in pos_ids:
            inter_mat.append([u_id, i_id])
        if len(pos_ids) > 0:
            user_dict[u_id] = pos_ids
    train_data = np.array(inter_mat)
    max_item_train = max(train_data[:,1])
    
    # Text.txt
    user_dict = dict()
    inter_mat = list()
    lines = open(test, 'r').readlines()
    for l in lines:
        tmps = l.strip()
        inters = [int(i) for i in tmps.split(' ')]
        u_id, pos_ids = inters[0], inters[1:]
        pos_ids = list(set(pos_ids))
        for i_id in pos_ids:
            inter_mat.append([u_id, i_id])
        if len(pos_ids) > 0:
            user_dict[u_id] = pos_ids
    test_data = np.array(inter_mat)
    max_item_test = max(test_data[:,1])
    
    # Check all indexes are valid
    has_all_index_train = np.isin(np.arange(max_item_train), train_data[:,1])
    miss_train_index = train_data[np.logical_not(has_all_index_train), 1]
    
    has_all_index_test = np.isin(np.arange(max_item_test), test_data[:,1])
    miss_test_index = test_data[np.logical_not(has_all_index_test), 1]
    
    return max_item_train, max_item_test, has_all_index_train.all(), has_all_index_test.all(), miss_train_index, miss_test_index

In [37]:
update_kg_final('train.txt', 'test.txt', 'kg_final.txt')

IndexError: boolean index did not match indexed array along dimension 0; dimension is 5023 but corresponding boolean dimension is 3490

In [18]:
import numpy as np
train = 'train.txt'
test = 'test.txt'
    
# Train.txt
user_dict = dict()
inter_mat = list()
lines = open(train, 'r').readlines()
for l in lines:
    tmps = l.strip()
    inters = [int(i) for i in tmps.split(' ')]
    u_id, pos_ids = inters[0], inters[1:]
    pos_ids = list(set(pos_ids))
    for i_id in pos_ids:
        inter_mat.append([u_id, i_id])
    if len(pos_ids) > 0:
        user_dict[u_id] = pos_ids
train_data = np.array(inter_mat)
max_item_train = max(train_data[:,1])

# Text.txt
user_dict = dict()
inter_mat = list()
lines = open(test, 'r').readlines()
for l in lines:
    tmps = l.strip()
    inters = [int(i) for i in tmps.split(' ')]
    u_id, pos_ids = inters[0], inters[1:]
    pos_ids = list(set(pos_ids))
    for i_id in pos_ids:
        inter_mat.append([u_id, i_id])
    if len(pos_ids) > 0:
        user_dict[u_id] = pos_ids
test_data = np.array(inter_mat)
max_item_test = max(test_data[:,1])

# Check all indexes are valid
has_all_index_train = np.isin(np.arange(max_item_train), train_data[:,1])
miss_train_index = np.arange(max_item_train)[np.logical_not(has_all_index_train)]
print(max_item_train, miss_train_index)

45537 []


In [6]:
np.isin(np.arange(max_item_train), train_data[:,1])

array([ True,  True,  True, False,  True])

In [22]:
a = range(1,10)
b = list(list())
print(type(a), type(b), a, b, a==b)

<class 'range'> <class 'list'> range(1, 10) [] False


In [2]:
train_data

array([[0, 0],
       [1, 1],
       [2, 2],
       [3, 3],
       [3, 4],
       [4, 4]])

In [5]:
import numpy as np
user_dict = dict()
inter_mat = list()

lines = open('train.txt', 'r').readlines()
for l in lines:
    tmps = l.strip()
    inters = [int(i) for i in tmps.split(' ')]

    u_id, pos_ids = inters[0], inters[1:]
    pos_ids = list(set(pos_ids))

    for i_id in pos_ids:
        inter_mat.append([u_id, i_id])

    if len(pos_ids) > 0:
        user_dict[u_id] = pos_ids

train_data = np.array(inter_mat)

In [8]:
max(train_data[:,1])

45537

In [5]:
def load_ratings(file_name):
    import numpy as np
    user_dict = dict()
    inter_mat = list()

    lines = open(file_name, 'r').readlines()
    for l in lines:
        tmps = l.strip()
        inters = [int(i) for i in tmps.split(' ')]

        u_id, pos_ids = inters[0], inters[1:]
        pos_ids = list(set(pos_ids))

        for i_id in pos_ids:
            inter_mat.append([u_id, i_id])

        if len(pos_ids) > 0:
            user_dict[u_id] = pos_ids
    return np.array(inter_mat), user_dict

In [14]:
def load_kg(file_name):
    import collections
    import numpy as np
    def _construct_kg(kg_np):
        kg = collections.defaultdict(list)
        rd = collections.defaultdict(list)

        for head, relation, tail in kg_np:
            kg[head].append((tail, relation))
            rd[relation].append((head, tail))
        return kg, rd

    kg_np = np.loadtxt(file_name, dtype=np.int32)
    kg_np = np.unique(kg_np, axis=0)

    # self.n_relations = len(set(kg_np[:, 1]))
    # self.n_entities = len(set(kg_np[:, 0]) | set(kg_np[:, 2]))
    n_relations = max(kg_np[:, 1]) + 1
    n_entities = max(max(kg_np[:, 0]), max(kg_np[:, 2])) + 1
    n_triples = len(kg_np)

    kg_dict, relation_dict = _construct_kg(kg_np)

    return kg_np, kg_dict, relation_dict

In [20]:
inter_mat, user_dict = load_ratings('E:/GitHub/knowledge_graph_attention_network/Data/own_data/own_train.txt')
print(inter_mat.shape)
print(inter_mat)
print(user_dict)

(10, 2)
[[0 0]
 [0 1]
 [0 2]
 [0 3]
 [1 1]
 [2 2]
 [3 3]
 [4 1]
 [4 2]
 [4 3]]
{0: [0, 1, 2, 3], 1: [1], 2: [2], 3: [3], 4: [1, 2, 3]}


In [27]:
kg_np, kg_dict, relation_dict = load_kg('E:/GitHub/knowledge_graph_attention_network/Data/own_data/own_kg.txt')

In [28]:
print(kg_np)
print(kg_dict)
print(relation_dict)

[[0 3 1]
 [0 4 2]]
defaultdict(<class 'list'>, {0: [(1, 3), (2, 4)]})
defaultdict(<class 'list'>, {3: [(0, 1)], 4: [(0, 2)]})


In [29]:
45537+45918

91455