In [4]:
import pickle
import numpy as np
import math
import itertools
import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys
from os import path
sys.path.append(path.dirname(path.dirname(path.abspath('../constants'))))
import constants.consts as consts

In [5]:
pad_token = consts.PAD_TOKEN
type_to_ix = {'person': consts.PERSON_TYPE, 'user': consts.USER_TYPE, 'song': consts.SONG_TYPE, 
              pad_token: consts.PAD_TYPE}
relation_to_ix = {'song_person': consts.SONG_PERSON_REL, 'person_song': consts.PERSON_SONG_REL, 'user_song': 
                  consts.USER_SONG_REL, 'song_user': consts.SONG_USER_REL, '#UNK_RELATION': consts.UNK_REL, 
                  '#END_RELATION': consts.END_REL, pad_token: consts.PAD_REL}


Construct mappings from entity, type, and relation to idxs

In [6]:
#entity vocab set is combination of songs, users, and persons
#currently using dicts and not id_txt files since it seemed like there were missing songs
with open('../song_data/dense_song_user_edges.dict', 'rb') as handle:
    songs1 = set(pickle.load(handle).keys())
    
with open('../song_data/dense_song_person_edges.dict', 'rb') as handle:
    songs2 = set(pickle.load(handle).keys())

with open('../song_data/dense_user_song_edges.dict', 'rb') as handle:
    users = set(pickle.load(handle).keys())
    
with open('../song_data/dense_person_song_edges.dict', 'rb') as handle:
    persons = set(pickle.load(handle).keys())

songs = songs1|songs2
entities = songs|users|persons

#if we have singe map for all entities need to ensure no duplicates across categories
#assert len(entities) == len(songs) + len(persons) + len(users)

song_to_ix = {(song, consts.SONG_TYPE): idx for idx, song in enumerate(songs)}
user_to_ix = {(user, consts.USER_TYPE): idx+len(song_to_ix) for idx, user in enumerate(users)}
person_to_ix = {(person, consts.PERSON_TYPE): idx+len(song_to_ix)+len(user_to_ix) for idx, person in enumerate(persons)}

entity_to_ix = {(song, consts.SONG_TYPE): idx for idx, song in enumerate(songs)}
entity_to_ix.update({(user, consts.USER_TYPE): idx+len(song_to_ix) for idx, user in enumerate(users)})
entity_to_ix.update({(person, consts.PERSON_TYPE): idx+len(song_to_ix)+len(user_to_ix) for idx, person in enumerate(persons)})
# entity_to_ix = {entity: idx for idx, entity in enumerate(entities)}
entity_to_ix[pad_token] = len(entity_to_ix)
    

In [7]:
print(list(song_to_ix.items())[:10])
print(list(user_to_ix.items())[:10])
print(list(person_to_ix.items())[:10])
print(len(entity_to_ix))

[(('9g01XAsID4VdZkPzjGQ+m9ukp9KZlKCzUZbHdvBIK/M=', 2), 0), (('DHV6UsyY/SaG9sQJSMX8wp88lrzgtE/YZNqTljmVHxs=', 2), 1), (('Fb8+EVTFkAVdR7XZO3szbDMWodYnmCw/CVOajQ4nUk0=', 2), 2), (('v38wgNSD4Da1Ha4fnbRDbxf0K4GP0By5z7iMi8gWUg0=', 2), 3), (('2cm7JlohWjfq9ASqD76I3GPzguAXXPFJj7sHJtKUIbc=', 2), 4), (('OiY3sRquCoAuZzdtxLvnIdE8+Uz2vetQIx8z/stGRkA=', 2), 5), (('iuydOcKLh7ee7hL6+WEqc+XJ0TL08y/ft0HsDOP7Tv0=', 2), 6), (('VbEEN71Ph6HiY3k9hXXiVH4sBHv2SLaUl18P7Yyq/fg=', 2), 7), (('NsXl1RDjES8dOOu5LInbB/xoWI3bNhzYCYyRM5iJeg0=', 2), 8), (('hunillniveXl9KpspTBJibHG3N2tqxpmMlu5GY6+DkU=', 2), 9)]
[(('hLW9bmvBcywLyucaDKDZlKXvHDmC0UWtdd4jHwY37OU=', 1), 224214), (('YMQmW0gyc1CYR1aLrw4vAomobw3ZYHW8N2Sed+w+15k=', 1), 224215), (('NGBFuLGTCYZyIikX2bKpI2Z+zO7ILujTyuF5YUOqFzQ=', 1), 224216), (('k6RlVKRSRiifGsD+57TN2JeDvcbztXhgx8Uko6764t8=', 1), 224217), (('c1jkesN7viQZmVxXt8mBv7bMGiasmKupwVWK4B9JPSY=', 1), 224218), (('KXS4mDTuweetiTTxgkbg8uohB20KUXvkXz6g4l3r40c=', 1), 224219), (('pEpFpxyq3HMPhnigrVONiQjDODv55gG1AfvBQ

In [5]:
print(len(songs) + len(persons) + len(users))
print(len(entity_to_ix))

262653
262654


In [6]:
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in entities)
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in persons)
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in users)
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in songs)

True
False
False
True


Construct inverse idx mappings

In [7]:
ix_to_type = {v: k for k, v in type_to_ix.items()}
ix_to_relation = {v: k for k, v in relation_to_ix.items()}
ix_to_entity = {v: k for k, v in entity_to_ix.items()}

Save idx mappings as .dict files

In [8]:
with open('type_to_ix.dict', 'wb') as handle:
    pickle.dump(type_to_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('relation_to_ix.dict', 'wb') as handle:
    pickle.dump(relation_to_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('entity_to_ix.dict', 'wb') as handle:
    pickle.dump(entity_to_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('ix_to_type.dict', 'wb') as handle:
    pickle.dump(ix_to_type, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('ix_to_relation.dict', 'wb') as handle:
    pickle.dump(ix_to_relation, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('ix_to_entity.dict', 'wb') as handle:
    pickle.dump(ix_to_entity, handle, protocol=pickle.HIGHEST_PROTOCOL)

Construct relation and type dictionaries replacing names with ids

In [9]:
def convert_to_ids(rel_dict, start_type, end_type):
    new_rel = {}
    for key,values in rel_dict.items():
        key_id = entity_to_ix[(key, start_type)]
        value_ids = []
        for val in values:
            value_ids.append(entity_to_ix[(val, end_type)])
        new_rel[key_id] = value_ids
    
    return new_rel

In [11]:
print(1)
with open('../song_data/dense_song_user_edges.dict', 'rb') as handle:
    song_user = pickle.load(handle)
    song_user_ix = convert_to_ids(song_user, consts.SONG_TYPE, consts.USER_TYPE)

print(1)
with open('../song_data/dense_user_song_edges.dict', 'rb') as handle:
    user_song = pickle.load(handle)
    user_song_ix = convert_to_ids(user_song, consts.USER_TYPE, consts.SONG_TYPE)

print(1)
with open('../song_data/dense_song_person_edges.dict', 'rb') as handle:
    song_person = pickle.load(handle)
    song_person_ix = convert_to_ids(song_person, consts.SONG_TYPE, consts.PERSON_TYPE)

print(1)
with open('../song_data/dense_person_song_edges.dict', 'rb') as handle:
    person_song = pickle.load(handle)
    person_song_ix = convert_to_ids(person_song, consts.PERSON_TYPE, consts.SONG_TYPE)

1
1
1
1


In [12]:
#Convert entity to id dict to entity_ix to id dict
with open('../song_data/id_type.dict', 'rb') as handle:
    entity_type = pickle.load(handle)
    entity_ix_type = {}
    for key in entities:
        value = entity_type[key]
        key_id = entity_to_ix[(key, int(value))]    
        entity_ix_type[key_id] = value

In [None]:
with open('song_user_ix.dict', 'wb') as handle:
    pickle.dump(song_user_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_ix.dict', 'wb') as handle:
    pickle.dump(user_song_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('song_person_ix.dict', 'wb') as handle:
    pickle.dump(song_person_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('person_song_ix.dict', 'wb') as handle:
    pickle.dump(person_song_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('entity_ix_type.dict', 'wb') as handle:
    pickle.dump(entity_ix_type, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# convert training test pos neg dataset

In [13]:
with open('../song_data/song_user_train_dense.dict', 'rb') as handle:
    song_user_train = pickle.load(handle)
    song_user_train_ix = convert_to_ids(song_user_train, consts.SONG_TYPE, consts.USER_TYPE)
with open('../song_data/song_user_test_dense.dict', 'rb') as handle:
    song_user_test = pickle.load(handle)
    song_user_test_ix = convert_to_ids(song_user_test, consts.SONG_TYPE, consts.USER_TYPE)
with open('../song_data/user_song_train_dense.dict', 'rb') as handle:
    user_song_train = pickle.load(handle)
    user_song_train_ix = convert_to_ids(user_song, consts.USER_TYPE, consts.SONG_TYPE)
with open('../song_data/user_song_test_dense.dict', 'rb') as handle:
    user_song_test = pickle.load(handle)
    user_song_test_ix = convert_to_ids(user_song_test, consts.USER_TYPE, consts.SONG_TYPE)

In [14]:
with open('song_user_train_ix.dict', 'wb') as handle:
    pickle.dump(song_user_train_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('song_user_test_ix.dict', 'wb') as handle:
    pickle.dump(song_user_test_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_train_ix.dict', 'wb') as handle:
    pickle.dump(user_song_train_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_test_ix.dict', 'wb') as handle:
    pickle.dump(user_song_test_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
def convert_to_ids_tuples(tuples, start_type, end_type):
    new_tuples = []
    for user,song in tuples:
        user_id = entity_to_ix[(user, start_type)]
        song_id = entity_to_ix[(song, end_type)]
        new_tuples.append((user_id, song_id))
        
    return new_tuples

with open('../song_data/user_song_tuple_train_pos_dense.txt', 'rb') as handle:
    user_song_tuple_train_pos = pickle.load(handle)
    user_song_tuple_train_pos_ix = convert_to_ids_tuples(user_song_tuple_train_pos, consts.USER_TYPE, consts.SONG_TYPE)
with open('../song_data/user_song_tuple_test_pos_dense.txt', 'rb') as handle:
    user_song_tuple_test_pos = pickle.load(handle)
    user_song_tuple_test_pos_ix = convert_to_ids_tuples(user_song_tuple_test_pos, consts.USER_TYPE, consts.SONG_TYPE)
with open('../song_data/user_song_tuple_train_neg_dense.txt', 'rb') as handle:
    user_song_tuple_train_neg = pickle.load(handle)
    user_song_tuple_train_neg_ix = convert_to_ids_tuples(user_song_tuple_train_neg, consts.USER_TYPE, consts.SONG_TYPE)
with open('../song_data/user_song_tuple_test_neg_dense.txt', 'rb') as handle:
    user_song_tuple_test_neg = pickle.load(handle)
    user_song_tuple_test_neg_ix = convert_to_ids_tuples(user_song_tuple_test_neg, consts.USER_TYPE, consts.SONG_TYPE)

In [16]:
with open('user_song_tuple_train_pos_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_train_pos_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_tuple_train_neg_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_train_neg_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_tuple_test_pos_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_test_pos_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_tuple_test_neg_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_test_neg_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)