In [15]:
import pickle
import numpy as np
import math
import itertools
import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [16]:
pad_token = '#PAD_TOKEN'
type_to_ix = {'person': 0, 'user': 1, 'song': 2, pad_token: 3}
relation_to_ix = {'song_person': 0, 'person_song': 1, 'user_song': 
                  2, 'song_user': 3, '#UNK_RELATION': 4, '#END_RELATION': 5, pad_token: 6}

PERSON_TYPE = 0
USER_TYPE = 1
SONG_TYPE = 2

Construct mappings from entity, type, and relation to idxs

In [17]:
#entity vocab set is combination of songs, users, and persons
#currently using dicts and not id_txt files since it seemed like there were missing songs
with open('../song_data/dense_song_user_edges.dict', 'rb') as handle:
    songs1 = set(pickle.load(handle).keys())
    
with open('../song_data/dense_song_person_edges.dict', 'rb') as handle:
    songs2 = set(pickle.load(handle).keys())

with open('../song_data/dense_user_song_edges.dict', 'rb') as handle:
    users = set(pickle.load(handle).keys())
    
with open('../song_data/dense_person_song_edges.dict', 'rb') as handle:
    persons = set(pickle.load(handle).keys())

songs = songs1|songs2
entities = songs|users|persons

#if we have singe map for all entities need to ensure no duplicates across categories
#assert len(entities) == len(songs) + len(persons) + len(users)

song_to_ix = {(song, SONG_TYPE): idx for idx, song in enumerate(songs)}
user_to_ix = {(user, USER_TYPE): idx+len(song_to_ix) for idx, user in enumerate(users)}
person_to_ix = {(person, PERSON_TYPE): idx+len(song_to_ix)+len(user_to_ix) for idx, person in enumerate(persons)}

entity_to_ix = {(song, SONG_TYPE): idx for idx, song in enumerate(songs)}
entity_to_ix.update({(user, USER_TYPE): idx+len(song_to_ix) for idx, user in enumerate(users)})
entity_to_ix.update({(person, PERSON_TYPE): idx+len(song_to_ix)+len(user_to_ix) for idx, person in enumerate(persons)})
# entity_to_ix = {entity: idx for idx, entity in enumerate(entities)}
entity_to_ix[pad_token] = len(entity_to_ix)
    

In [18]:
print(list(song_to_ix.items())[:10])
print(list(user_to_ix.items())[:10])
print(list(person_to_ix.items())[:10])
print(len(entity_to_ix))

[(('j5LGMfs4plGaq/D5+SL0LMjacBoUqalCy0EMgFcgMlc=', 2), 0), (('tjEAVmOfXQ8WMdYIrZS+R0g1pqIkAVDyNJopcS2JYb0=', 2), 1), (('YYE8FWKKw4SUWh5D6MT5scCzqzM1A06xu4cn5PS5Qmk=', 2), 2), (('rjGMmJQdMqSXYsBQqBIep5R6/EuSqOOw7esglyYTMvY=', 2), 3), (('Sn58k5eJwU+7S2ly9zh6Y76ij/+XRuO8qKPNoSNUQxg=', 2), 4), (('kRMRCgV7KrL58jWquyAWnG86og5OvdJPZm8QjzcMQbg=', 2), 5), (('h6skjx9MLDncJbnJO78z+1BNMlNb4lbHFwB3bb3IIXo=', 2), 6), (('KiMWEQZ8bRK2muIqMco7D+InjtRjhq0snyWZUzVtgq0=', 2), 7), (('LI7fY0r1sF1vWupMTikBOf1XVvdzGuEpO4z/25mR4sM=', 2), 8), (('N1FzNcXRVM56iEVz4ebZqXOjMdq5SKh+fDiCHCCVT3E=', 2), 9)]
[(('8ObZ/AgRUoSDPNFXyJI+0ZFiiIbIkF071v3pblsq15Y=', 1), 224214), (('WtuOpsYQ8St9zt7yySdryqDUCxeT4/v2rLRA7+lsahQ=', 1), 224215), (('QI93Z+LZKpTMf9FiVRH6ofB8mM1niwlqP4SOIi11Nyo=', 1), 224216), (('Pd6ojPIwyQr9kVrW0//vMGDYVs+BIyUaLBY1SkcM6dM=', 1), 224217), (('jbV+lsBsMOOrGuh7x3jME4IsgVa1ia0ousrp2xeKCF4=', 1), 224218), (('jngE9C59J4eDq9zTJA/xjsXZceghW7Mb2g1RHFHtNFY=', 1), 224219), (('PwNcK9pmv+Z246XF98nPW9ID/9xvqEzCh6IG/

In [19]:
print(len(songs) + len(persons) + len(users))
print(len(entity_to_ix))

262653
262654


In [20]:
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in entities)
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in persons)
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in users)
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in songs)

True
False
False
True


Construct inverse idx mappings

In [21]:
ix_to_type = {v: k for k, v in type_to_ix.items()}
ix_to_relation = {v: k for k, v in relation_to_ix.items()}
ix_to_entity = {v: k for k, v in entity_to_ix.items()}

Save idx mappings as .dict files

In [22]:
with open('type_to_ix.dict', 'wb') as handle:
    pickle.dump(type_to_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('relation_to_ix.dict', 'wb') as handle:
    pickle.dump(relation_to_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('entity_to_ix.dict', 'wb') as handle:
    pickle.dump(entity_to_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('ix_to_type.dict', 'wb') as handle:
    pickle.dump(ix_to_type, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('ix_to_relation.dict', 'wb') as handle:
    pickle.dump(ix_to_relation, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('ix_to_entity.dict', 'wb') as handle:
    pickle.dump(ix_to_entity, handle, protocol=pickle.HIGHEST_PROTOCOL)

Construct relation and type dictionaries replacing names with ids

In [23]:
def convert_to_ids(rel_dict, start_type, end_type):
    new_rel = {}
    for key,values in rel_dict.items():
        key_id = entity_to_ix[(key, start_type)]
        value_ids = []
        for val in values:
            value_ids.append(entity_to_ix[(val, end_type)])
        new_rel[key_id] = value_ids
    
    return new_rel

In [24]:
print(1)
with open('../song_data/dense_song_user_edges.dict', 'rb') as handle:
    song_user = pickle.load(handle)
    song_user_ix = convert_to_ids(song_user, SONG_TYPE, USER_TYPE)

print(1)
with open('../song_data/dense_user_song_edges.dict', 'rb') as handle:
    user_song = pickle.load(handle)
    user_song_ix = convert_to_ids(user_song, USER_TYPE, SONG_TYPE)

print(1)
with open('../song_data/dense_song_person_edges.dict', 'rb') as handle:
    song_person = pickle.load(handle)
    song_person_ix = convert_to_ids(song_person, SONG_TYPE, PERSON_TYPE)

print(1)
with open('../song_data/dense_person_song_edges.dict', 'rb') as handle:
    person_song = pickle.load(handle)
    person_song_ix = convert_to_ids(person_song, PERSON_TYPE, SONG_TYPE)

1
1
1
1


In [25]:
#Convert entity to id dict to entity_ix to id dict
with open('../song_data/id_type.dict', 'rb') as handle:
    entity_type = pickle.load(handle)
    entity_ix_type = {}
    for key in entities:
        value = entity_type[key]
        key_id = entity_to_ix[(key, int(value))]    
        entity_ix_type[key_id] = value

KeyError: 'a0S959XXxdHq02RRyJIYYysamjxwJNmoCnkadrWVrjA='

In [26]:
with open('song_user_ix.dict', 'wb') as handle:
    pickle.dump(song_user_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_ix.dict', 'wb') as handle:
    pickle.dump(user_song_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('song_person_ix.dict', 'wb') as handle:
    pickle.dump(song_person_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('person_song_ix.dict', 'wb') as handle:
    pickle.dump(person_song_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('entity_ix_type.dict', 'wb') as handle:
    pickle.dump(entity_ix_type, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
# convert training test pos neg dataset

In [31]:
with open('../song_data/song_user_train_dense.dict', 'rb') as handle:
    song_user_train = pickle.load(handle)
    song_user_train_ix = convert_to_ids(song_user_train, SONG_TYPE, USER_TYPE)
with open('../song_data/song_user_test_dense.dict', 'rb') as handle:
    song_user_test = pickle.load(handle)
    song_user_test_ix = convert_to_ids(song_user_test, SONG_TYPE, USER_TYPE)
with open('../song_data/user_song_train_dense.dict', 'rb') as handle:
    user_song_train = pickle.load(handle)
    user_song_train_ix = convert_to_ids(user_song, USER_TYPE, SONG_TYPE)
with open('../song_data/user_song_test_dense.dict', 'rb') as handle:
    user_song_test = pickle.load(handle)
    user_song_test_ix = convert_to_ids(user_song_test, USER_TYPE, SONG_TYPE)

In [32]:
with open('song_user_train_ix.dict', 'wb') as handle:
    pickle.dump(song_user_train_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('song_user_test_ix.dict', 'wb') as handle:
    pickle.dump(song_user_test_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_train_ix.dict', 'wb') as handle:
    pickle.dump(user_song_train_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_test_ix.dict', 'wb') as handle:
    pickle.dump(user_song_test_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
def convert_to_ids_tuples(tuples, start_type, end_type):
    new_tuples = []
    for user,song in tuples:
        user_id = entity_to_ix[(user, start_type)]
        song_id = entity_to_ix[(song, end_type)]
        new_tuples.append((user_id, song_id))
        
    return new_tuples

with open('../song_data/user_song_tuple_train_pos_dense.txt', 'rb') as handle:
    user_song_tuple_train_pos = pickle.load(handle)
    user_song_tuple_train_pos_ix = convert_to_ids_tuples(user_song_tuple_train_pos, USER_TYPE, SONG_TYPE)
with open('../song_data/user_song_tuple_test_pos_dense.txt', 'rb') as handle:
    user_song_tuple_test_pos = pickle.load(handle)
    user_song_tuple_test_pos_ix = convert_to_ids_tuples(user_song_tuple_test_pos, USER_TYPE, SONG_TYPE)
with open('../song_data/user_song_tuple_train_neg_dense.txt', 'rb') as handle:
    user_song_tuple_train_neg = pickle.load(handle)
    user_song_tuple_train_neg_ix = convert_to_ids_tuples(user_song_tuple_train_neg, USER_TYPE, SONG_TYPE)
with open('../song_data/user_song_tuple_test_neg_dense.txt', 'rb') as handle:
    user_song_tuple_test_neg = pickle.load(handle)
    user_song_tuple_test_neg_ix = convert_to_ids_tuples(user_song_tuple_test_neg, USER_TYPE, SONG_TYPE)

In [34]:
with open('user_song_tuple_train_pos_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_train_pos_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_tuple_train_neg_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_train_neg_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_tuple_test_pos_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_test_pos_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_song_tuple_test_neg_ix.txt', 'wb') as handle:
    pickle.dump(user_song_tuple_test_neg_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)