In [None]:
import pandas as pd
import pickle
import numpy as np
import math
import itertools
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell, not only
# the last one, so a single cell can show several previews.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Raw input tables, read from the current working directory.
# Columns used further down:
#   songs: song_id, artist_name, composer, lyricist
#   train: msno (the user id), song_id
songs = pd.read_csv("songs.csv")
train = pd.read_csv("train.csv")

In [None]:
# Integer codes identifying the kind of entity; they are the values stored in id_type.dict.
person_type_idx = 0
user_type_idx = 1
song_type_idx = 2

## Serialize into dict & arrays

In [None]:
# song_person.dict & person_song.dict
# dict where key = song_id, value = list of persons (artists, composers, lyricists) of the song
def make_person_list(row):
    """Collect the distinct people credited on one song.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. one row of ``songs``) that
        provides 'artist_name', 'composer' and 'lyricist'. Each value is either
        a '|'-separated string of names or a missing value.

    Returns
    -------
    list of str
        The whitespace-stripped, de-duplicated names from the three columns,
        in sorted order so the result is identical from run to run.
    """
    person_set = set()
    for column in ('artist_name', 'composer', 'lyricist'):
        value = row[column]
        # Missing cells arrive as float NaN; skip anything that is not text.
        if not isinstance(value, str):
            continue
        person_set.update(name.strip() for name in value.split('|'))
    # A set has no stable order for strings across interpreter runs; sort it.
    return sorted(person_set)

# One list of credited persons per song, computed row by row.
credit_columns = songs[['song_id', 'artist_name', 'composer', 'lyricist']]
person_list = credit_columns.apply(make_person_list, axis=1)

# Two-column frame (song_id, person_list); the person_song cell below iterates over it.
song_person = pd.concat([songs['song_id'], person_list], axis=1)
song_person.columns = ['song_id', 'person_list']

# song_id -> list of persons; a repeated song_id keeps its last row.
song_person_dict = dict(zip(song_person['song_id'], song_person['person_list']))
with open('song_person.dict', 'wb') as handle:
    pickle.dump(song_person_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# dict where key = a person, value = list of songs related to this person
person_song_dict = {}
# Walk the two columns in lockstep rather than DataFrame.iterrows(), which
# materialises a Series for every row and is very slow on a large table.
for related_song, credited in zip(song_person['song_id'], song_person['person_list']):
    for credited_name in credited:
        # Songs are appended in row order, so each list follows the order of `songs`.
        person_song_dict.setdefault(credited_name, []).append(related_song)
with open('person_song.dict', 'wb') as handle:
    pickle.dump(person_song_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# id_type.dict
# dict where key = entity id, value = entity type (always a plain int)
# person_id: the set of artist_name, composer, lyricist of all songs
# type index: person = 0, user = 1, song = 2

# Songs: union of the ids found in songs and in train, sorted and unique, as an (N, 1) column.
song_id_songs = songs['song_id'].drop_duplicates().values #numpy.ndarray
song_id_train = train['song_id'].drop_duplicates().values #numpy.ndarray
song_id = np.expand_dims(np.unique(np.hstack([song_id_songs, song_id_train])), axis=1)

# Users: every distinct msno in train, in order of first appearance, as an (N, 1) column.
user_id = np.expand_dims(train['msno'].drop_duplicates().values, axis=1)

# Persons: split every non-missing credit cell on '|' and strip surrounding whitespace.
person_set = set()
for credit_column in ('artist_name', 'composer', 'lyricist'):
    for pseudo_person in songs[credit_column].dropna().unique():
        for credited_name in pseudo_person.split('|'):
            person_set.add(credited_name.strip())
# Sorted so the array is reproducible across runs (set order is not).
person_arr = np.array(sorted(person_set))
# expand_dims returns a view, so person_id keeps sharing memory with person_arr;
# the person_id.txt cell further down uses both names.
person_id = np.expand_dims(person_arr, axis=1)

# Build the mapping directly from the id columns. Gluing a text id column to an
# int type column with np.concatenate lets NumPy coerce the int codes to text,
# which stored the string '0' for persons while songs and users got ints.
# Insertion order is songs, users, persons, so on a key collision the later kind wins.
id_type = {}
for id_column, type_idx in (
    (song_id, song_type_idx),
    (user_id, user_type_idx),
    (person_id, person_type_idx),
):
    id_type.update(dict.fromkeys(id_column[:, 0].tolist(), type_idx))
with open('id_type.dict', 'wb') as handle:
    pickle.dump(id_type, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Sanity check: prints True/False for whether this hard-coded id is a key of id_type.
# NOTE(review): which kind of entity this id belongs to is not visible here — confirm.
print('6uYPDXZJGYhq6WDg35xDLMB0Z46Rw3Y0XTLik5F/w9c=' in id_type)

In [None]:
# person_id.txt
# pickled numpy array of sorted unique person_ids, stored as an (N, 1) column
# Sort explicitly and rebuild the column from the sorted array instead of
# sorting person_arr in place and relying on person_id being a view of it.
# np.sort returns a copy, so re-running this cell is harmless.
person_arr = np.sort(person_arr)
person_id = np.expand_dims(person_arr, axis=1)
# NOTE(review): song_id.txt and user_id.txt hold 1-D arrays while this one is (N, 1);
# the shape is kept as-is for existing readers — confirm that is what they expect.
with open('person_id.txt', 'wb') as handle:
    pickle.dump(person_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# song_user.dict
# maps each song_id to the list of user_ids taken from the train rows of that song
# (the msno column is the user_id)
users_by_song = train.groupby('song_id')['msno'].apply(list)
song_user = users_by_song.to_dict()
with open('song_user.dict', 'wb') as handle:
    pickle.dump(song_user, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# user_song.dict
# maps each user_id (msno) to the list of song_ids taken from the train rows of that user
songs_by_user = train.groupby('msno')['song_id'].apply(list)
user_song = songs_by_user.to_dict()
with open('user_song.dict', 'wb') as handle:
    pickle.dump(user_song, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# song_id.txt
# numpy array of sorted unique song_ids (taken from songs only)
# np.sort returns a sorted copy, so this never sorts a pandas-owned buffer in
# place and still works when `.values` hands back a read-only array
# (as it does under pandas Copy-on-Write).
# NOTE(review): id_type.dict also contains song ids that occur only in train;
# this file does not — confirm that difference is intended.
song_id = np.sort(songs['song_id'].drop_duplicates().values)
with open('song_id.txt', 'wb') as handle:
    pickle.dump(song_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# user_id.txt
# numpy array holding each user_id (msno) from train exactly once, in ascending order
distinct_users = train['msno'].drop_duplicates()
user_id = distinct_users.sort_values().values
with open('user_id.txt', 'wb') as handle:
    pickle.dump(user_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# user_song_tuple.txt
# pickled list of [user_id, song_id] pairs (a list of two-element lists) sorted in the order of user_id
# Take the pairs straight from the frame. Rendering it with to_string() and
# re-splitting the text on whitespace is slow, breaks on any value containing
# whitespace, and returns header text instead of pairs for an empty frame.
# Values keep the type they have in `train` (strings for the ids used here).
user_song_tuple = train[['msno', 'song_id']].sort_values(by='msno').values.tolist()
with open('user_song_tuple.txt', 'wb') as handle:
    pickle.dump(user_song_tuple, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Deserialize checks (preview of output format)

In [None]:
# Number of entries shown per preview; the pickled dicts can be very large and
# rendering them whole as cell output is impractical.
PREVIEW_SIZE = 5


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # pickle.load can execute arbitrary code: only use it on files written by this notebook.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _preview_multi(mapping, limit=PREVIEW_SIZE):
    """Return up to `limit` entries of `mapping` whose list value has more than one element."""
    multi_valued = ((k, v) for k, v in mapping.items() if len(v) > 1)
    return dict(itertools.islice(multi_valued, limit))


unserialized_song_person = _load_pickle('song_person.dict')
_preview_multi(unserialized_song_person) # to show that the list is working

unserialized_person_song = _load_pickle('person_song.dict')
_preview_multi(unserialized_person_song) # to show that the list is working

unserialized_id_type = _load_pickle('id_type.dict')
dict(itertools.islice(unserialized_id_type.items(), 3))

unserialized_song_user = _load_pickle('song_user.dict')
unserialized_song_user['BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik='] # one instance
_preview_multi(unserialized_song_user) # to show that the list is working

unserialized_song_id = _load_pickle('song_id.txt')
unserialized_song_id[:5]

unserialized_user_song_tuple = _load_pickle('user_song_tuple.txt')
unserialized_user_song_tuple[:5]