In [35]:
import pandas as pd
import pickle
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not just the last
# one; the deserialize-check cell relies on this to show several previews.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Only the first TRAIN_SAMPLE_ROWS rows of train.csv are used by this notebook.
TRAIN_SAMPLE_ROWS = 100

songs = pd.read_csv("songs.csv")
# nrows stops parsing after the sample instead of reading the whole file and
# slicing the resulting frame afterwards.
train = pd.read_csv("train.csv", nrows=TRAIN_SAMPLE_ROWS)
members = pd.read_csv("members.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [37]:
# Integer codes stored as the values of the id -> entity-type dict.
person_type_idx, user_type_idx, song_type_idx = 0, 1, 2

## Serialize into dict & arrays

In [4]:
# song_user: maps each song_id to the list of user ids (msno) on its train rows.
# Grouping by the column directly gives the same groups as indexing by it first.
users_per_song = train.groupby('song_id')['msno'].apply(list)
song_user = users_per_song.to_dict()
with open('song_user.dict', 'wb') as out_file:
    pickle.dump(song_user, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# user_song: maps each user id (msno) to the list of song_ids on its train rows.
# Grouping by the column directly gives the same groups as indexing by it first.
songs_per_user = train.groupby('msno')['song_id'].apply(list)
user_song = songs_per_user.to_dict()
with open('user_song.dict', 'wb') as out_file:
    pickle.dump(user_song, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# numpy array of sorted song_ids (one entry per train row, duplicates kept)
# np.sort returns a sorted copy. Sorting the array returned by .values in place
# can reorder train's own song_id column (it may be a view of the frame's data),
# which silently breaks the song_id/msno pairing for every later cell.
song_id = np.sort(train['song_id'].values)
with open('song_id.txt', 'wb') as handle:
    pickle.dump(song_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# numpy array of sorted user_ids (one entry per train row, duplicates kept)
# Take the whole msno column: indexing the 2-D .values of a one-column frame
# with [0] kept only the first row, i.e. a single user id.
user_id = train['msno'].sort_values().values
with open('user_id.txt', 'wb') as handle:
    pickle.dump(user_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# list of [user_id, song_id] pairs sorted in the order of user_id
pairs_by_user = train[['msno', 'song_id']].sort_values(by='msno')
# Read the pairs straight from the frame instead of rendering it to text and
# splitting on whitespace: this keeps ids intact even if they contain spaces
# and does not depend on pandas display formatting.
# Note: missing values stay as float NaN here rather than the string 'NaN'.
user_song_tuple = pairs_by_user.values.tolist()
with open('user_song_tuple.txt', 'wb') as handle:
    pickle.dump(user_song_tuple, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [64]:
# dict where key = entity id, value = entity type index
# Entity ids are: song_id and msno values from train, and artist_name values
# from songs (used as the person id).
#
# The dict is filled directly from the columns, so the sorted song_id / user_id
# arrays defined by earlier cells are no longer rebound to something else here.
# Insertion order song -> user -> person is kept so that, if the same key shows
# up in more than one column, the later type wins.
id_type = {}
id_type.update(dict.fromkeys(train['song_id'].values, song_type_idx))
id_type.update(dict.fromkeys(train['msno'].values, user_type_idx))
id_type.update(dict.fromkeys(songs['artist_name'].values, person_type_idx))

with open('id_type.dict', 'wb') as handle:
    pickle.dump(id_type, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Show the size and a few entries only; printing the whole dict floods the
# notebook output with one line per song, user and artist.
len(id_type)
[item for item, _ in zip(id_type.items(), range(5))]

## Deserialize checks (preview of output format)

In [9]:
# Reload a few of the pickles written above and preview their contents.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook produced itself.

# song_user.dict is the song_id -> [user_id, ...] mapping written above. The
# previous file name, 'song_person.dict', is not written by any cell shown in
# this notebook, so the check failed on a fresh run.
with open('song_user.dict', 'rb') as handle:
    unserialized_song_user = pickle.load(handle)
unserialized_song_user['BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik='] # one instance
{k: v for k, v in unserialized_song_user.items() if len(v) > 1} # to show that the list is working

with open('song_id.txt', 'rb') as handle:
    unserialized_song_id = pickle.load(handle)
unserialized_song_id
type(unserialized_song_id)

with open('user_song_tuple.txt', 'rb') as handle:
    unserialized_user_song_tuple = pickle.load(handle)
unserialized_user_song_tuple[:6]

['FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=']

{'u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=': ['TJU0Gfvy7FB+r89bWovPKXTjuApTCiv3xg/tt5shR78=',
  '0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=']}

array(['+Sm75wnBf/sjm/QMUAFx8N+Ae04kWCXGlgH50tTeM6c=',
       '+u3iqLNaEoGlLwRbnbU0kM3CjctokMh1MPRFLrrzuQQ=',
       '/2CWoqYh5/DUNi48voCXqA9V347liPy2WpEJj6CZro0=',
       '/56EZoEb7TAm0G8GLJpA2Uzgg6QiJcYDwvI08I+w8JA=',
       '/bU6IRSK+YNlNbaTkxo7bhsb2EDLPrnksdX3ggcZNhI=',
       '0cHV60MVnGOKMb/GRHWEExfpahkCwIZQ0wxdYBJfoqE=',
       '1R5HlX765HuV3uvLmJ0FqUQmlAmjdcqF0GbOqf9qUGs=',
       '1ysr+JKN0mCCeW0UqWIj2Vlxz3N5aQOEW5mkSW/OBPA=',
       '20e8yHxKn8p+G2zFcxC/3UDM5rYFCIoozlTCUB0YumU=',
       '2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=',
       '2bj5oqCPPzY6E0TPgwySkfj8/l/c+DVQBqnABx0qPSk=',
       '2fhzNUTd3X1n8IKBAd0xM2HwOuDIc0Qb6ZcVQ3OmqX4=',
       '34IXh04bdV42ClAF+3lHsAjNQEA9yljeyamS6+Wv1BQ=',
       '3DU6F6k6dFSdoQa2tsia5spMXFCxWh4JJdA3OxSo3rM=',
       '3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=',
       '3RnXpHSbz61HoSCJEB7B9Sn3bqKPUvgYVwM8ceOEd1M=',
       '3W6VBSN09a/Vh9w1Fwmdus4H4xxmde/yu0RBgMcTtZM=',
       '3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=',
       '4Z

numpy.ndarray

[['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'MM+huiuccu4p5mNVhWxWgJreIsRWuEmvbvpCu0q79XM='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'K2jkXlozk/+HPjs1RyDlBUHJx2Ky+ITavwBXPcFqtHY='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'L6w2d0w84FjTvFr+BhMfgu7dZAsGiOqUGmvvxIG3gvQ='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'LBFRdBacp7Ob6Ada2N7uCQ47DhiNT2KjUtwOA7oSm7I='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'NDtSbLpLMkWyBAhfRimU73D/pKVWE6Z3Q2IEdmFuWNk='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'P6dllZw/ZnO7qcufxOKHAXGwlCvkmm4djJT/82d3ihA=']]