In [3]:
import pandas as pd
import pickle
import numpy as np
import math
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
songs = pd.read_csv("songs.csv")[:100]
train = pd.read_csv("train.csv")[:100]
members = pd.read_csv("members.csv")
sample_submission = pd.read_csv("sample_submission.csv")

## Serialize into dict & arrays

In [16]:
# dict where key = song_id, value = list of user_ids
song_user = train[['song_id', 'msno']].set_index('song_id').groupby('song_id')['msno'].apply(list).to_dict()
# msno is the user_id
with open('song_user.dict', 'wb') as handle:
    pickle.dump(song_user, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# dict where key = user_id, value = list of song_ids
user_song = train[['msno', 'song_id']].set_index('msno').groupby('msno')['song_id'].apply(list).to_dict()
with open('user_song.dict', 'wb') as handle:
    pickle.dump(user_song, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# numpy array of sorted song_ids
song_id = train['song_id'].values
song_id.sort()
with open('song_id.txt', 'wb') as handle:
    pickle.dump(song_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# numpy array of sorted user_ids
user_id = train[['msno']].sort_values(by='msno').values[0]
with open('user_id.txt', 'wb') as handle:
    pickle.dump(user_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# numpy array of [user_id, song_id] pairs sorted in the order of user_id
user_song_tuple = train[['msno', 'song_id']].sort_values(by='msno').to_string(header=False, index=False, index_names=False).split('\n')
# user_song_tuple = '\n'.join(['\t'.join(row.split()) for row in user_song_tuple]) 
user_song_tuple = [row.split() for row in user_song_tuple]
with open('user_song_tuple.txt', 'wb') as handle:
    pickle.dump(user_song_tuple, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# dict where key = song_id, value = list of persons (artists, composers, lyricists) of the song
def make_person_list(row):
    person_set = set()
    for x in row['artist_name'].split('|'):
        person_set.add(x.strip())
    if not isinstance(row['composer'], float):
        for x in row['composer'].split('|'):
            person_set.add(x.strip())
    if not isinstance(row['lyricist'], float):
        for x in row['composer'].split('|'):
            person_set.add(x.strip())
    return list(person_set)

person = songs[['song_id','artist_name', 'composer', 'lyricist']]
person_list = person.apply(lambda x: make_person_list(x), axis=1)
song_person = pd.concat([songs['song_id'], person_list], axis=1)
song_person.columns=['song_id', 'person_list']
song_person_dict = song_person.set_index('song_id')['person_list'].to_dict()
with open('song_person.dict', 'wb') as handle:
    pickle.dump(song_person_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Deserialize checks (preview of output format)

In [18]:
with open('song_user.dict', 'rb') as handle:
    unserialized_song_user = pickle.load(handle)
unserialized_song_user['BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik='] # one instance
{k: v for k, v in unserialized_song_user.items() if len(v) >1} # to show that the list is working

with open('song_id.txt', 'rb') as handle:
    unserialized_song_id = pickle.load(handle)
unserialized_song_id[:5]

with open('user_song_tuple.txt', 'rb') as handle:
    unserialized_user_song_tuple = pickle.load(handle)
unserialized_user_song_tuple[:5]

with open('song_person.dict', 'rb') as handle:
    unserialized_song_person = pickle.load(handle)
{k: v for k, v in unserialized_song_person.items() if len(v) >1} # to show that the list is working

['Vgeu+u3vXE0FhQtG/Vr3I/U3V0TX/jzQAEBhi3S3qi0=']

{'u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=': ['rAI+tZ85/S4CEHBEhUkHDLO1afqVLArCu0nsBmORwfQ=',
  'rAI+tZ85/S4CEHBEhUkHDLO1afqVLArCu0nsBmORwfQ=']}

array(['+Sm75wnBf/sjm/QMUAFx8N+Ae04kWCXGlgH50tTeM6c=',
       '+u3iqLNaEoGlLwRbnbU0kM3CjctokMh1MPRFLrrzuQQ=',
       '/2CWoqYh5/DUNi48voCXqA9V347liPy2WpEJj6CZro0=',
       '/56EZoEb7TAm0G8GLJpA2Uzgg6QiJcYDwvI08I+w8JA=',
       '/bU6IRSK+YNlNbaTkxo7bhsb2EDLPrnksdX3ggcZNhI='], dtype=object)

[['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'MM+huiuccu4p5mNVhWxWgJreIsRWuEmvbvpCu0q79XM='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'K2jkXlozk/+HPjs1RyDlBUHJx2Ky+ITavwBXPcFqtHY='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'L6w2d0w84FjTvFr+BhMfgu7dZAsGiOqUGmvvxIG3gvQ='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'LBFRdBacp7Ob6Ada2N7uCQ47DhiNT2KjUtwOA7oSm7I='],
 ['0LhkakIQDn36HZXI6ClQSO7W7jkpZAy+9MvYgPOZGrA=',
  'NDtSbLpLMkWyBAhfRimU73D/pKVWE6Z3Q2IEdmFuWNk=']]

{'CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=': ['張信哲 (Jeff Chang)', '董貞'],
 'o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=': ['BLACKPINK',
  'FUTURE BOUNCE',
  'Bekuh BOOM',
  'TEDDY'],
 'dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=': ['湯小康', 'S.H.E'],
 'W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=': ['Traditional', '貴族精選'],
 'kKJ2JNU5h8rphyW21ovC+RZU+yEHPM+3w85J37p7vEQ=': ['Joe Hisaishi', '貴族精選'],
 'N9vbanw7BSMoUgdfJlgX1aZPE1XZg8OS1wf88AQEcMc=': ['Jonathan Lee',
  '伍佰 & China Blue'],
 'GsCpr618xfveHYJdo+E5SybrpR906tsjLMeKyrCNw8s=': ['光良 (Michael Wong)', '光良'],
 'oTi7oINPX+rxoGp+3O6llSltQTl80jDqHoULfRoLcG4=': ['林俊傑 (JJ Lin)', 'JJ Lin'],
 'btcG03OHY3GNKWccPP0auvtSbhxog/kllIIOx5grE/k=': ['Vincent May',
  'Jacknife Lee',
  'Kodaline',
  'Mark Prendergast',
  'Stephen Garrigan',
  'Jason Boland'],
 'HulM/OaHgD5kUyjNQjDUf8VZdsy7h4EJUIff79Cifwo=': ['陳偉強',
  'D.L 羅時豐 (Daniel Lo)'],
 'wypPzqFNdUJAqyBVxmFGaK4z7krUNWr5YqA0q0wi9eE=': ['白安', '白安 (Ann)'],
 'fAZLdfQaLG76a6Ei4alt1eSjBM9rshQkiQEC6+n+