In [1]:
# NOTE(review): `spark` is not defined by any cell visible in this notebook;
# presumably the kernel injects a Spark session object -- confirm. No later
# cell shown here uses it.
spark

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Preprocessing, step 1: read the four raw CSV tables.
# reference: https://github.com/VasiliyRubtsov/wsdm_music_recommendations/blob/master/pipeline.ipynb
date_columns = ['expiration_date', 'registration_init_time']

# Member table: the two date columns are parsed into datetimes on load.
user_data = pd.read_csv('music_data_raw/members.csv', parse_dates=date_columns)
# Song table.
item_data = pd.read_csv('music_data_raw/songs.csv')
# Interaction tables; the first column of test.csv becomes the frame index.
train_data = pd.read_csv('music_data_raw/train.csv')
test_data = pd.read_csv('music_data_raw/test.csv', index_col=0)

  mask |= (ar1 == a)


In [4]:
# Stack train and test rows (columns sorted by name), then left-join the song
# and member tables so every interaction row carries its side information.
all_data = (
    pd.concat([train_data, test_data], sort=True)
    .merge(item_data, on='song_id', how='left')
    .merge(user_data, on='msno', how='left')
)

In [5]:
# Preview the first five rows of the merged table.
all_data.head(5)

Unnamed: 0,msno,song_id,source_screen_name,source_system_tab,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,registration_init_time,expiration_date
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,Explore,explore,online-playlist,1.0,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,1,0,,7,2012-01-02,2017-10-05
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,Local playlist more,my library,local-playlist,1.0,284584.0,1259,Various Artists,,,52.0,13,24,female,9,2011-05-25,2017-09-11
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,Local playlist more,my library,local-playlist,1.0,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,13,24,female,9,2011-05-25,2017-09-11
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,Local playlist more,my library,local-playlist,1.0,255512.0,1019,Soundway,Kwadwo Donkoh,,-1.0,13,24,female,9,2011-05-25,2017-09-11
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,Explore,explore,online-playlist,1.0,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,1,0,,7,2012-01-02,2017-10-05


In [6]:
# Per column: does it contain at least one missing value?
all_data.isna().any()

msno                      False
song_id                   False
source_screen_name         True
source_system_tab          True
source_type                True
target                     True
song_length                True
genre_ids                  True
artist_name                True
composer                   True
lyricist                   True
language                   True
city                      False
bd                        False
gender                     True
registered_via            False
registration_init_time    False
expiration_date           False
dtype: bool

In [7]:
enc = LabelEncoder()

# Missing values are replaced by a sentinel before encoding: the string 'nan'
# for the first group of columns and -2 for the second group.
nan_filled_columns = [
    'msno', 'song_id', 'source_screen_name',
    'source_system_tab', 'source_type', 'genre_ids',
    'artist_name', 'composer', 'lyricist', 'gender',
]
minus_two_filled_columns = ['language', 'city', 'registered_via']

fill_by_column = {
    **dict.fromkeys(nan_filled_columns, 'nan'),
    **dict.fromkeys(minus_two_filled_columns, -2),
}

# The single encoder is re-fitted for every column (same order as listed
# above), so after the loop it holds the fit of the last column only.
for col, fill_value in fill_by_column.items():
    all_data[col] = enc.fit_transform(all_data[col].fillna(fill_value))


# Question: Why is this time column needed
# NOTE(review): presumably the row order of the raw data is chronological, so
# the normalised row position acts as a proxy timestamp -- confirm against the
# referenced pipeline.
all_data['time'] = all_data.index / all_data.shape[0]

# Number of labelled (train) rows; only the commented-out split below uses it.
n = len(train_data)
#train_data = all_data[:n]
#test_data = all_data[n:]

# train_data.to_hdf('music_data_raw/train_data.hdf', key='wsdm')
# test_data.to_hdf('music_data_raw/test_data.hdf', key='wsdm')

In [8]:
# Count the missing values left in each column after encoding.
all_data.isna().sum()

msno                            0
song_id                         0
source_screen_name              0
source_system_tab               0
source_type                     0
target                    2556790
song_length                   139
genre_ids                       0
artist_name                     0
composer                        0
lyricist                        0
language                        0
city                            0
bd                              0
gender                          0
registered_via                  0
registration_init_time          0
expiration_date                 0
time                            0
dtype: int64

In [9]:
# Map the literal strings "NaN" / 'NaT' to real missing values, then keep only
# rows without any missing value (rows whose target is missing are dropped as
# well). replace() returns a new frame, so all_data itself is left untouched.
all_data_no_na = all_data.replace(["NaN", 'NaT'], np.nan).dropna()
all_data_no_na.isna().sum()

msno                      0
song_id                   0
source_screen_name        0
source_system_tab         0
source_type               0
target                    0
song_length               0
genre_ids                 0
artist_name               0
composer                  0
lyricist                  0
language                  0
city                      0
bd                        0
gender                    0
registered_via            0
registration_init_time    0
expiration_date           0
time                      0
dtype: int64

In [10]:
# Show the first ten cleaned rows, then report how many rows survived.
display(all_data_no_na.head(10))
all_data_no_na.shape[0]

Unnamed: 0,msno,song_id,source_screen_name,source_system_tab,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,registration_init_time,expiration_date,time
0,9176,86884,7,1,7,1.0,206471.0,307,3784,16653,26024,9,0,0,2,2,2012-01-02,2017-10-05,0.0
1,19273,260594,8,3,4,1.0,284584.0,97,36867,74276,26024,9,11,24,0,3,2011-05-25,2017-09-11,1.006623e-07
2,19273,140755,8,3,4,1.0,225396.0,97,24601,51540,26024,9,11,24,0,3,2011-05-25,2017-09-11,2.013246e-07
3,19273,27577,8,3,4,1.0,255512.0,6,31651,41991,26024,1,11,24,0,3,2011-05-25,2017-09-11,3.019868e-07
4,9176,38706,7,1,7,1.0,187802.0,2,5190,9701,26024,9,0,0,2,2,2012-01-02,2017-10-05,4.026491e-07
5,9176,35087,7,1,7,1.0,247803.0,97,9452,63673,26024,9,0,0,2,2,2012-01-02,2017-10-05,5.033114e-07
6,19273,221459,8,3,4,1.0,229982.0,371,3348,66331,22941,6,11,24,0,3,2011-05-25,2017-09-11,6.039737e-07
7,9176,258674,7,1,7,1.0,181115.0,2,35342,67760,26024,9,0,0,2,2,2012-01-02,2017-10-05,7.046359e-07
8,31394,10642,8,3,3,1.0,278964.0,200,25767,60384,26024,9,13,26,1,3,2011-11-07,2018-03-04,8.052982e-07
9,31394,109197,8,3,3,1.0,257369.0,371,25767,60384,26024,9,13,26,1,3,2011-11-07,2018-03-04,9.059605e-07


7377304

In [11]:
# test_data = all_data.loc[all_data['target'].isnull()]
# train_data = all_data.loc[all_data['target'].isnull()==False]

In [12]:
# test_data.isnull().sum()

In [13]:
# train_data.isnull().sum()

In [14]:
# # y = all_data['target']
# # X = all_data.drop(['target'], axis = 1)

# real_test_X = test_data.drop(['target'], axis = 1)

# y = train_data[['target']]
# X = train_data.drop(['target'], axis = 1)

# train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.4,random_state=1)
# test_X, val_X, test_y, val_y = train_test_split(test_X,test_y,test_size=0.5,random_state=1)

In [15]:
# Create the modelling dataset from all_data_no_na: the label frame y holds
# only 'target', the feature frame X holds every other column.
X = all_data_no_na.drop(columns=['target'])
y = all_data_no_na[['target']]

# First split: 60 % train / 40 % hold-out.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.4, random_state=1
)
# Second split: the hold-out part is halved into test and validation sets.
test_X, val_X, test_y, val_y = train_test_split(
    test_X, test_y, test_size=0.5, random_state=1
)

In [17]:
# Share of positive labels (target == 1) in the training split.
# DataFrame.sum(axis=0) is called directly instead of np.sum(DataFrame): the
# NumPy wrapper forwards axis=None, which newer pandas releases flag as
# deprecated for DataFrame reductions. The result is still a one-element
# Series keyed by 'target'.
positive_rows = train_y[train_y['target'] == 1]
ratio_1 = positive_rows.sum(axis=0) / len(train_y)
print(ratio_1)

target    0.503572
dtype: float64


In [67]:
# Sanity check: features and labels of each split must have matching lengths.
# Printed in the order train, validation, test.
for features, labels in ((train_X, train_y), (val_X, val_y), (test_X, test_y)):
    print(len(features), len(labels))

4426382 4426382
1475461 1475461
1475461 1475461


In [20]:
# Add a tiny sample of the training split for quick debugging runs.
TINY_ROWS = 10000  # number of leading training rows kept in the sample
train_tiny_X = train_X[:TINY_ROWS]
train_tiny_y = train_y[:TINY_ROWS]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
Path('music_data').mkdir(parents=True, exist_ok=True)
train_tiny_X.to_csv('music_data/train_tiny_X.csv')
train_tiny_y.to_csv('music_data/train_tiny_Y.csv')
display(train_tiny_y)

Unnamed: 0,target
2942719,0.0
4875524,0.0
6589819,0.0
1172060,1.0
2069395,0.0
4966246,0.0
69897,1.0
933241,1.0
1122481,1.0
2680592,0.0


In [12]:
# train_data.to_csv('music_data_raw/train_data_updated.csv')
# test_data.to_csv('music_data_raw/test_data_updated.csv')

In [68]:
#real_test_X.to_csv('music_data/real_test_X_no_label.csv')
# Persist the train / test / validation feature and label frames as CSV files.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
Path('music_data').mkdir(parents=True, exist_ok=True)

# Target path -> frame, written in the listed order.
frames_by_path = {
    'music_data/train_X.csv': train_X,
    'music_data/test_X.csv': test_X,
    'music_data/valid_X.csv': val_X,
    'music_data/train_Y.csv': train_y,
    'music_data/test_Y.csv': test_y,
    'music_data/valid_Y.csv': val_y,
}
for csv_path, frame in frames_by_path.items():
    frame.to_csv(csv_path)