### Music Recommendation Sandbox

In [10]:
import numpy as np
import pandas as pd

# SKLEARN
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [11]:
INPUT_DATA_PATH = '/Users/youngtodd/kbox_music_kaggle/data/'

# Column dtypes shared by the train and test interaction logs.
interaction_dtypes = {
    'msno': 'category',
    'source_system_tab': 'category',
    'source_screen_name': 'category',
    'source_type': 'category',
    'song_id': 'category',
}

# Test interactions
df_test = pd.read_csv(INPUT_DATA_PATH + 'test.csv', dtype=interaction_dtypes)

# Train interactions: same columns plus the 'target' label read as uint8
df_train = pd.read_csv(
    INPUT_DATA_PATH + 'train.csv',
    dtype={**interaction_dtypes, 'target': np.uint8},
)

# Member attributes; the two date columns are parsed into datetimes on load
member_dtypes = {
    'city': 'category',
    'bd': np.uint8,
    'gender': 'category',
    'registered_via': 'category',
}
df_members = pd.read_csv(
    INPUT_DATA_PATH + 'members.csv',
    dtype=member_dtypes,
    parse_dates=['registration_init_time', 'expiration_date'],
)

# Song attributes
song_dtypes = {
    'genre_ids': 'category',
    'language': 'category',
    'artist_name': 'category',
    'composer': 'category',
    'lyricist': 'category',
    'song_id': 'category',
}
df_songs = pd.read_csv(INPUT_DATA_PATH + 'songs.csv', dtype=song_dtypes)

In [12]:
# Extra per-song information; read with pandas' default dtype inference.
songs_extra = pd.read_csv(f"{INPUT_DATA_PATH}song_extra_info.csv")

def isrc_to_year(isrc):
    """Derive a four-digit year from the two-digit year field of an ISRC code.

    The characters at slice ``[5:7]`` are parsed as a two-digit year. Values
    greater than 17 are mapped to 19xx; all other values are mapped to 20xx.

    Parameters
    ----------
    isrc : str or any
        The ISRC code. Any non-string value (for example the NaN pandas uses
        for a missing code) is treated as missing.

    Returns
    -------
    int or float
        The derived year, or ``np.nan`` when the code is missing or its year
        field cannot be parsed as an integer.
    """
    if not isinstance(isrc, str):
        return np.nan

    try:
        two_digit_year = int(isrc[5:7])
    except ValueError:
        # Truncated or malformed code: report the year as missing rather than
        # letting one bad row abort a whole Series.apply() call.
        return np.nan

    century = 1900 if two_digit_year > 17 else 2000
    return century + two_digit_year
        
# Compute the year for every song from its ISRC code.
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)

# Only song_id and song_year are needed for the join below.
songs_extra = songs_extra.drop(columns=['isrc', 'name'])

# Left joins keep every interaction row, including songs with no extra info.
df_train = pd.merge(df_train, songs_extra, how='left', on='song_id')
df_test = pd.merge(df_test, songs_extra, how='left', on='song_id')


In [13]:
# Length of each membership: expiration date minus registration date, in whole days
membership_span = df_members['expiration_date'] - df_members['registration_init_time']
df_members['membership_days'] = membership_span.dt.days.astype(int)

# The two date fields are no longer needed once the day count has been derived
df_members = df_members.drop(columns=['registration_init_time', 'expiration_date'])

In [14]:
# Value substituted for a missing song_length after the left join with the songs table.
DEFAULT_SONG_LENGTH = 200000


def _attach_members_and_songs(interactions, members, songs):
    """Left-join member and song attributes onto an interactions frame.

    Parameters
    ----------
    interactions : pd.DataFrame
        Frame with at least the 'msno' and 'song_id' columns.
    members : pd.DataFrame
        Member attributes, joined on 'msno'.
    songs : pd.DataFrame
        Song attributes including 'song_length', joined on 'song_id'.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows as ``interactions`` plus the member and
        song columns. 'msno' and 'song_id' are re-cast to category after each
        merge, and 'song_length' is filled with DEFAULT_SONG_LENGTH where
        missing and stored as uint32.
    """
    merged = pd.merge(left=interactions, right=members, how='left', on='msno')
    merged['msno'] = merged['msno'].astype('category')

    merged = pd.merge(left=merged, right=songs, how='left', on='song_id')
    # Assign the filled column back explicitly: a chained
    # `frame.song_length.fillna(..., inplace=True)` may operate on a temporary
    # copy, leaving NaNs in the frame that the integer cast below cannot handle.
    merged['song_length'] = (
        merged['song_length'].fillna(DEFAULT_SONG_LENGTH).astype(np.uint32)
    )
    merged['song_id'] = merged['song_id'].astype('category')
    return merged


# Apply the same enrichment to the test and train frames
df_test = _attach_members_and_songs(df_test, df_members, df_songs)
df_train = _attach_members_and_songs(df_train, df_members, df_songs)

# Release memory
del df_members
del df_songs

In [36]:
# Hold out as many trailing rows of df_train as there are rows in df_test;
# everything before them becomes the training part.
holdout_rows = len(df_test)
train_rows = len(df_train) - holdout_rows

train = df_train.head(train_rows)
val = df_train.tail(holdout_rows)

In [11]:
import lightgbm as lgb

In [17]:
# Create a Cross Validation with 3 splits
kf = KFold(n_splits=3)

# Running SUM of the per-fold test predictions (one entry per test row).
# It is not averaged in this cell; divide by the number of folds before use.
predictions = np.zeros(shape=[len(df_test)])

# Loop-invariant inputs, built once instead of on every fold.
train_features = df_train.drop(['target'], axis=1)
train_target = df_train['target']
test_features = df_test.drop(['id'], axis=1)

# The dict previously listed 'metric' twice; the later 'auc' entry silently
# replaced 'binary_logloss', so only the effective value is kept.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.1,
    'verbose': 0,
    'num_leaves': 108,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 128,
    'max_depth': 10,
    'metric': 'auc',
}

# Previously 'num_rounds': 200 sat in params while 100 was passed to lgb.train;
# the recorded log runs past iteration 100, so 200 is the value that took effect.
NUM_BOOST_ROUND = 200

# For each KFold
for train_indices, validate_indices in kf.split(df_train):
    # kf.split yields positional indices, so select rows with .iloc; .loc would
    # only agree with them while the frame keeps a default 0..n-1 index.
    train_data = lgb.Dataset(train_features.iloc[train_indices],
                             label=train_target.iloc[train_indices])
    val_data = lgb.Dataset(train_features.iloc[validate_indices],
                           label=train_target.iloc[validate_indices])

    # Train the model, evaluating on the held-out fold
    bst = lgb.train(params, train_data, num_boost_round=NUM_BOOST_ROUND,
                    valid_sets=[val_data])

    # Accumulate this fold's test predictions into the running sum
    predictions += bst.predict(test_features)



[1]	valid_0's auc: 0.684222
[2]	valid_0's auc: 0.691467
[3]	valid_0's auc: 0.694714
[4]	valid_0's auc: 0.696836
[5]	valid_0's auc: 0.699719
[6]	valid_0's auc: 0.702587
[7]	valid_0's auc: 0.707606
[8]	valid_0's auc: 0.713453
[9]	valid_0's auc: 0.717344
[10]	valid_0's auc: 0.719286
[11]	valid_0's auc: 0.720562
[12]	valid_0's auc: 0.720995
[13]	valid_0's auc: 0.722194
[14]	valid_0's auc: 0.724534
[15]	valid_0's auc: 0.72526
[16]	valid_0's auc: 0.726134
[17]	valid_0's auc: 0.727197
[18]	valid_0's auc: 0.72809
[19]	valid_0's auc: 0.728917
[20]	valid_0's auc: 0.72986
[21]	valid_0's auc: 0.730415
[22]	valid_0's auc: 0.732331
[23]	valid_0's auc: 0.732712
[24]	valid_0's auc: 0.733179
[25]	valid_0's auc: 0.733333
[26]	valid_0's auc: 0.733882
[27]	valid_0's auc: 0.734493
[28]	valid_0's auc: 0.735409
[29]	valid_0's auc: 0.735578
[30]	valid_0's auc: 0.735798
[31]	valid_0's auc: 0.736187
[32]	valid_0's auc: 0.736772
[33]	valid_0's auc: 0.737295
[34]	valid_0's auc: 0.737509
[35]	valid_0's auc: 0.7376

[82]	valid_0's auc: 0.694742
[83]	valid_0's auc: 0.694827
[84]	valid_0's auc: 0.694887
[85]	valid_0's auc: 0.694972
[86]	valid_0's auc: 0.695115
[87]	valid_0's auc: 0.695221
[88]	valid_0's auc: 0.695333
[89]	valid_0's auc: 0.695316
[90]	valid_0's auc: 0.695446
[91]	valid_0's auc: 0.69553
[92]	valid_0's auc: 0.695679
[93]	valid_0's auc: 0.695772
[94]	valid_0's auc: 0.695859
[95]	valid_0's auc: 0.695948
[96]	valid_0's auc: 0.696049
[97]	valid_0's auc: 0.696142
[98]	valid_0's auc: 0.696193
[99]	valid_0's auc: 0.696234
[100]	valid_0's auc: 0.696316
[101]	valid_0's auc: 0.696413
[102]	valid_0's auc: 0.696381
[103]	valid_0's auc: 0.696452
[104]	valid_0's auc: 0.696569
[105]	valid_0's auc: 0.696648
[106]	valid_0's auc: 0.696743
[107]	valid_0's auc: 0.696862
[108]	valid_0's auc: 0.696907
[109]	valid_0's auc: 0.696984
[110]	valid_0's auc: 0.697098
[111]	valid_0's auc: 0.697164
[112]	valid_0's auc: 0.69745
[113]	valid_0's auc: 0.697499
[114]	valid_0's auc: 0.697615
[115]	valid_0's auc: 0.697653


[161]	valid_0's auc: 0.655467
[162]	valid_0's auc: 0.655513
[163]	valid_0's auc: 0.655543
[164]	valid_0's auc: 0.655584
[165]	valid_0's auc: 0.655601
[166]	valid_0's auc: 0.655628
[167]	valid_0's auc: 0.655649
[168]	valid_0's auc: 0.655686
[169]	valid_0's auc: 0.65574
[170]	valid_0's auc: 0.655757
[171]	valid_0's auc: 0.655794
[172]	valid_0's auc: 0.655853
[173]	valid_0's auc: 0.655877
[174]	valid_0's auc: 0.655897
[175]	valid_0's auc: 0.655928
[176]	valid_0's auc: 0.655975
[177]	valid_0's auc: 0.656031
[178]	valid_0's auc: 0.656066
[179]	valid_0's auc: 0.656075
[180]	valid_0's auc: 0.656162
[181]	valid_0's auc: 0.656225
[182]	valid_0's auc: 0.656273
[183]	valid_0's auc: 0.656314
[184]	valid_0's auc: 0.65637
[185]	valid_0's auc: 0.656416
[186]	valid_0's auc: 0.656422
[187]	valid_0's auc: 0.656474
[188]	valid_0's auc: 0.656535
[189]	valid_0's auc: 0.65658
[190]	valid_0's auc: 0.656649
[191]	valid_0's auc: 0.656685
[192]	valid_0's auc: 0.656717
[193]	valid_0's auc: 0.656761
[194]	valid_0

In [18]:
# Inspect the accumulated predictions. The training loop adds one prediction per fold
# and nothing has divided by the fold count yet, so values above 1 are expected here.
predictions

array([ 1.61759444,  1.64994088,  0.5475667 , ...,  1.31299786,
        1.11887249,  1.24299766])

In [38]:
# Sanity check on the size of the hold-out slice taken from the tail of df_train.
# NOTE(review): 2556790 is presumably the row count of test.csv - confirm.
len(val) == 2556790

True

In [20]:
# Average the summed fold predictions into a NEW name, so re-running this cell
# cannot divide the running total a second time. The divisor comes from the
# KFold object instead of a hard-coded 3.
mean_predictions = predictions / kf.get_n_splits()

# Read the sample_submission CSV (INPUT_DATA_PATH already ends with '/')
submission = pd.read_csv(INPUT_DATA_PATH + 'sample_submission.csv')
# Set the target to our predictions
submission['target'] = mean_predictions
# Save the submission file
submission.to_csv('submission.csv', index=False)