In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from catboost import CatBoostClassifier, Pool

import sklearn.datasets
import gc

### Data Prep.

In [2]:
# train_X = pd.read_csv('music_data/train_X.csv')
train_X = pd.read_csv('music_data/train_X_date_converted.csv')
train_y = pd.read_csv('music_data/train_Y.csv')

# val_X = pd.read_csv('music_data/valid_X.csv')
val_X = pd.read_csv('music_data/valid_X_date_converted.csv')
val_y = pd.read_csv('music_data/valid_Y.csv')

# test_X = pd.read_csv('music_data/test_X.csv')
test_X = pd.read_csv('music_data/test_X_date_converted.csv')
test_y = pd.read_csv('music_data/test_Y.csv')

In [3]:
# train_tiny_X = pd.read_csv('music_data/train_tiny_X.csv')
train_tiny_X = train_X[:10000].copy()
train_tiny_y = train_y[:10000].copy()

val_tiny_X = val_X[:1000].copy()
#val_tiny_X = pd.read_csv('music_data/val_tiny_X_date_converted.csv')
val_tiny_y = val_y[:1000].copy()

In [11]:
# Observing reasonable date for date time conversion
# 2004 seems to be the earliest date. '1970-01-01' is essentially null value (1 in train).
print('train x', train_X['registration_init_time'].min(), train_X['expiration_date'].min())
print('val x', val_X['registration_init_time'].min(), val_X['expiration_date'].min())
print('test x', test_X['registration_init_time'].min(), test_X['expiration_date'].min())


# Not many records smaller than '2000-01-01'. Reasonable to not treat them specially
print(train_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in val set')

print(train_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in val set')

train x 2004-03-26 1970-01-01
val x 2004-03-26 1970-01-01
test x 2004-03-26 2004-10-16
1 1970-01-01 in train set
2 1970-01-01 in val set
1 smaller than 2000-01-01 in train set
2 smaller than 2000-01-01 in val set


In [None]:
def date_to_int(column, base_date=datetime.strptime('2000-01-01', "%Y-%m-%d")):
    """
    Convert date to day counts since base_date for given columns.
    
    :param column: pandas column containing date representations as str value
    :param base_date: base date from which date is counted. i.e. 2000-01-02 will be day '1' comparing to '2000-01-01'
    """
    
    def date_diff(date):
        date = datetime.strptime(date, "%Y-%m-%d")
        return (date - base_date).days
    
    return column.apply(date_diff)

def convert_date_columns_to_int(dataframes, cols=['registration_init_time', 'expiration_date'], saving=True):
    count = 0
    total = len(dataframes)
    for df_name, df in dataframes.items():
        count += 1
        print("Starting {} ...".format(df_name))
        for col in cols:
            df[col+'_int'] = date_to_int(df[col])
        df = df.drop(columns=cols)
        dataframes[df_name] = df
        
        if saving:
            print("Saving {} ...".format(df_name))
            df.to_csv('music_data/'+ df_name + '_date_converted.csv')
        
        print("====== Done {} / {} ======".format(count, total))
    return dataframes


In [None]:
# Converting all data
converted_dfs = convert_date_columns_to_int({
    'train_X': train_X,
    'valid_X': val_X,
    'test_X': test_X,
    'train_tiny_X': train_tiny_X,
    'val_tiny_X': val_tiny_X,
})
train_X = converted_dfs['train_X']
val_X = converted_dfs['valid_X']
test_X = converted_dfs['test_X']
train_tiny_X = converted_dfs['train_tiny_X']
val_tiny_X = converted_dfs['val_tiny_X']

Starting val_tiny_X ...
Saving val_tiny_X ...
Starting train_X ...
Saving train_X ...


## Gradient Boosting

### LightGBM

In [19]:
def run_lgbc(train_X, train_y, val_X, val_y):
    params = {
        "objective" : "binary",
        "metric" : "binary_logloss", 
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)

    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return model, pred_val_y

#### Testing on tiny dataset

In [31]:
modelC, pred_val_y = run_lgbc(train_tiny_X, train_tiny_y['target'], val_tiny_X, val_tiny_y['target'])
print(classification_report(val_tiny_y['target'], modelC.predict(val_tiny_X)>0.5))
print(roc_auc_score(val_tiny_y['target'], modelC.predict(val_tiny_X)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.651618
Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.639451
              precision    recall  f1-score   support

         0.0       0.58      0.69      0.63       477
         1.0       0.66      0.55      0.60       523

   micro avg       0.62      0.62      0.62      1000
   macro avg       0.62      0.62      0.62      1000
weighted avg       0.62      0.62      0.61      1000

0.6819830761892165


#### Running large data

In [32]:
modelC, pred_val_y = run_lgbc(train_X, train_y['target'], val_X, val_y['target'])
print(classification_report(val_y['target'], modelC.predict(val_X)>0.5))
print(roc_auc_score(val_y['target'], modelC.predict(val_X)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.6224
[200]	valid_0's binary_logloss: 0.616636
[300]	valid_0's binary_logloss: 0.612586
[400]	valid_0's binary_logloss: 0.609499
[500]	valid_0's binary_logloss: 0.606984
[600]	valid_0's binary_logloss: 0.604466
[700]	valid_0's binary_logloss: 0.602129
[800]	valid_0's binary_logloss: 0.60016
[900]	valid_0's binary_logloss: 0.598241
[1000]	valid_0's binary_logloss: 0.596447
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.596447
              precision    recall  f1-score   support

         0.0       0.67      0.70      0.68    732500
         1.0       0.69      0.66      0.67    742961

   micro avg       0.68      0.68      0.68   1475461
   macro avg       0.68      0.68      0.68   1475461
weighted avg       0.68      0.68      0.68   1475461

0.7449332780580912


### CatBoosting

In [7]:
train_X.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,time,registration_init_time_int,expiration_date_int
0,0,2942719,8145,253733,16,6,8,267517.0,371,4252,8389,2681,9,0,0,2,1,0.296221,6193,6196
1,1,4875524,5224,145235,16,6,8,200620.0,371,34892,74276,26024,6,3,41,1,2,0.490781,5000,6475
2,2,6589819,5474,22231,11,0,7,213342.0,371,20609,27775,9110,9,0,0,2,1,0.663346,6150,6436
3,3,1172060,23177,70181,8,3,3,262246.0,371,44425,83027,34734,2,0,0,2,0,0.117982,5181,6498
4,4,2069395,3269,128141,12,2,2,310753.0,371,42400,81151,32836,2,0,0,2,2,0.20831,4393,6275


In [13]:
train_X.info()
val_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4426382 entries, 0 to 4426381
Data columns (total 20 columns):
Unnamed: 0                    int64
Unnamed: 0.1                  int64
msno                          int64
song_id                       int64
source_screen_name            int64
source_system_tab             int64
source_type                   int64
song_length                   float64
genre_ids                     int64
artist_name                   int64
composer                      int64
lyricist                      int64
language                      int64
city                          int64
bd                            int64
gender                        int64
registered_via                int64
time                          float64
registration_init_time_int    int64
expiration_date_int           int64
dtypes: float64(2), int64(18)
memory usage: 675.4 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1475461 entries, 0 to 1475460
Data columns (total 20 columns

In [9]:
val_X.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,time,registration_init_time_int,expiration_date_int
0,0,3799309,31795,334708,8,3,4,228728.0,364,40825,76813,30697,2,4,36,1,3,0.382447,3185,6849
1,1,4507937,27540,263438,8,3,4,266448.0,371,44911,84600,28968,2,3,18,1,0,0.453779,6198,6475
2,2,1558036,9658,205474,8,3,4,218592.0,371,43531,84801,29868,2,1,25,1,3,0.156835,4713,6779
3,3,2350700,33013,201734,8,3,3,263779.0,371,28850,59304,26024,9,2,0,2,3,0.236627,5359,6460
4,4,5334751,22614,347771,22,0,9,273554.0,608,41970,78847,34456,2,0,0,2,2,0.537008,6202,6476


In [9]:
train_X.shape

(4426382, 20)

In [4]:
cat_features = [2,3,4,5,6,8,9,10,11,12,13,14,15,16]

In [90]:
def run_cb(train_X, train_y, val_X, val_y, cat_features,iterations=300):
    model_cb = cb.CatBoostClassifier(
        iterations=iterations, 
        learning_rate=0.1, 
        depth=6, 
        loss_function='Logloss',
        thread_count=30,
        random_seed=229,
        use_best_model = True,
        l2_leaf_reg = 2
    )
    eval_dataset = Pool(val_X,val_y,cat_features=cat_features)
    
    model = model_cb.fit(train_X, train_y,cat_features,eval_set=eval_dataset,verbose_eval=25,early_stopping_rounds = 100)

    pred_val_y = model.predict(val_X)
    return model, pred_val_y

In [91]:
# Tiny dataset
model_cb, pred_val_y = run_cb(train_tiny_X, train_tiny_y['target'], val_tiny_X, val_tiny_y['target'],cat_features)


0:	learn: 0.6831207	test: 0.6836009	best: 0.6836009 (0)	total: 155ms	remaining: 46.3s
25:	learn: 0.6197703	test: 0.6200814	best: 0.6200814 (25)	total: 2.46s	remaining: 26s
50:	learn: 0.6059473	test: 0.6125745	best: 0.6123165 (49)	total: 4.54s	remaining: 22.2s
75:	learn: 0.5958593	test: 0.6109585	best: 0.6109585 (75)	total: 6.84s	remaining: 20.2s
100:	learn: 0.5861887	test: 0.6098933	best: 0.6094656 (96)	total: 9.03s	remaining: 17.8s
125:	learn: 0.5759205	test: 0.6099270	best: 0.6094656 (96)	total: 11.2s	remaining: 15.5s
150:	learn: 0.5659411	test: 0.6100231	best: 0.6094656 (96)	total: 13.6s	remaining: 13.4s
175:	learn: 0.5563937	test: 0.6119087	best: 0.6094656 (96)	total: 16.1s	remaining: 11.3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.609465602
bestIteration = 96

Shrink model to first 97 iterations.


In [93]:
#Analysis
print(classification_report(val_tiny_y['target'], pred_val_y>0.5))
print(roc_auc_score(val_tiny_y['target'],pred_val_y))

              precision    recall  f1-score   support

         0.0       0.65      0.74      0.69       477
         1.0       0.73      0.63      0.68       523

   micro avg       0.68      0.68      0.68      1000
   macro avg       0.69      0.69      0.68      1000
weighted avg       0.69      0.68      0.68      1000

0.6854163409775085


In [101]:
model_cb.get_feature_importance()

array([ 2.01723401,  8.75170032,  3.26461969, 11.93417377,  6.74938415,
       11.20750328, 12.10402081,  2.41687205,  3.02796949,  4.01081244,
        2.43845576,  0.73392845,  3.13813829,  3.8346574 ,  4.28264634,
        2.37046235,  2.26047314,  8.30894994,  1.86662922,  5.2813691 ])

In [95]:
#save score
np.savez('output/catboost_small.npz', val_target=val_tiny_y['target'], val_preds= pred_val_y)

In [97]:
#load score
val_scores = np.load('output/catboost_small.npz')
val_scores.files

['val_target', 'val_preds']

In [99]:
val_scores['val_preds'][:10]

array([0., 1., 1., 1., 0., 0., 1., 0., 0., 0.])

In [29]:
# Full dataset
model_cb, pred_val_y = run_cb(train_X, train_y['target'], val_X, val_y['target'], 200)
print(classification_report(val_y['target'], model_cb.predict(val_X)>0.5))
print(roc_auc_score(val_y['target'], model_cb.predict(val_X)))

0:	learn: 0.6899809	total: 1.72s	remaining: 5m 41s
1:	learn: 0.6870093	total: 3.39s	remaining: 5m 35s
2:	learn: 0.6842562	total: 4.95s	remaining: 5m 24s
3:	learn: 0.6816333	total: 6.52s	remaining: 5m 19s
4:	learn: 0.6791754	total: 8.15s	remaining: 5m 17s
5:	learn: 0.6768604	total: 9.71s	remaining: 5m 13s
6:	learn: 0.6746900	total: 11.3s	remaining: 5m 12s
7:	learn: 0.6726329	total: 13s	remaining: 5m 11s
8:	learn: 0.6707317	total: 14.5s	remaining: 5m 8s
9:	learn: 0.6689819	total: 16.1s	remaining: 5m 6s
10:	learn: 0.6672654	total: 17.7s	remaining: 5m 4s
11:	learn: 0.6657201	total: 19.2s	remaining: 5m 1s
12:	learn: 0.6642033	total: 20.8s	remaining: 4m 59s
13:	learn: 0.6627350	total: 22.5s	remaining: 4m 58s
14:	learn: 0.6614417	total: 24s	remaining: 4m 56s
15:	learn: 0.6602177	total: 25.7s	remaining: 4m 55s
16:	learn: 0.6590673	total: 27.3s	remaining: 4m 53s
17:	learn: 0.6579760	total: 28.9s	remaining: 4m 52s
18:	learn: 0.6569631	total: 30.5s	remaining: 4m 50s
19:	learn: 0.6560151	total: 32

156:	learn: 0.6320324	total: 4m 13s	remaining: 1m 9s
157:	learn: 0.6319765	total: 4m 15s	remaining: 1m 7s
158:	learn: 0.6319394	total: 4m 16s	remaining: 1m 6s
159:	learn: 0.6319015	total: 4m 18s	remaining: 1m 4s
160:	learn: 0.6318600	total: 4m 20s	remaining: 1m 3s
161:	learn: 0.6318285	total: 4m 21s	remaining: 1m 1s
162:	learn: 0.6317866	total: 4m 23s	remaining: 59.8s
163:	learn: 0.6317481	total: 4m 25s	remaining: 58.2s
164:	learn: 0.6317012	total: 4m 26s	remaining: 56.6s
165:	learn: 0.6316597	total: 4m 28s	remaining: 55s
166:	learn: 0.6316277	total: 4m 30s	remaining: 53.4s
167:	learn: 0.6315804	total: 4m 31s	remaining: 51.8s
168:	learn: 0.6315378	total: 4m 33s	remaining: 50.1s
169:	learn: 0.6315068	total: 4m 34s	remaining: 48.5s
170:	learn: 0.6314652	total: 4m 36s	remaining: 46.9s
171:	learn: 0.6314217	total: 4m 38s	remaining: 45.3s
172:	learn: 0.6313962	total: 4m 39s	remaining: 43.6s
173:	learn: 0.6313657	total: 4m 41s	remaining: 42s
174:	learn: 0.6313379	total: 4m 42s	remaining: 40.

In [30]:
model_cb, pred_val_y = run_cb(train_X, train_y['target'], val_X, val_y['target'], 20)
print(classification_report(val_y['target'], model_cb.predict(val_X)>0.5))
print(roc_auc_score(val_y['target'], model_cb.predict(val_X)))

0:	learn: 0.6899809	total: 1.68s	remaining: 32s
1:	learn: 0.6870093	total: 3.28s	remaining: 29.5s
2:	learn: 0.6842562	total: 4.81s	remaining: 27.3s
3:	learn: 0.6816333	total: 6.37s	remaining: 25.5s
4:	learn: 0.6791754	total: 7.98s	remaining: 23.9s
5:	learn: 0.6768604	total: 9.52s	remaining: 22.2s
6:	learn: 0.6746900	total: 11.1s	remaining: 20.6s
7:	learn: 0.6726329	total: 12.7s	remaining: 19.1s
8:	learn: 0.6707317	total: 14.3s	remaining: 17.4s
9:	learn: 0.6689819	total: 15.8s	remaining: 15.8s
10:	learn: 0.6672654	total: 17.4s	remaining: 14.2s
11:	learn: 0.6657201	total: 18.9s	remaining: 12.6s
12:	learn: 0.6642033	total: 20.5s	remaining: 11s
13:	learn: 0.6627350	total: 22.1s	remaining: 9.47s
14:	learn: 0.6614417	total: 23.7s	remaining: 7.9s
15:	learn: 0.6602177	total: 25.4s	remaining: 6.34s
16:	learn: 0.6590673	total: 26.9s	remaining: 4.75s
17:	learn: 0.6579760	total: 28.5s	remaining: 3.17s
18:	learn: 0.6569631	total: 30.1s	remaining: 1.58s
19:	learn: 0.6560151	total: 31.7s	remaining: 0