In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from catboost import CatBoostClassifier, Pool

import sklearn.datasets
import gc

### Data Prep.

In [2]:
# train_X = pd.read_csv('music_data/train_X.csv')
train_X = pd.read_csv('music_data/train_X_date_converted.csv')
train_y = pd.read_csv('music_data/train_Y.csv')

# val_X = pd.read_csv('music_data/valid_X.csv')
val_X = pd.read_csv('music_data/valid_X_date_converted.csv')
val_y = pd.read_csv('music_data/valid_Y.csv')

# test_X = pd.read_csv('music_data/test_X.csv')
test_X = pd.read_csv('music_data/test_X_date_converted.csv')
test_y = pd.read_csv('music_data/test_Y.csv')

In [3]:
# train_tiny_X = pd.read_csv('music_data/train_tiny_X.csv')
train_tiny_X = train_X[:10000].copy()
train_tiny_y = train_y[:10000].copy()

val_tiny_X = val_X[:1000].copy()
#val_tiny_X = pd.read_csv('music_data/val_tiny_X_date_converted.csv')
val_tiny_y = val_y[:1000].copy()

In [11]:
# Observing reasonable date for date time conversion
# 2004 seems to be the earliest date. '1970-01-01' is essentially null value (1 in train).
print('train x', train_X['registration_init_time'].min(), train_X['expiration_date'].min())
print('val x', val_X['registration_init_time'].min(), val_X['expiration_date'].min())
print('test x', test_X['registration_init_time'].min(), test_X['expiration_date'].min())


# Not many records smaller than '2000-01-01'. Reasonable to not treat them specially
print(train_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in val set')

print(train_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in val set')

train x 2004-03-26 1970-01-01
val x 2004-03-26 1970-01-01
test x 2004-03-26 2004-10-16
1 1970-01-01 in train set
2 1970-01-01 in val set
1 smaller than 2000-01-01 in train set
2 smaller than 2000-01-01 in val set


In [None]:
def date_to_int(column, base_date=datetime.strptime('2000-01-01', "%Y-%m-%d")):
    """
    Convert date to day counts since base_date for given columns.
    
    :param column: pandas column containing date representations as str value
    :param base_date: base date from which date is counted. i.e. 2000-01-02 will be day '1' comparing to '2000-01-01'
    """
    
    def date_diff(date):
        date = datetime.strptime(date, "%Y-%m-%d")
        return (date - base_date).days
    
    return column.apply(date_diff)

def convert_date_columns_to_int(dataframes, cols=['registration_init_time', 'expiration_date'], saving=True):
    count = 0
    total = len(dataframes)
    for df_name, df in dataframes.items():
        count += 1
        print("Starting {} ...".format(df_name))
        for col in cols:
            df[col+'_int'] = date_to_int(df[col])
        df = df.drop(columns=cols)
        dataframes[df_name] = df
        
        if saving:
            print("Saving {} ...".format(df_name))
            df.to_csv('music_data/'+ df_name + '_date_converted.csv')
        
        print("====== Done {} / {} ======".format(count, total))
    return dataframes


In [None]:
# Converting all data
converted_dfs = convert_date_columns_to_int({
    'train_X': train_X,
    'valid_X': val_X,
    'test_X': test_X,
    'train_tiny_X': train_tiny_X,
    'val_tiny_X': val_tiny_X,
})
train_X = converted_dfs['train_X']
val_X = converted_dfs['valid_X']
test_X = converted_dfs['test_X']
train_tiny_X = converted_dfs['train_tiny_X']
val_tiny_X = converted_dfs['val_tiny_X']

Starting val_tiny_X ...
Saving val_tiny_X ...
Starting train_X ...
Saving train_X ...


## Gradient Boosting

### LightGBM

In [19]:
def run_lgbc(train_X, train_y, val_X, val_y):
    params = {
        "objective" : "binary",
        "metric" : "binary_logloss", 
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)

    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return model, pred_val_y

#### Testing on tiny dataset

In [31]:
modelC, pred_val_y = run_lgbc(train_tiny_X, train_tiny_y['target'], val_tiny_X, val_tiny_y['target'])
print(classification_report(val_tiny_y['target'], modelC.predict(val_tiny_X)>0.5))
print(roc_auc_score(val_tiny_y['target'], modelC.predict(val_tiny_X)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.651618
Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.639451
              precision    recall  f1-score   support

         0.0       0.58      0.69      0.63       477
         1.0       0.66      0.55      0.60       523

   micro avg       0.62      0.62      0.62      1000
   macro avg       0.62      0.62      0.62      1000
weighted avg       0.62      0.62      0.61      1000

0.6819830761892165


#### Running large data

In [32]:
modelC, pred_val_y = run_lgbc(train_X, train_y['target'], val_X, val_y['target'])
print(classification_report(val_y['target'], modelC.predict(val_X)>0.5))
print(roc_auc_score(val_y['target'], modelC.predict(val_X)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.6224
[200]	valid_0's binary_logloss: 0.616636
[300]	valid_0's binary_logloss: 0.612586
[400]	valid_0's binary_logloss: 0.609499
[500]	valid_0's binary_logloss: 0.606984
[600]	valid_0's binary_logloss: 0.604466
[700]	valid_0's binary_logloss: 0.602129
[800]	valid_0's binary_logloss: 0.60016
[900]	valid_0's binary_logloss: 0.598241
[1000]	valid_0's binary_logloss: 0.596447
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.596447
              precision    recall  f1-score   support

         0.0       0.67      0.70      0.68    732500
         1.0       0.69      0.66      0.67    742961

   micro avg       0.68      0.68      0.68   1475461
   macro avg       0.68      0.68      0.68   1475461
weighted avg       0.68      0.68      0.68   1475461

0.7449332780580912


### CatBoosting

In [4]:
train_X.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,time,registration_init_time_int,expiration_date_int
0,4014890,7039846,785,141648,18,7,7,219454.0,371,44988,20734,4032,2,0,0,2,2,0.708647,4155,6465
1,865446,2067552,9176,156565,16,6,8,270001.0,127,8947,74276,26024,9,0,0,2,2,0.208124,4384,6487
2,1151370,5503771,31040,43786,11,0,7,265822.0,371,40039,84290,35947,2,12,33,1,3,0.554022,2890,6479
3,575108,639057,26412,387047,8,3,4,266031.0,364,42827,71260,26024,2,0,0,2,2,0.064329,6153,6486
4,460130,6368926,24075,136842,4,0,7,280346.0,364,43072,81070,31877,2,0,0,2,3,0.641111,5634,6874


In [5]:
val_X.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,time,registration_init_time_int,expiration_date_int
0,481907,1914774,16381,209587,11,3,3,296542.0,364,42827,72356,26024,2,0,0,1,2,0.192746,4884,6467
1,1001872,2221821,22888,410176,8,3,3,333740.0,127,22395,74276,26024,9,13,26,1,3,0.223654,4148,6458
2,2468130,5800477,28551,102966,8,3,4,82464.0,608,27715,39002,26024,1,0,0,2,1,0.583889,6142,6487
3,1795430,349540,33321,181586,8,3,4,235075.0,371,36867,74276,26024,9,0,0,2,2,0.035185,6091,6486
4,621167,2966646,16427,73221,8,3,3,173557.0,364,43595,85837,34739,2,0,0,2,2,0.298629,6101,6465


In [6]:
train_X.shape

(552909, 20)

In [13]:
cat_features = [2,3,4,5,6,8,9,10,11,12,13,14,15,16]

In [87]:
def run_cb(train_X, train_y, val_X, val_y, cat_features, iterations=20, learning_rate=0.1, depth=6):
    model_cb = cb.CatBoostClassifier(
        iterations=iterations, 
        learning_rate=learning_rate, 
        depth=depth, 
        loss_function='Logloss',
        thread_count=30,
        random_seed=229,
        use_best_model = True,
        l2_leaf_reg = 2
        *kwargs
    )
    eval_dataset = Pool(val_X,val_y,cat_features=cat_features)
    
    model = model_cb.fit(train_X, train_y, cat_features, eval_set=eval_dataset,
                         verbose_eval=25, early_stopping_rounds = 10)

    pred_val_y = model.predict(val_X)
    return model, pred_val_y

### Testing on Tiny Dataset

In [91]:
# Tiny dataset
model_cb, pred_val_y = run_cb(train_tiny_X, train_tiny_y['target'], val_tiny_X, val_tiny_y['target'],cat_features)


0:	learn: 0.6831207	test: 0.6836009	best: 0.6836009 (0)	total: 155ms	remaining: 46.3s
25:	learn: 0.6197703	test: 0.6200814	best: 0.6200814 (25)	total: 2.46s	remaining: 26s
50:	learn: 0.6059473	test: 0.6125745	best: 0.6123165 (49)	total: 4.54s	remaining: 22.2s
75:	learn: 0.5958593	test: 0.6109585	best: 0.6109585 (75)	total: 6.84s	remaining: 20.2s
100:	learn: 0.5861887	test: 0.6098933	best: 0.6094656 (96)	total: 9.03s	remaining: 17.8s
125:	learn: 0.5759205	test: 0.6099270	best: 0.6094656 (96)	total: 11.2s	remaining: 15.5s
150:	learn: 0.5659411	test: 0.6100231	best: 0.6094656 (96)	total: 13.6s	remaining: 13.4s
175:	learn: 0.5563937	test: 0.6119087	best: 0.6094656 (96)	total: 16.1s	remaining: 11.3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.609465602
bestIteration = 96

Shrink model to first 97 iterations.


In [93]:
#Analysis
print(classification_report(val_tiny_y['target'], pred_val_y>0.5))
print(roc_auc_score(val_tiny_y['target'],pred_val_y))

              precision    recall  f1-score   support

         0.0       0.65      0.74      0.69       477
         1.0       0.73      0.63      0.68       523

   micro avg       0.68      0.68      0.68      1000
   macro avg       0.69      0.69      0.68      1000
weighted avg       0.69      0.68      0.68      1000

0.6854163409775085


In [101]:
model_cb.get_feature_importance()

array([ 2.01723401,  8.75170032,  3.26461969, 11.93417377,  6.74938415,
       11.20750328, 12.10402081,  2.41687205,  3.02796949,  4.01081244,
        2.43845576,  0.73392845,  3.13813829,  3.8346574 ,  4.28264634,
        2.37046235,  2.26047314,  8.30894994,  1.86662922,  5.2813691 ])

In [95]:
#save score
np.savez('output/catboost_small.npz', val_target=val_tiny_y['target'], val_preds= pred_val_y)

In [97]:
#load score
val_scores = np.load('output/catboost_small.npz')
val_scores.files

['val_target', 'val_preds']

In [99]:
val_scores['val_preds'][:10]

array([0., 1., 1., 1., 0., 0., 1., 0., 0., 0.])

### Testing on Full Dataset (Old data)

In [6]:
# Full dataset
model_cb, pred_val_y = run_cb(train_X, train_y['target'], val_X, val_y['target'], cat_features,
                              iterations=2000, learning_rate=0.3)

0:	learn: 0.6284429	test: 0.6235821	best: 0.6235821 (0)	total: 9.48s	remaining: 5h 15m 52s
25:	learn: 0.5141280	test: 0.4905989	best: 0.4905989 (25)	total: 3m 27s	remaining: 4h 22m 15s
50:	learn: 0.5065182	test: 0.4815930	best: 0.4815930 (50)	total: 6m 47s	remaining: 4h 19m 36s
75:	learn: 0.5030026	test: 0.4775023	best: 0.4775023 (75)	total: 10m 8s	remaining: 4h 16m 38s
100:	learn: 0.5004828	test: 0.4747343	best: 0.4747343 (100)	total: 13m 33s	remaining: 4h 14m 56s
125:	learn: 0.4986242	test: 0.4727999	best: 0.4727999 (125)	total: 16m 54s	remaining: 4h 11m 26s
150:	learn: 0.4969893	test: 0.4710671	best: 0.4710671 (150)	total: 20m 27s	remaining: 4h 10m 32s
175:	learn: 0.4952983	test: 0.4692347	best: 0.4692347 (175)	total: 23m 54s	remaining: 4h 7m 49s
200:	learn: 0.4940872	test: 0.4679293	best: 0.4679293 (200)	total: 27m 15s	remaining: 4h 3m 57s
225:	learn: 0.4931417	test: 0.4669468	best: 0.4669468 (225)	total: 30m 38s	remaining: 4h 34s
250:	learn: 0.4920234	test: 0.4657479	best: 0.46574

In [7]:
#Analysis
print(classification_report(val_y['target'], pred_val_y>0.5))
print(roc_auc_score(val_y['target'],pred_val_y))

              precision    recall  f1-score   support

         0.0       0.78      0.79      0.78    732500
         1.0       0.79      0.78      0.78    742961

   micro avg       0.78      0.78      0.78   1475461
   macro avg       0.78      0.78      0.78   1475461
weighted avg       0.78      0.78      0.78   1475461

0.7817807913913396


In [8]:
pred_test_y = model_cb.predict(test_X)
print(classification_report(test_y['target'], pred_test_y>0.5))
print(roc_auc_score(test_y['target'],pred_test_y))

              precision    recall  f1-score   support

         0.0       0.78      0.79      0.78    732814
         1.0       0.79      0.78      0.78    742647

   micro avg       0.78      0.78      0.78   1475461
   macro avg       0.78      0.78      0.78   1475461
weighted avg       0.78      0.78      0.78   1475461

0.7822849047973764


In [9]:
#save results
feat_imp = model_cb.get_feature_importance()
np.savez('output/catboost_results.npz', val_target=val_y['target'], val_preds= pred_val_y,
         test_target=test_y['target'], test_preds= pred_test_y, feature_importance = feat_imp)

In [10]:
feat_imp

array([ 0.22045868, 12.08757098, 25.96270734,  8.28026767,  6.24937646,
        3.04467856, 17.37165761,  0.50718669,  2.4003231 ,  7.06447148,
        1.73862936,  2.03445932,  6.09314579,  0.29964326,  0.32316643,
        0.21332816,  0.32438211,  3.46237783,  0.8278253 ,  1.49434386])

In [11]:
pred_test_y

array([0., 1., 1., ..., 0., 0., 0.])

### Testing on Full Dataset (New data)

In [2]:
train_X = pd.read_csv('new_data/new_train_X.csv')
train_y = pd.read_csv('new_data/new_train_y.csv')

val_X = pd.read_csv('new_data/new_valid_X.csv')
val_y = pd.read_csv('new_data/new_valid_y.csv')

test_X = pd.read_csv('new_data/new_test_X.csv')
test_y = pd.read_csv('new_data/new_test_y.csv')

In [3]:
cluster_train_X = pd.read_csv('new_data/cluster_train_X.csv')
cluster_val_X = pd.read_csv('new_data/cluster_valid_X.csv')
cluster_test_X = pd.read_csv('new_data/cluster_test_X.csv')

In [52]:
train_X.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,time,registration_init_time_int,expiration_date_int
0,2971195,4410275,19953,290929,4,0,7,227631.0,548,7701,29509,11455,6,0,0,2,2,0.443948,5671,6482
1,543856,3016976,1574,234767,3,0,7,226069.0,371,2742,5696,26024,9,13,36,0,0,0.303696,4732,6532
2,3077911,1936206,5432,90102,18,7,7,246256.0,371,20609,20790,6777,9,2,22,1,0,0.194903,5519,6480
3,1160706,4269820,30083,206348,22,0,11,229041.0,364,43437,63250,26024,2,0,0,2,2,0.42981,5997,6319
4,1796776,526083,9703,161841,16,6,8,312450.0,371,44848,29258,9600,2,20,36,1,3,0.052957,4577,6486


In [57]:
cluster_train_X_reduced.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,song_length,registered_via,time,registration_init_time_int,expiration_date_int,song_cluster,ui_cluster
0,2971195,4410275,227631.0,2,0.443948,5671,6482,6,2
1,543856,3016976,226069.0,0,0.303696,4732,6532,12,2
2,3077911,1936206,246256.0,0,0.194903,5519,6480,12,2
3,1160706,4269820,229041.0,2,0.42981,5997,6319,5,2
4,1796776,526083,312450.0,3,0.052957,4577,6486,2,1


In [4]:
train_X_sample = train_X[:10000]
train_y_sample = train_y[:10000]
len(train_X_sample)

10000

In [14]:
def run_cb(train_X, train_y, val_X, val_y, cat_features, **kwards):
    print(kwargs)
    
    model_cb = cb.CatBoostClassifier(
        loss_function='Logloss',
        thread_count=30,
        random_seed=229,
        use_best_model = True,
        l2_leaf_reg = 2,
        **kwargs
    )
    
    eval_dataset = Pool(val_X,val_y,cat_features=cat_features)
    
    model = model_cb.fit(train_X, train_y, cat_features, eval_set=eval_dataset,
                         verbose_eval=25, early_stopping_rounds = 10)

    pred_val_y = model.predict(val_X)
    return model, pred_val_y

In [15]:
kwargs={'iterations': 2000, 'learning_rate': 0.1, 'depth': 6, 'grow_policy': 'Depthwise'}
model_cb, pred_val_y = run_cb(train_X_sample, train_y_sample['target'], val_X, val_y['target'], cat_features, **kwargs)

print("====== Validation ======")
print(classification_report(val_y['target'], pred_val_y>0.5))
print(roc_auc_score(val_y['target'],pred_val_y))

print("====== Test ======")
pred_test_y = model_cb.predict(test_X)
print(classification_report(test_y['target'], pred_test_y>0.5))
print(roc_auc_score(test_y['target'], pred_test_y))

{'iterations': 2000, 'learning_rate': 0.1, 'depth': 6, 'grow_policy': 'Depthwise'}


CatBoostError: catboost/libs/options/json_helper.h:223: Error: change of option grow_policy is unimplemented for task type CPU and was not default in previous run

In [16]:
# Full dataset
kwargs={'iterations': 2000, 'learning_rate': 0.3, 'depth': 6}
model_cb, pred_val_y = run_cb(train_X, train_y['target'], val_X, val_y['target'], cat_features, **kwargs)

{'iterations': 2000, 'learning_rate': 0.3, 'depth': 6}
0:	learn: 0.6274706	test: 0.6232051	best: 0.6232051 (0)	total: 1.3s	remaining: 43m 22s
25:	learn: 0.5092070	test: 0.4878547	best: 0.4878547 (25)	total: 22s	remaining: 27m 49s
50:	learn: 0.5012721	test: 0.4801034	best: 0.4801034 (50)	total: 42.1s	remaining: 26m 48s
75:	learn: 0.4970894	test: 0.4755828	best: 0.4755828 (75)	total: 1m 2s	remaining: 26m 17s
100:	learn: 0.4941539	test: 0.4726991	best: 0.4726991 (100)	total: 1m 22s	remaining: 25m 47s
125:	learn: 0.4915802	test: 0.4705364	best: 0.4705364 (125)	total: 1m 42s	remaining: 25m 17s
150:	learn: 0.4898084	test: 0.4691844	best: 0.4691844 (150)	total: 2m 1s	remaining: 24m 51s
175:	learn: 0.4884625	test: 0.4683720	best: 0.4683720 (175)	total: 2m 21s	remaining: 24m 23s
200:	learn: 0.4872431	test: 0.4678459	best: 0.4678459 (200)	total: 2m 40s	remaining: 23m 54s
225:	learn: 0.4852771	test: 0.4663719	best: 0.4663719 (225)	total: 2m 59s	remaining: 23m 32s
250:	learn: 0.4842119	test: 0.465

In [18]:
pred_val_y

array([1., 1., 1., ..., 1., 1., 1.])

In [17]:
#Analysis
print('====== Validation ======')
print(classification_report(val_y['target'], pred_val_y>0.5))
print(roc_auc_score(val_y['target'],pred_val_y))

print('====== Test ======')
pred_test_y = model_cb.predict(test_X)
print(classification_report(test_y['target'], pred_test_y>0.5))
print(roc_auc_score(test_y['target'],pred_test_y))

              precision    recall  f1-score   support

         0.0       0.77      0.79      0.78     45023
         1.0       0.79      0.77      0.78     46757

   micro avg       0.78      0.78      0.78     91780
   macro avg       0.78      0.78      0.78     91780
weighted avg       0.78      0.78      0.78     91780

0.7791967416657036
              precision    recall  f1-score   support

         0.0       0.77      0.78      0.78     45095
         1.0       0.79      0.77      0.78     46685

   micro avg       0.78      0.78      0.78     91780
   macro avg       0.78      0.78      0.78     91780
weighted avg       0.78      0.78      0.78     91780

0.7785385613224058


              precision    recall  f1-score   support

         0.0       0.71      0.72      0.72     45095
         1.0       0.73      0.72      0.72     46685

   micro avg       0.72      0.72      0.72     91780
   macro avg       0.72      0.72      0.72     91780
weighted avg       0.72      0.72      0.72     91780

0.7198437549336987


In [30]:
#save results
feat_imp = model_cb.get_feature_importance()
np.savez('output/catboost_results_new.npz', val_target=val_y['target'], val_preds= pred_val_y,
         test_target=test_y['target'], test_preds= pred_test_y, feature_importance = feat_imp)

In [18]:
feat_imp

array([ 0.51052487,  6.5144231 , 22.14314636, 12.19364801,  6.44038638,
        5.99666057, 16.63497658,  1.05133631,  2.51083374,  5.03806821,
        0.57331622,  2.85103998,  4.1725927 ,  0.53724677,  0.57618427,
        0.39445002,  0.51814337,  7.19154633,  1.47933179,  2.67214442])

In [19]:
pred_test_y

array([1., 1., 1., ..., 0., 1., 1.])