In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

import lightgbm as lgb
import xgboost as xgb
import sklearn.datasets
import gc

### Data Prep.

In [2]:
# # train_X = pd.read_csv('music_data/train_X.csv')
# train_X = pd.read_csv('music_data/train_X_date_converted.csv')
# train_y = pd.read_csv('music_data/train_Y.csv')

# # val_X = pd.read_csv('music_data/valid_X.csv')
# val_X = pd.read_csv('music_data/valid_X_date_converted.csv')
# val_y = pd.read_csv('music_data/valid_Y.csv')

# # test_X = pd.read_csv('music_data/test_X.csv')
# test_X = pd.read_csv('music_data/test_X_date_converted.csv')
# test_y = pd.read_csv('music_data/test_Y.csv')

# # train_tiny_X = pd.read_csv('music_data/train_tiny_X.csv')
# train_tiny_X = pd.read_csv('music_data/train_tiny_X_date_converted.csv')
# train_tiny_y = pd.read_csv('music_data/train_tiny_Y.csv')

# # val_tiny_X = val_X[:1000].copy()
# val_tiny_X = pd.read_csv('music_data/val_tiny_X_date_converted.csv')
# val_tiny_y = val_y[:1000].copy()

new_train_X = pd.read_csv('new_data/new_train_X.csv')
new_train_y = pd.read_csv('new_data/new_train_y.csv')

new_val_X = pd.read_csv('new_data/new_valid_X.csv')
new_val_y = pd.read_csv('new_data/new_valid_y.csv')

new_test_X = pd.read_csv('new_data/new_test_X.csv')
new_test_y = pd.read_csv('new_data/new_test_y.csv')

In [7]:
# Observing reasonable date for date time conversion
# 2004 seems to be the earliest date. '1970-01-01' is essentially null value (1 in train).
print('train x', train_X['registration_init_time'].min(), train_X['expiration_date'].min())
print('val x', val_X['registration_init_time'].min(), val_X['expiration_date'].min())
print('test x', test_X['registration_init_time'].min(), test_X['expiration_date'].min())


# Not many records smaller than '2000-01-01'. Reasonable to not treat them specially
print(train_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in val set')

print(train_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in val set')

KeyError: 'registration_init_time'

In [None]:
def date_to_int(column, base_date=datetime.strptime('2000-01-01', "%Y-%m-%d")):
    """
    Convert date to day counts since base_date for given columns.
    
    :param column: pandas column containing date representations as str value
    :param base_date: base date from which date is counted. i.e. 2000-01-02 will be day '1' comparing to '2000-01-01'
    """
    
    def date_diff(date):
        date = datetime.strptime(date, "%Y-%m-%d")
        return (date - base_date).days
    
    return column.apply(date_diff)

def convert_date_columns_to_int(dataframes, cols=['registration_init_time', 'expiration_date'], saving=True):
    count = 0
    total = len(dataframes)
    for df_name, df in dataframes.items():
        count += 1
        print("Starting {} ...".format(df_name))
        for col in cols:
            df[col+'_int'] = date_to_int(df[col])
        df = df.drop(columns=cols)
        dataframes[df_name] = df
        
        if saving:
            print("Saving {} ...".format(df_name))
            df.to_csv('music_data/'+ df_name + '_date_converted.csv')
        
        print("====== Done {} / {} ======".format(count, total))
    return dataframes


In [None]:
# Converting all data
converted_dfs = convert_date_columns_to_int({
    'train_X': train_X,
    'valid_X': val_X,
    'test_X': test_X,
    'train_tiny_X': train_tiny_X,
    'val_tiny_X': val_tiny_X,
})
train_X = converted_dfs['train_X']
val_X = converted_dfs['valid_X']
test_X = converted_dfs['test_X']
train_tiny_X = converted_dfs['train_tiny_X']
val_tiny_X = converted_dfs['val_tiny_X']

In [None]:
train_X.head()

## Gradient Boosting

### LightGBM

In [37]:
def run_lgbc(train_X, train_y, val_X, val_y):
    params = {
        "objective" : "binary",
        "metric" : "binary_logloss", 
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)

    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return model, pred_val_y

#### Testing on tiny dataset

In [38]:
modelC, pred_val_y = run_lgbc(train_tiny_X, train_tiny_y['target'], val_tiny_X, val_tiny_y['target'])
print(classification_report(val_tiny_y['target'], modelC.predict(val_tiny_X)>0.5))
print(roc_auc_score(val_tiny_y['target'], modelC.predict(val_tiny_X)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.651618
Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.639451
              precision    recall  f1-score   support

         0.0       0.58      0.69      0.63       477
         1.0       0.66      0.55      0.60       523

   micro avg       0.62      0.62      0.62      1000
   macro avg       0.62      0.62      0.62      1000
weighted avg       0.62      0.62      0.61      1000

0.6819830761892165


#### Running large data

In [39]:
modelC, pred_val_y = run_lgbc(train_X, train_y['target'], val_X, val_y['target'])
print(classification_report(val_y['target'], modelC.predict(val_X)>0.5))
print(roc_auc_score(val_y['target'], modelC.predict(val_X)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.6224
[200]	valid_0's binary_logloss: 0.616636
[300]	valid_0's binary_logloss: 0.612586
[400]	valid_0's binary_logloss: 0.609499
[500]	valid_0's binary_logloss: 0.606984
[600]	valid_0's binary_logloss: 0.604466
[700]	valid_0's binary_logloss: 0.602129
[800]	valid_0's binary_logloss: 0.60016
[900]	valid_0's binary_logloss: 0.598241
[1000]	valid_0's binary_logloss: 0.596447
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.596447
              precision    recall  f1-score   support

         0.0       0.67      0.70      0.68    732500
         1.0       0.69      0.66      0.67    742961

   micro avg       0.68      0.68      0.68   1475461
   macro avg       0.68      0.68      0.68   1475461
weighted avg       0.68      0.68      0.68   1475461

0.7449332780580912


In [40]:
# val_y_predC = modelC.predict(val_X)
# val_y_predC[val_y_predC>0.5]=1
# val_y_predC[val_y_predC<0.5]=0
# print("model score: %.3f" % accuracy_score(val_y['target'],  val_y_predC))

model score: 0.679


# XGBoost

In [3]:
def run_xgb(X_train, y_train, X_val, y_val):
    params = {'objective': 'binary:logistic',
              #'eval_metric':"binary_logloss",
              'eta': 0.03,
              'max_depth': 16,
              'subsample': 0.6,
              'colsample_bytree': 0.6,
              'alpha':0.001,
              'random_state': 42,
              'silent': True}

    xgb_train_data = xgb.DMatrix(X_train.values, y_train.values)
    xgb_val_data = xgb.DMatrix(X_val.values, y_val.values)
#     xgb_test_data = xgb.DMatrix(X_test.values, y_test.values)
    #xgb_submit_data = xgb.DMatrix(X_test)
    model = xgb.train(params, xgb_train_data, 
                      num_boost_round=2000, 
                      evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                      early_stopping_rounds=100, 
                      verbose_eval=10
                     )

    y_pred_train = model.predict(xgb_train_data, ntree_limit=model.best_ntree_limit)
    y_pred_val = model.predict(xgb_val_data, ntree_limit=model.best_ntree_limit)
#     y_pred_test = model.predict(xgb_test_data, ntree_limit=model.best_ntree_limit)
    #y_pred_submit = model.predict(xgb_submit_data, ntree_limit=model.best_ntree_limit)

    return model, y_pred_val

In [42]:
#tiny data debugging
#starttime = time.time()
xgb_model, xgb_preds_val = run_xgb(train_tiny_X,train_tiny_y['target'], val_tiny_X, val_tiny_y['target'])
#print('time: {}'.format(time.time()-starttime))
#print(np.sqrt(metrics.mean_squared_error(val_tiny_y['target'], xgb_preds_val)))

[0]	train-error:0.3236	valid-error:0.436
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[100]	train-error:0.2271	valid-error:0.365
[200]	train-error:0.222	valid-error:0.37
Stopping. Best iteration:
[111]	train-error:0.2241	valid-error:0.361



In [43]:
print(classification_report(val_tiny_y['target'], xgb_preds_val>0.5))
print(roc_auc_score(val_tiny_y['target'],  xgb_preds_val))

              precision    recall  f1-score   support

         0.0       0.60      0.71      0.65       477
         1.0       0.69      0.57      0.62       523

   micro avg       0.64      0.64      0.64      1000
   macro avg       0.64      0.64      0.64      1000
weighted avg       0.65      0.64      0.64      1000

0.6828288658802026


In [45]:
#full data
xgb_model, xgb_preds_val = run_xgb(train_X,train_y['target'], val_X, val_y['target'])

[0]	train-error:0.360643	valid-error:0.362191
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[10]	train-error:0.355012	valid-error:0.356888
[20]	train-error:0.351621	valid-error:0.353356
[30]	train-error:0.352364	valid-error:0.354116
[40]	train-error:0.352319	valid-error:0.354123
[50]	train-error:0.352876	valid-error:0.354636
[60]	train-error:0.353116	valid-error:0.354815
[70]	train-error:0.353022	valid-error:0.354775
[80]	train-error:0.353339	valid-error:0.354946
[90]	train-error:0.353264	valid-error:0.354866
[100]	train-error:0.353177	valid-error:0.354818
[110]	train-error:0.353122	valid-error:0.354846
[120]	train-error:0.35286	valid-error:0.354529
Stopping. Best iteration:
[20]	train-error:0.351621	valid-error:0.353356



In [46]:
print(classification_report(val_y['target'], xgb_preds_val>0.5))
print(roc_auc_score(val_y['target'],  xgb_preds_val))

              precision    recall  f1-score   support

         0.0       0.64      0.67      0.65    732500
         1.0       0.66      0.62      0.64    742961

   micro avg       0.65      0.65      0.65   1475461
   macro avg       0.65      0.65      0.65   1475461
weighted avg       0.65      0.65      0.65   1475461

0.7040720831004901


In [4]:
new_xgb_model, new_xgb_preds_val = run_xgb(new_train_X,new_train_y['target'], new_val_X, new_val_y['target'])

[0]	train-error:0.292847	valid-error:0.329908
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[10]	train-error:0.22277	valid-error:0.269601
[20]	train-error:0.206824	valid-error:0.259991
[30]	train-error:0.19615	valid-error:0.255884
[40]	train-error:0.187904	valid-error:0.252114
[50]	train-error:0.181421	valid-error:0.248682
[60]	train-error:0.174808	valid-error:0.245511
[70]	train-error:0.168357	valid-error:0.243593
[80]	train-error:0.163797	valid-error:0.242046
[90]	train-error:0.159169	valid-error:0.240652
[100]	train-error:0.154767	valid-error:0.239322
[110]	train-error:0.150585	valid-error:0.238211
[120]	train-error:0.146319	valid-error:0.237426
[130]	train-error:0.143174	valid-error:0.236424
[140]	train-error:0.139526	valid-error:0.235302
[150]	train-error:0.136446	valid-error:0.233482
[160]	train-error:0.13319	valid-error:0.232752
[170]	train-error:0.129529	valid-error:0.231663
[1

In [5]:
print(classification_report(new_val_y['target'], new_xgb_preds_val>0.5))
print(roc_auc_score(new_val_y['target'],new_xgb_preds_val))

              precision    recall  f1-score   support

         0.0       0.77      0.79      0.78     45023
         1.0       0.79      0.77      0.78     46757

   micro avg       0.78      0.78      0.78     91780
   macro avg       0.78      0.78      0.78     91780
weighted avg       0.78      0.78      0.78     91780

0.859816533397021


In [6]:
feature_imp = new_xgb_model.get_score(importance_type='gain')

In [7]:
xgb_test_data = xgb.DMatrix(new_test_X.values, new_test_y.values)
new_xgb_preds_test = new_xgb_model.predict(xgb_test_data, ntree_limit=new_xgb_model.best_ntree_limit)

In [8]:
np.savez('output/new_xgb_preds_results.npz', val_target=new_val_y['target'], val_preds= new_xgb_preds_val, test_target = new_test_y['target'], test_preds=new_xgb_preds_test,
        feature_importance = feature_imp)

## Tree

### Decision Tree

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
pipelines = {
    "dtclass": make_pipeline(DecisionTreeClassifier(random_state=0))
}
decisiontree_hyperparameters = {
    'decisiontreeclassifier__max_depth' : np.arange(3, 20),
    'decisiontreeclassifier__max_features' : np.arange(3, 20),
    'decisiontreeclassifier__min_samples_split' : np.arange(5, 25),
    "decisiontreeclassifier__min_samples_leaf" : np.arange(1,3)
}
dtclass_model = RandomizedSearchCV(pipelines['dtclass'],decisiontree_hyperparameters,n_iter = 100,cv=3, scoring = 'roc_auc', verbose=100)
dtclass_model.fit(new_train_X, new_train_y['target'])
print(dtclass_model.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] decisiontreeclassifier__min_samples_split=14, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=18, decisiontreeclassifier__max_depth=17 
[CV]  decisiontreeclassifier__min_samples_split=14, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=18, decisiontreeclassifier__max_depth=17, score=0.7529322083224497, total=   7.3s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.6s remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=14, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=18, decisiontreeclassifier__max_depth=17 
[CV]  decisiontreeclassifier__min_samples_split=14, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=18, decisiontreeclassifier__max_depth=17, score=0.7510674185848192, total=   7.2

[CV]  decisiontreeclassifier__min_samples_split=20, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=7, decisiontreeclassifier__max_depth=11, score=0.7135378188512922, total=   2.5s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  1.2min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=19, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=19 
[CV]  decisiontreeclassifier__min_samples_split=19, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=19, score=0.7569004575802006, total=   7.4s
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:  1.3min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=19, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=19 
[CV]  decisiontreeclassifier__min_samples_split=19, decisiontre

[CV]  decisiontreeclassifier__min_samples_split=13, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=7, decisiontreeclassifier__max_depth=14, score=0.736122111877116, total=   2.8s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  2.2min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=10, decisiontreeclassifier__max_depth=9 
[CV]  decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=10, decisiontreeclassifier__max_depth=9, score=0.7070114195168886, total=   2.7s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:  2.2min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=10, decisiontreeclassifier__max_depth=9 
[CV]  decisiontreeclassifier__min_samples_split=11, decisiontreecla

[CV]  decisiontreeclassifier__min_samples_split=14, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=13, decisiontreeclassifier__max_depth=8, score=0.7025779495910519, total=   3.0s
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  3.2min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=8, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=11 
[CV]  decisiontreeclassifier__min_samples_split=8, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=11, score=0.7165244658423766, total=   2.5s
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:  3.3min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=8, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=11 
[CV]  decisiontreeclassifier__min_samples_split=8, decisiontreeclassi

[CV]  decisiontreeclassifier__min_samples_split=9, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=15, decisiontreeclassifier__max_depth=14, score=0.7435119476427211, total=   5.5s
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  4.5min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=19, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=13, decisiontreeclassifier__max_depth=8 
[CV]  decisiontreeclassifier__min_samples_split=19, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=13, decisiontreeclassifier__max_depth=8, score=0.7016295402196397, total=   2.9s
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed:  4.6min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=19, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=13, decisiontreeclassifier__max_depth=8 
[CV]  decisiontreeclassifier__min_samples_split=19, decisiontreecl

[CV]  decisiontreeclassifier__min_samples_split=19, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=12, decisiontreeclassifier__max_depth=15, score=0.7438120579001724, total=   4.7s
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  5.5min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=5, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=12, decisiontreeclassifier__max_depth=9 
[CV]  decisiontreeclassifier__min_samples_split=5, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=12, decisiontreeclassifier__max_depth=9, score=0.7049455828231337, total=   3.2s
[Parallel(n_jobs=1)]: Done  91 out of  91 | elapsed:  5.5min remaining:    0.0s
[CV] decisiontreeclassifier__min_samples_split=5, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=12, decisiontreeclassifier__max_depth=9 
[CV]  decisiontreeclassifier__min_samples_split=5, decisiontreeclass

[CV]  decisiontreeclassifier__min_samples_split=17, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=4, decisiontreeclassifier__max_depth=19, score=0.7248383371549897, total=   2.3s
[CV] decisiontreeclassifier__min_samples_split=17, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=4, decisiontreeclassifier__max_depth=19 
[CV]  decisiontreeclassifier__min_samples_split=17, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=4, decisiontreeclassifier__max_depth=19, score=0.7311703818714288, total=   2.5s
[CV] decisiontreeclassifier__min_samples_split=20, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=10 
[CV]  decisiontreeclassifier__min_samples_split=20, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=10, score=0.7107699581942613, total=   2.7s
[CV] decisio

[CV]  decisiontreeclassifier__min_samples_split=20, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=7, decisiontreeclassifier__max_depth=17, score=0.7437957360075722, total=   3.5s
[CV] decisiontreeclassifier__min_samples_split=21, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=5 
[CV]  decisiontreeclassifier__min_samples_split=21, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=5, score=0.681568703822739, total=   3.0s
[CV] decisiontreeclassifier__min_samples_split=21, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=5 
[CV]  decisiontreeclassifier__min_samples_split=21, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=5, score=0.6818841489047175, total=   3.1s
[CV] decision

[CV]  decisiontreeclassifier__min_samples_split=13, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=3, decisiontreeclassifier__max_depth=11, score=0.6985935313341252, total=   1.5s
[CV] decisiontreeclassifier__min_samples_split=13, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=3, decisiontreeclassifier__max_depth=11 
[CV]  decisiontreeclassifier__min_samples_split=13, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=3, decisiontreeclassifier__max_depth=11, score=0.7011083554518639, total=   1.4s
[CV] decisiontreeclassifier__min_samples_split=13, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=3, decisiontreeclassifier__max_depth=11 
[CV]  decisiontreeclassifier__min_samples_split=13, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=3, decisiontreeclassifier__max_depth=11, score=0.7008471074372216, total=   1.2s
[CV] decisio

[CV]  decisiontreeclassifier__min_samples_split=16, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=4, decisiontreeclassifier__max_depth=12, score=0.7112990112436036, total=   1.7s
[CV] decisiontreeclassifier__min_samples_split=16, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=4, decisiontreeclassifier__max_depth=12 
[CV]  decisiontreeclassifier__min_samples_split=16, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=4, decisiontreeclassifier__max_depth=12, score=0.7118474520104308, total=   1.7s
[CV] decisiontreeclassifier__min_samples_split=5, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=14, decisiontreeclassifier__max_depth=4 
[CV]  decisiontreeclassifier__min_samples_split=5, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=14, decisiontreeclassifier__max_depth=4, score=0.6751908697522822, total=   1.9s
[CV] decisiont

[CV]  decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=15, decisiontreeclassifier__max_depth=7, score=0.6948125026574659, total=   2.9s
[CV] decisiontreeclassifier__min_samples_split=8, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=11, decisiontreeclassifier__max_depth=17 
[CV]  decisiontreeclassifier__min_samples_split=8, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=11, decisiontreeclassifier__max_depth=17, score=0.7484025374383199, total=   4.7s
[CV] decisiontreeclassifier__min_samples_split=8, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=11, decisiontreeclassifier__max_depth=17 
[CV]  decisiontreeclassifier__min_samples_split=8, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=11, decisiontreeclassifier__max_depth=17, score=0.7452855940177872, total=   4.6s
[CV] decisio

[CV]  decisiontreeclassifier__min_samples_split=23, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=5, decisiontreeclassifier__max_depth=11, score=0.707843483820549, total=   2.1s
[CV] decisiontreeclassifier__min_samples_split=23, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=5, decisiontreeclassifier__max_depth=11 
[CV]  decisiontreeclassifier__min_samples_split=23, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=5, decisiontreeclassifier__max_depth=11, score=0.705990013480799, total=   1.8s
[CV] decisiontreeclassifier__min_samples_split=23, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=5, decisiontreeclassifier__max_depth=11 
[CV]  decisiontreeclassifier__min_samples_split=23, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=5, decisiontreeclassifier__max_depth=11, score=0.7086067971466442, total=   1.9s
[CV] decisiont

[CV]  decisiontreeclassifier__min_samples_split=24, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=16, decisiontreeclassifier__max_depth=11, score=0.7210925746677963, total=   4.9s
[CV] decisiontreeclassifier__min_samples_split=24, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=16, decisiontreeclassifier__max_depth=11 
[CV]  decisiontreeclassifier__min_samples_split=24, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=16, decisiontreeclassifier__max_depth=11, score=0.7222654297456339, total=   4.9s
[CV] decisiontreeclassifier__min_samples_split=24, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=9, decisiontreeclassifier__max_depth=10 
[CV]  decisiontreeclassifier__min_samples_split=24, decisiontreeclassifier__min_samples_leaf=1, decisiontreeclassifier__max_features=9, decisiontreeclassifier__max_depth=10, score=0.7104098343879754, total=   2.7s
[CV] deci

[CV]  decisiontreeclassifier__min_samples_split=24, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=9, score=0.7063175498497913, total=   2.5s
[CV] decisiontreeclassifier__min_samples_split=16, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=17 
[CV]  decisiontreeclassifier__min_samples_split=16, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=17, score=0.7547055861988495, total=   7.2s
[CV] decisiontreeclassifier__min_samples_split=16, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=17 
[CV]  decisiontreeclassifier__min_samples_split=16, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=17, decisiontreeclassifier__max_depth=17, score=0.7528810428793719, total=   7.2s
[CV] deci

[CV]  decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=4, score=0.6726895156343011, total=   1.1s
[CV] decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=4 
[CV]  decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=4, score=0.6728363838703004, total=   1.1s
[CV] decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=4 
[CV]  decisiontreeclassifier__min_samples_split=11, decisiontreeclassifier__min_samples_leaf=2, decisiontreeclassifier__max_features=8, decisiontreeclassifier__max_depth=4, score=0.673681179475818, total=   1.1s
[CV] decisiontreec

In [5]:
dt = DecisionTreeClassifier(min_samples_split=22, min_samples_leaf=1, max_features=12, max_depth=18)
dt.fit(new_train_X, new_train_y['target'])
print(classification_report(new_val_y['target'], dt.predict(new_val_X)>0.5))
print(roc_auc_score(new_val_y['target'],dt.predict(new_val_X)))

              precision    recall  f1-score   support

         0.0       0.70      0.70      0.70     45023
         1.0       0.71      0.71      0.71     46757

   micro avg       0.71      0.71      0.71     91780
   macro avg       0.71      0.71      0.71     91780
weighted avg       0.71      0.71      0.71     91780

0.7055808475950635


In [6]:
feature_imp = dt.tree_.compute_feature_importances(normalize=True)
feature_imp

array([0.01975149, 0.09631412, 0.09142515, 0.02160081, 0.01617742,
       0.01376882, 0.20059003, 0.02373978, 0.02188981, 0.0379878 ,
       0.02613674, 0.01860469, 0.01862873, 0.02964512, 0.0427191 ,
       0.01062292, 0.01258822, 0.13514117, 0.08356792, 0.07910017])

In [7]:
np.savez('output/dt_new_results.npz', val_target=new_val_y['target'], val_preds= dt.predict(new_val_X), test_target = new_test_y['target'], test_preds=dt.predict(new_test_X), feature_importance=feature_imp)

### Random Forest

In [3]:
train_X_converted = pd.read_csv('music_data/train_X_date_converted.csv')
train_X = pd.read_csv('music_data/train_X.csv')

In [4]:
train_X_converted.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,time,registration_init_time_int,expiration_date_int
0,0,2942719,8145,253733,16,6,8,267517.0,371,4252,8389,2681,9,0,0,2,1,0.296221,6193,6196
1,1,4875524,5224,145235,16,6,8,200620.0,371,34892,74276,26024,6,3,41,1,2,0.490781,5000,6475
2,2,6589819,5474,22231,11,0,7,213342.0,371,20609,27775,9110,9,0,0,2,1,0.663346,6150,6436
3,3,1172060,23177,70181,8,3,3,262246.0,371,44425,83027,34734,2,0,0,2,0,0.117982,5181,6498
4,4,2069395,3269,128141,12,2,2,310753.0,371,42400,81151,32836,2,0,0,2,2,0.20831,4393,6275


In [5]:
train_X.head()

Unnamed: 0.1,Unnamed: 0,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,registration_init_time,expiration_date,time
0,2942719,8145,253733,16,6,8,267517.0,371,4252,8389,2681,9,0,0,2,1,2016-12-15,2016-12-18,0.296221
1,4875524,5224,145235,16,6,8,200620.0,371,34892,74276,26024,6,3,41,1,2,2013-09-09,2017-09-23,0.490781
2,6589819,5474,22231,11,0,7,213342.0,371,20609,27775,9110,9,0,0,2,1,2016-11-02,2017-08-15,0.663346
3,1172060,23177,70181,8,3,3,262246.0,371,44425,83027,34734,2,0,0,2,0,2014-03-09,2017-10-16,0.117982
4,2069395,3269,128141,12,2,2,310753.0,371,42400,81151,32836,2,0,0,2,2,2012-01-11,2017-03-07,0.20831


In [6]:
train_X.count()

Unnamed: 0                4426382
msno                      4426382
song_id                   4426382
source_screen_name        4426382
source_system_tab         4426382
source_type               4426382
song_length               4426382
genre_ids                 4426382
artist_name               4426382
composer                  4426382
lyricist                  4426382
language                  4426382
city                      4426382
bd                        4426382
gender                    4426382
registered_via            4426382
registration_init_time    4426382
expiration_date           4426382
time                      4426382
dtype: int64