In [None]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from scipy.special import expit, logit
from sklearn.metrics import roc_auc_score
from datetime import datetime as dt

# Bounds used to keep probabilities strictly inside (0, 1) before taking logits.
almost_zero = 1e-10
almost_one = 1 - almost_zero

# Run timestamps embedded in the file names, one entry per model index (0..3).
_dart_fit_stamps = ['2018_0501_0125_11', '2018_0501_0315_38',
                    '2018_0501_0518_48', '2018_0501_0715_06']
_xgdt_fit_stamps = ['2018_0430_1519_33', '2018_0430_1656_17',
                    '2018_0430_1844_31', '2018_0430_2026_42']
_dart_test_stamps = ['2018_0501_0148_24', '2018_0501_0339_24',
                     '2018_0501_0544_49', '2018_0501_0739_43']
_it_test_stamps = ['2018_0430_1527_02', '2018_0430_1703_36',
                   '2018_0430_1854_57', '2018_0430_2035_07']

# Each table below is indexed as [source][model index]:
# row 0 holds the dart files, row 1 the xgdt ("it" for the test set) files.
traininfo = [
    [f'./sub_dart_train_BayesOpt_{stamp}.csv' for stamp in _dart_fit_stamps],
    [f'./sub_xgdt_train_BayesOpt_{stamp}.csv' for stamp in _xgdt_fit_stamps],
]

validinfo = [
    [f'./sub_dart_valid_BayesOpt_{stamp}.csv' for stamp in _dart_fit_stamps],
    [f'./sub_xgdt_valid_BayesOpt_{stamp}.csv' for stamp in _xgdt_fit_stamps],
]

testinfo = [
    [f'./sub_dart_BayesOpt_{stamp}.csv' for stamp in _dart_test_stamps],
    [f'./sub_it_BayesOpt_{stamp}.csv' for stamp in _it_test_stamps],
]

# Ground-truth label files: row 0 is read as the training labels and row 1 as
# the validation labels. Both rows reuse the dart run timestamps.
train_trueinfo = [
    [f'./rawdata_train_From2400_{stamp}.csv' for stamp in _dart_fit_stamps],
    [f'./rawdata_valid_From2400_{stamp}.csv' for stamp in _dart_fit_stamps],
]

# dtype mapping passed to pd.read_csv when loading the label column.
dtypes = {'is_attributed': 'uint8'}

validlen = 2500000
# Fit one logistic-regression stacker per model index on the logit-transformed
# dart and xgdt predictions, printing the validation ROC AUC of each.
models = []
starttime = time.time()

# (feature column, row index into traininfo/validinfo, progress message)
base_models = (
    ('is_attributed_dart_1', 0, 'Read dart finished'),
    ('is_attributed_xgdt_1', 1, 'Read xgdt finished'),
)

for i in range(4):
    print('Start fitting ' + str(i) + '-th model')
    data, valid = pd.DataFrame(), pd.DataFrame()

    # One feature column per base model, for both the fit and validation sets.
    for column, row, message in base_models:
        data[column] = pd.read_csv(traininfo[row][i], usecols=['is_attributed'])
        valid[column] = pd.read_csv(validinfo[row][i], usecols=['is_attributed'])
        print(f'[{time.time() - starttime: .2f}]: {message}')

    # Clip before logit so that probabilities of exactly 0 or 1 do not
    # become infinite.
    data = data.clip(almost_zero, almost_one).apply(logit)
    valid = valid.clip(almost_zero, almost_one).apply(logit)
    gc.collect()
    print(f'[{time.time() - starttime: .2f}]: Split train finished')

    # True labels for the fit and validation rows.
    y = pd.read_csv(train_trueinfo[0][i], usecols=['is_attributed'], dtype=dtypes)
    yvalid = pd.read_csv(train_trueinfo[1][i], usecols=['is_attributed'], dtype=dtypes)
    print(f'[{time.time() - starttime: .2f}]: Read and Split y finished')

    # Fit the stacker and score it on the validation set.
    clf = LogisticRegression(class_weight='balanced', solver='sag', C=4, n_jobs=4)
    clf.fit(data, y.values.ravel())
    print(f'[{time.time() - starttime: .2f}]: Fit finished')
    valid_scores = clf.predict_proba(valid)[:, 1]
    auc = roc_auc_score(yvalid.values.ravel(), valid_scores)
    print('RocAuc Score: ' + str(auc))

    # The fitted model is only kept in the in-memory list; nothing is
    # written to disk here.
    models.append(clf)
    del data, valid, y, yvalid, valid_scores
    gc.collect()
    print(f'[{time.time() - starttime: .2f}]: Model saved')

# Score the test set with each fitted stacker and write one submission file
# per model. The per-model probabilities are kept in `preds`, one column
# 'is_attributed_<i>' each.
preds = pd.DataFrame()

# The click ids are the same for every model, so read them once up front.
click_ids = pd.read_csv('../input/test.csv', usecols=['click_id'])

for i in range(4):
    print('Start submission ' + str(i) + '-th model')
    test = pd.DataFrame()

    # dart
    base_pred = pd.read_csv(testinfo[0][i], usecols=['is_attributed'])
    test['is_attributed_dart_1'] = base_pred['is_attributed']
    del base_pred; gc.collect()
    print(f'[{time.time() - starttime: .2f}]: Read dart-test finished')

    # xgdt: must come from testinfo. The original read traininfo[1][i], i.e.
    # the training-set predictions, which are not the test-set feature.
    base_pred = pd.read_csv(testinfo[1][i], usecols=['is_attributed'])
    test['is_attributed_xgdt_1'] = base_pred['is_attributed']
    del base_pred; gc.collect()
    print(f'[{time.time() - starttime: .2f}]: Read xgdt-test finished')

    # Same clip + logit transform that is applied to the features at fit time.
    test = test.clip(almost_zero, almost_one).apply(logit)
    gc.collect()
    preds['is_attributed_' + str(i)] = models[i].predict_proba(test)[:, 1]
    del test; gc.collect()
    print(f'[{time.time() - starttime: .2f}]: Predict finished')

    # Write this model's own submission file.
    sub = click_ids.copy()
    sub['is_attributed'] = preds['is_attributed_' + str(i)]
    stamp = dt.now().strftime('_%Y_%m%d_%H%M_%S')
    fname = '../sub/sub_Logistic_DartXGDT_Simple' + stamp + '.csv'
    sub.to_csv(fname, index=False)
    del sub; gc.collect()

del click_ids; gc.collect()
    
# Blend the per-model test predictions with a weighted geometric mean,
# exp(sum_k coef[k] * log(p_k)), and write the final submission file.
print(f'[{time.time() - starttime: .2f}]: Start making submission file')
sub = pd.read_csv('../input/test.csv', usecols=['click_id'])
coef = [0.25, 0.25, 0.25, 0.25]

# Weighted sum of log-probabilities, accumulated in model order.
weighted_logs = sum(
    weight * np.log(preds['is_attributed_' + str(k)])
    for k, weight in enumerate(coef)
)
sub['is_attributed'] = np.exp(weighted_logs)
del weighted_logs

datetime = dt.now().strftime('_%Y_%m%d_%H%M_%S')
fname = '../sub/sub_Logistic_DartXGDT_' + datetime + '.csv'
sub.to_csv(fname, index=False)

del sub, preds
gc.collect()
print(f'[{time.time() - starttime: .2f}]: Subs created')



Start fitting 0-th model
[ 14.33]: Read dart finished
[ 23.78]: Read xgdt finished
[ 26.14]: Split train finished
[ 71.92]: Read and Split y finished
