In [None]:
from __future__ import division
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from time import time
from catboost import CatBoostClassifier
import gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin
from sklearn.ensemble import RandomForestClassifier

print('loading files...')
start = time()

# Read the train/test CSVs; -1 entries are parsed as NaN on load.
train, test = (
    pd.read_csv('D:/Driver/train.csv', na_values=-1),
    pd.read_csv('D:/Driver/test.csv', na_values=-1),
)
print(train.shape, test.shape)

# Remove every column whose name starts with 'ps_calc_' from both frames.
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
train, test = train.drop(unwanted, axis=1), test.drop(unwanted, axis=1)

# Put the -1 sentinel back in place of the NaNs created while reading.
train = train.fillna(-1)
test = test.fillna(-1)
# custom evaluation metric (Gini coefficient, closely related to AUC), passed to the boosters via `feval`
def gini(y, pred):
    """Return the unnormalized Gini coefficient of `pred` with respect to `y`.

    Rows are ordered by descending prediction (ties keep their original
    order), the labels are accumulated along that ordering, and the result is
    centred and scaled by the number of rows.  Divide by ``gini(y, y)`` to get
    the normalized score.

    Parameters
    ----------
    y : array-like
        True labels / target values.
    pred : array-like
        Predicted scores, same length as `y`.

    Returns
    -------
    float
        Unnormalized Gini value (NaN when the labels sum to zero).
    """
    # Builtin `float` replaces `np.float`, which was only an alias for it and
    # was removed in NumPy 1.24.
    g = np.asarray(np.c_[y, pred, np.arange(len(y))], dtype=float)
    # lexsort uses the LAST key as primary: sort by -pred, then by row index.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    """Evaluation callback returning the normalized Gini as ``('gini', value)``.

    `pred` holds the predictions and `y` is an object exposing
    ``get_label()`` (an ``xgb.DMatrix`` where this is used as `feval`).
    """
    labels = y.get_label()
    # Normalize by the score a perfect ranking (labels against themselves) gets.
    normalized = gini(labels, pred) / gini(labels, labels)
    return 'gini', normalized

def gini_lgb(preds, dtrain):
    """Evaluation callback returning ``('gini', normalized_gini, True)``.

    `dtrain` must expose ``get_label()``; the labels are copied into a list
    before scoring.  The trailing ``True`` is presumably LightGBM's
    "higher is better" flag -- confirm against the LightGBM feval contract.
    """
    labels = list(dtrain.get_label())
    perfect = gini(labels, labels)
    return 'gini', gini(labels, preds) / perfect, True


# Model inputs: every column except the row id and the label.
feature_frame = train.drop(['id', 'target'], axis=1)
features = feature_frame.columns
X = feature_frame.values
y = train['target'].values

# Submission frame: test ids plus zero-initialised accumulator columns that
# the per-fold predictions are added into.
sub = test['id'].to_frame()
for col in ('target', 'xgb', 'lgb'):
    sub[col] = 0

nrounds = 2000  # upper bound on boosting rounds handed to the trainer
kfold = 5  # number of stratified CV folds

# # rf
# rf = RandomForestClassifier(max_features = 0.6, 
#                             min_samples_split = 325, 
#                             n_estimators = 500, 
#                             max_depth = 14, 
#                             min_samples_leaf = 333,
#                             n_jobs=4,verbose=5)

# skf = StratifiedKFold(n_splits=kfold, random_state=2017)
# for i, (train_index, test_index) in enumerate(skf.split(X, y)):
#     print('rf kfold: {}  of  {} : '.format(i + 1, kfold))
#     X_train, X_valid = X[train_index], X[test_index]
#     y_train, y_valid = y[train_index], y[test_index]
#     rf_model = rf.fit(X_train, y_train)
#     y_valid_pred = rf_model.predict_proba(X_valid)[:,1]
#     print 'Fold {}: {}'.format(i+1 ,2*roc_auc_score(y_valid, y_valid_pred)-1)
#     sub['rf'] += rf_model.predict_proba(test[features].values)[:,1] / (kfold)

# xgb
print('xgb start...')
# params = {'eta': 0.025, 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.4,
#           'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True, 'max_delta_step':1.8,
#           'min_child_weight':8, 'gamma':0.65}

# Booster parameters only.  The number of boosting rounds is NOT a booster
# parameter: it is the positional `nrounds` argument of xgb.train below, so a
# 'num_boost_round' key here would be ignored and is deliberately absent.
params = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    }

# The test matrix does not depend on the fold, so build it once.
d_test = xgb.DMatrix(test[features].values)

# No shuffling is requested, so no random_state is passed: it has no effect
# without shuffle=True and recent scikit-learn rejects the combination.
skf = StratifiedKFold(n_splits=kfold)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('xgb kfold: {}  of  {} : '.format(i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Early stopping monitors the custom gini metric on the validation fold.
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=70,
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    # Average the fold models: each contributes 1/kfold of its test prediction.
    sub['xgb'] += xgb_model.predict(d_test,
                                    ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
del d_test
gc.collect()

# Write a one-column DataFrame (not a Series) so the 'xgb' header row is
# always present; the file is read back later by column name.
sub[['xgb']].to_csv('no_ohe_xgb.csv',index=False)
# # lgb
# print 'lgb start...'
# params = {'metric': 'auc', 'learning_rate': 0.01, 'max_depth': 10, 'max_bin': 10, 'objective': 'binary',
#           'feature_fraction': 0.8, 'bagging_fraction': 0.9, 'bagging_freq': 10, 'min_data': 500}

# skf = StratifiedKFold(n_splits=kfold, random_state=2017)
# for i, (train_index, test_index) in enumerate(skf.split(X, y)):
#     print('lgb kfold: {}  of  {} : '.format(i + 1, kfold))
#     X_train, X_eval = X[train_index], X[test_index]
#     y_train, y_eval = y[train_index], y[test_index]
#     lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds,
#                           lgb.Dataset(X_eval, label=y_eval), verbose_eval=100,
#                           feval=gini_lgb, early_stopping_rounds=100)
#     sub['lgb'] += lgb_model.predict(test[features].values,
#                                     num_iteration=lgb_model.best_iteration+50) / (2 * kfold)

# gc.collect()
# Function-call form of print works on both Python 2 and 3 for a single
# argument and matches the other print(...) calls in this cell.
print(sub.head(2))
print('Total time: {} mins'.format((time()-start) / 60))

loading files...
((595212, 59), (892816, 58))
xgb start...
xgb kfold: 1  of  5 : 
[0]	train-error:0.036447	valid-error:0.036449	train-gini:0.039317	valid-gini:0.047052
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-error:0.036447	valid-error:0.036449	train-gini:0.319812	valid-gini:0.266746
[200]	train-error:0.036447	valid-error:0.036449	train-gini:0.358216	valid-gini:0.275074
[300]	train-error:0.036437	valid-error:0.036449	train-gini:0.395347	valid-gini:0.279543


In [None]:
# Load the running average file and the two extra model outputs, then attach
# the CatBoost and plain-xgb predictions as new columns of `res`.
res = pd.read_csv('avg.csv')
cat = pd.read_csv('cat_submit.csv')
ori = pd.read_csv('no_ohe_xgb.csv')
res['cat'], res['ori_xgb'] = cat['target'], ori['xgb']
res.head()

In [None]:
# Write `res` back to avg.csv without the index column.
# NOTE(review): `res` is also rebound by the export loops further down, so
# confirm it still holds the merged frame when this cell is run.
res.to_csv('avg.csv',index=False)

In [3]:
# Export the id and rf columns of `sub` to rf.csv.
# NOTE(review): no live code visible in this notebook assigns sub['rf'] (the
# rf section of the first cell is commented out), so on a fresh kernel this
# presumably raises KeyError -- confirm before relying on this cell.
sub[['id', 'rf']].to_csv('rf.csv', index=False)

In [10]:
# Dump every column of `sub` to avg.csv, without the index.
# NOTE(review): absolute, user-specific Windows path; it differs from the
# 'D:/Driver/' location the input data is read from -- confirm it is intended.
sub.to_csv('C:/Users/KrystalU/Documents/Data/Driver/avg.csv', index=False)

In [4]:
# Double both model columns in place.
# NOTE(review): this cell is not idempotent (every re-run doubles again), and
# the live xgb loop divides by kfold rather than 2 * kfold -- confirm the
# factor of 2 still applies to sub['xgb'].
for model_col in ('xgb', 'lgb'):
    sub[model_col] = 2*sub[model_col]
sub.head()

Unnamed: 0,id,target,xgb,lgb
0,0,0,0.027262,0.027969
1,1,0,0.023622,0.025589
2,2,0,0.0236,0.025659
3,3,0,0.015024,0.015382
4,4,0,0.037317,0.036267


In [5]:
# (column name, xgb weight, lgb weight) for each linear blend to add to `sub`.
blend_weights = [
    ('0.75xgb+0.25lgb', 0.75, 0.25),
    ('0.25xgb+0.75lgb', 0.25, 0.75),
    ('0.5xgb+0.5lgb', 0.5, 0.5),
]
for blend_name, w_xgb, w_lgb in blend_weights:
    sub[blend_name] = w_xgb*sub['xgb'] + w_lgb*sub['lgb']
sub.head()

Unnamed: 0,id,target,xgb,lgb,0.75xgb+0.25lgb,0.25xgb+0.75lgb,0.5xgb+0.5lgb
0,0,0,0.027262,0.027969,0.027439,0.027792,0.027615
1,1,0,0.023622,0.025589,0.024114,0.025097,0.024605
2,2,0,0.0236,0.025659,0.024115,0.025145,0.02463
3,3,0,0.015024,0.015382,0.015114,0.015292,0.015203
4,4,0,0.037317,0.036267,0.037055,0.036529,0.036792


In [6]:
# Export the 0.75/0.25 blend as a two-column (id, target) submission file.
res1 = sub[['id', '0.75xgb+0.25lgb']].rename(columns={'0.75xgb+0.25lgb': 'target'})
res1.to_csv('sub_0.75xgb+0.25lgb.csv', index=False)

In [8]:
# Write one (id, target) submission file per remaining blend column.
for item in ['0.25xgb+0.75lgb', '0.5xgb+0.5lgb']:
    res = sub[['id', item]].rename(columns={item: 'target'})
    res.to_csv('sub_{}.csv'.format(item), index=False)

In [9]:
# Show the first rows of `sub` to check the model and blend columns.
sub.head()

Unnamed: 0,id,target,xgb,lgb,0.75xgb+0.25lgb,0.25xgb+0.75lgb,0.5xgb+0.5lgb
0,0,0,0.027262,0.027969,0.027439,0.027792,0.027615
1,1,0,0.023622,0.025589,0.024114,0.025097,0.024605
2,2,0,0.0236,0.025659,0.024115,0.025145,0.02463
3,3,0,0.015024,0.015382,0.015114,0.015292,0.015203
4,4,0,0.037317,0.036267,0.037055,0.036529,0.036792


In [11]:
# Write the xgb-only predictions as an (id, target) submission file.
for item in ['xgb']:
    res = sub[['id', item]].rename(columns={item: 'target'})
    res.to_csv('sub_{}.csv'.format(item), index=False)

In [None]:
for i in xrnage