In [None]:
from __future__ import division
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from time import time
from catboost import CatBoostClassifier
import gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin
from sklearn.ensemble import RandomForestClassifier

# custom evaluation metric: Gini coefficient (for binary labels, normalised Gini = 2 * AUC - 1)
def gini(y, pred):
    """Return the un-normalised Gini coefficient of ``pred`` against ``y``.

    Rows are ranked by descending prediction, with ties broken by original
    position so the result is deterministic. The cumulative share of ``y``
    captured along that ranking is then compared with the share expected
    from a random ordering.

    Divide by ``gini(y, y)`` to obtain the normalised Gini.

    Parameters
    ----------
    y : sequence of numbers
        Ground-truth labels.
    pred : sequence of numbers
        Predicted scores, same length as ``y``.

    Returns
    -------
    float
        The Gini coefficient. If ``y`` sums to zero the division below
        yields ``nan``.
    """
    n = len(y)
    # Builtin ``float`` replaces ``np.float``: the alias was removed in
    # NumPy 1.24, and it always meant plain ``float``.
    g = np.asarray(np.c_[y, pred, np.arange(n)], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction first,
    # original row index as the tie-breaker.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (n + 1) / 2.
    return gs / n

def gini_xgb(pred, y):
    """Evaluation callback returning the normalised Gini as ``('gini', value)``.

    ``y`` must expose ``get_label()``; the labels it returns are scored
    against ``pred`` and normalised by the Gini of a perfect ranking.
    """
    labels = y.get_label()
    achieved = gini(labels, pred)
    perfect = gini(labels, labels)
    return 'gini', achieved / perfect

def gini_lgb(preds, dtrain):
    """Evaluation callback returning ``('gini', normalised_gini, True)``.

    ``dtrain`` must expose ``get_label()``. The trailing ``True`` follows
    LightGBM's ``feval`` convention for "higher is better".
    """
    labels = list(dtrain.get_label())
    normalised = gini(labels, preds) / gini(labels, labels)
    return 'gini', normalised, True

def load_data(train_path='D:/Driver/ohe_train_v2.csv',
              test_path='D:/Driver/ohe_test_v2.csv'):
    """Read the train and test CSV files and drop the ``ps_calc_*`` columns.

    Parameters
    ----------
    train_path : str
        Location of the training CSV. Defaults to the original hard-coded
        path so existing ``load_data()`` calls keep working.
    test_path : str
        Location of the test CSV; same default behaviour as ``train_path``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train and test frames without any column whose name starts
        with ``ps_calc_``.

    Raises
    ------
    KeyError
        If a ``ps_calc_*`` column found in the train file is missing from
        the test file (raised by ``DataFrame.drop``).
    """
    print('loading files...')
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    print(train.shape, test.shape)

    # The list of columns to remove is derived from the train file and
    # applied to both frames.
    unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
    train = train.drop(unwanted, axis=1)
    test = test.drop(unwanted, axis=1)
    return train, test

def transform(df, d_median, d_mean):
    """Return a copy of ``df`` with an interaction and above-median/mean flags added.

    Added columns:

    * ``ps_car_13_x_ps_reg_03`` - product of ``ps_car_13`` and ``ps_reg_03``.
    * ``<col>_median_range`` / ``<col>_mean_range`` - 0/1 flags marking rows
      whose value is strictly greater than ``d_median[col]`` /
      ``d_mean[col]``. These are added for every column that is not
      ``id``/``target``, has no ``ohe`` in its name and no ``_bin`` in its
      name.

    The input frame is left untouched. Assigning into the caller's frame
    triggered ``SettingWithCopyWarning`` whenever a slice was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ps_car_13`` and ``ps_reg_03``.
    d_median, d_mean : mapping (e.g. pandas.Series or dict)
        Per-column thresholds, indexable by column name.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the new ones.
    """
    # The candidate list is fixed BEFORE the interaction column is added,
    # so the interaction itself gets no range flags.
    dcol = [c for c in df.columns if c not in ['id', 'target'] and 'ohe' not in c]
    print('Length of dcol: {}'.format(len(dcol)))

    df = df.copy()
    df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    for c in dcol:
        if '_bin' in c:
            # Binary indicators carry no information beyond their own value.
            continue
        values = df[c].values
        # Builtin ``int`` replaces ``np.int``, which was removed in NumPy 1.24.
        df[c + '_median_range'] = (values > d_median[c]).astype(int)
        df[c + '_mean_range'] = (values > d_mean[c]).astype(int)
    return df

def _add_range_features(frame):
    """Run ``transform`` on ``frame`` with the frame's own column medians and means."""
    return transform(frame, frame.median(axis=0), frame.mean(axis=0))


def _align_columns(frame, columns):
    """Return ``frame`` restricted to ``columns`` in that order (unchanged if already aligned)."""
    if frame.columns.equals(columns):
        return frame
    # Raises KeyError if ``frame`` lacks one of the training features.
    return frame[columns]


if __name__ == '__main__':
    # Script: train xgboost and LightGBM with stratified K-fold CV, average
    # each model's per-fold test predictions, and write them to a CSV.
    train, test = load_data()

    start = time()

    X = train.drop(['id', 'target'], axis=1)
    features = X.columns
    y = train['target']
    sub = test['id'].to_frame()
    # NOTE(review): 'target' is initialised here but never filled in below;
    # the output file only carries the separate 'xgb' and 'lgb' averages.
    # Confirm whether a blended 'target' was intended.
    sub['target'] = 0
    sub['xgb'] = 0
    sub['lgb'] = 0
    # sub['rf'] = 0
    # The test file may carry a 'target' column of its own. Dropping it here
    # keeps the test matrix at the same width as the training matrix;
    # previously the extra column was fed to predict().
    test = test.drop(['id', 'target'], axis=1, errors='ignore')

    nrounds = 2000  # maximum boosting rounds (early stopping usually ends sooner)
    kfold = 5  # number of CV folds

    # # rf
    # rf = RandomForestClassifier(max_features = 0.6,
    #                             min_samples_split = 325,
    #                             n_estimators = 500,
    #                             max_depth = 14,
    #                             min_samples_leaf = 333,
    #                             n_jobs=4,verbose=5)
    #
    # skf = StratifiedKFold(n_splits=kfold, random_state=2017)
    # for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    #     print('rf kfold: {}  of  {} : '.format(i + 1, kfold))
    #     X_train, X_valid = X[train_index], X[test_index]
    #     y_train, y_valid = y[train_index], y[test_index]
    #     rf_model = rf.fit(X_train, y_train)
    #     y_valid_pred = rf_model.predict_proba(X_valid)[:,1]
    #     print 'Fold {}: {}'.format(i+1 ,2*roc_auc_score(y_valid, y_valid_pred)-1)
    #     sub['sub'] += rf_model.predict_proba(test[features].values)[:,1] / (kfold)

    # Range features for the test set use the test set's own medians/means.
    test = _add_range_features(test)

    print(X.shape, test.shape)
    # xgb
    print('xgb start...')
    params = {'eta': 0.025, 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.4,
              'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True, 'max_delta_step':1.8,
              'min_child_weight':8, 'gamma':0.65}

    # random_state only takes effect with shuffle=True; without it the seed
    # is ignored by old scikit-learn and rejected (ValueError) by newer ones.
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=2016)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        print('xgb kfold: {}  of  {} : '.format(i + 1, kfold))
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        # Each split derives its range features from its own statistics.
        X_train = _add_range_features(X.iloc[train_index])
        X_valid = _add_range_features(X.iloc[test_index])

        # Models are fed bare .values arrays, so column ORDER must match the
        # training matrix exactly. This is a no-op once aligned.
        test = _align_columns(test, X_train.columns)

        d_train = xgb.DMatrix(X_train.values, y_train.values)
        d_valid = xgb.DMatrix(X_valid.values, y_valid.values)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100,
                              feval=gini_xgb, maximize=True, verbose_eval=100)
        sub['xgb'] += xgb_model.predict(xgb.DMatrix(test.values),
                                        ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
        print(sub.head())
    gc.collect()


    # lgb
    print('lgb start...')
    params = {'metric': 'auc', 'learning_rate': 0.01, 'max_depth': 10, 'max_bin': 10, 'objective': 'binary',
              'feature_fraction': 0.8, 'bagging_fraction': 0.9, 'bagging_freq': 10, 'min_data': 500}

    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=2017)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        print('lgb kfold: {}  of  {} : '.format(i + 1, kfold))
        # Previously misspelt 'y_vaid', which left y_valid holding the labels
        # of the last xgb fold.
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        X_train = _add_range_features(X.iloc[train_index])
        X_valid = _add_range_features(X.iloc[test_index])

        test = _align_columns(test, X_train.columns)

        lgb_model = lgb.train(params, lgb.Dataset(X_train.values, label=y_train.values), nrounds,
                              lgb.Dataset(X_valid.values, label=y_valid.values), verbose_eval=100,
                              feval=gini_lgb, early_stopping_rounds=100)
        sub['lgb'] += lgb_model.predict(test.values,
                                        num_iteration=lgb_model.best_iteration+50) / (kfold)
        print(sub.head())

    gc.collect()
    print(sub.head(2))
    sub.to_csv('D:/Driver/sub25.csv',index=False)
    print('Total time: {} mins'.format((time()-start) / 60))

loading files...
((595212, 229), (892816, 229))
Length of dcol: 25
((595212, 207), (892816, 237))
xgb start...




xgb kfold: 1  of  5 : 
Length of dcol: 25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Length of dcol: 25
set(['target'])
[0]	train-auc:0.514444	valid-auc:0.512605	train-gini:0.029039	valid-gini:0.035007
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.


KeyboardInterrupt: 