In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier

from rgf import *     # https://github.com/fukatani/rgf_python

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [7]:
# The lgbm part of the code is infuenced by https://www.kaggle.com/yekenot/simple-stacker-lb-0-284
# with a little modification
# The output of this model is available under Input datasets section
# Preprocessing 
id_test = test['id'].values
target_train = train['target'].values

train = train.drop(['target','id'], axis = 1)
test = test.drop(['id'], axis = 1)

col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1)  

train = train.replace(-1, np.nan)
test = test.replace(-1, np.nan)

cat_features = [a for a in train.columns if a.endswith('cat')]

for column in cat_features:
	temp = pd.get_dummies(pd.Series(train[column]))
	train = pd.concat([train,temp],axis=1)
	train = train.drop([column],axis=1)
    
for column in cat_features:
	temp = pd.get_dummies(pd.Series(test[column]))
	test = pd.concat([test,temp],axis=1)
	test = test.drop([column],axis=1)

print(train.values.shape, test.values.shape)


(595212, 198) (892816, 198)


In [8]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
#                y_holdout = y[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
#                cross_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
#                print("    cross_score: %.5f" % (cross_score.mean()))
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return res

In [11]:
# LightGBM params
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['feature_fraction'] = 0.9
lgb_params['num_iterations']=900
lgb_params['bagging_freq'] = 1
lgb_params['seed'] = 200

lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['num_iterations']=900
lgb_params2['feature_fraction'] = 0.9
lgb_params2['bagging_freq'] = 1
lgb_params2['seed'] = 200


lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['num_iterations']=900
lgb_params3['learning_rate'] = 0.02
lgb_params3['feature_fraction'] = 0.9
lgb_params3['bagging_freq'] = 1
lgb_params3['seed'] = 200

from xgboost import XGBClassifier

xgb_params = {}
xgb_params['objective'] = 'binary:logistic'
xgb_params['learning_rate'] = 0.04
xgb_params['n_estimators'] =490
xgb_params['max_depth'] = 4
xgb_params['subsample'] = 0.9
xgb_params['colsample_bytree'] = 0.9  
xgb_params['min_child_weight'] = 10



In [12]:
lgb_model = LGBMClassifier(**lgb_params)

lgb_model2 = LGBMClassifier(**lgb_params2)

lgb_model3 = LGBMClassifier(**lgb_params3)

xgb_model = XGBClassifier(**xgb_params)

In [21]:
test

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,...,95,96,97,98,99,100,101,102,103,104
0,0,8,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,5,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,5,3,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,6,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,7,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,6,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,1,6,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
log_model = LogisticRegression()
       
stack = Ensemble(n_splits=6,
        stacker =xgb_model,
        base_models = (lgb_model, lgb_model2, lgb_model3))        
        
y_pred = stack.fit_predict(train, target_train, test)        

sub_1 = pd.DataFrame()
sub_1['id'] = id_test
sub_1['target'] = y_pred

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 1


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 2


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 3


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 4


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 5


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 6


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 1


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 2


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 3


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 4


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 5


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 6


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 1


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 2


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 3


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 4


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 5


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 6
Stacker score: 0.64300


In [23]:
def eval_gini(y_true, y_prob):
   y_true = np.asarray(y_true)
   y_true = y_true[np.argsort(y_prob)]
   ntrue = 0
   gini = 0
   delta = 0
   n = len(y_true)
   for i in range(n-1, -1, -1):
       y_i = y_true[i]
       ntrue += y_i
       gini += y_i * delta
       delta += 1 - y_i
   gini = 1 - 2 * gini / (ntrue * (n - ntrue))
   return gini

print('gini係数 : {}'.format(eval_gini(S_test[test_idx],y_pred)))

NameError: name 'S_test' is not defined

In [16]:
# All these datasets are from different kaggle kernels
stacked_２ = pd.read_csv('stacked_2.csv')
mixed = pd.read_csv('20171125_kernelfromkaggle.csv')


# Ensemble and create submission 

sub = pd.DataFrame()
sub['id'] = stacked_2['id']
sub['target'] = np.exp(np.mean(
	[	
    sub_1['target'].apply(lambda x: np.log(x)),\
    stacked_２['target'].apply(lambda x: np.log(x)),\
    mixed['target'].apply(lambda x: np.log(x))
	], axis =0))

sub.to_csv('sub.csv', index = False)

In [18]:
pd.read_csv('sub.csv')

Unnamed: 0,id,target
0,0,0.032118
1,1,0.029084
2,2,0.026393
3,3,0.016911
4,4,0.042115
5,5,0.054805
6,6,0.022534
7,8,0.039425
8,10,0.066128
9,11,0.065422
