In [1]:
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata

In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

features = [x for x in train_df.columns if x.startswith("var")]


In [3]:
#Reverse features
for var in features:
    if np.corrcoef( train_df['target'], train_df[var] )[1][0] < 0:
        train_df[var] = train_df[var] * -1
        test_df[var]  = test_df[var]  * -1

In [4]:
#count all values
var_stats = {}
hist_df = pd.DataFrame()
for var in features:
    var_stats = train_df[var].append(test_df[var]).value_counts()
    hist_df[var] = pd.Series(test_df[var]).map(var_stats)
    hist_df[var] = hist_df[var] > 1
#remove fake test rows
ind = hist_df.sum(axis=1) != 200

In [5]:
#recount values without fake rows
var_stats = {}
for var in features:
    var_stats[var] = train_df[var].append(test_df[ind][var]).value_counts()

In [6]:
def logit(p):
    return np.log(p) - np.log(1 - p)

def var_to_feat(vr, var_stats, feat_id ):
    new_df = pd.DataFrame()
    new_df["var"] = vr.values
    new_df["hist"] = pd.Series(vr).map(var_stats)
    new_df["feature_id"] = feat_id
    new_df["var_rank"] = new_df["var"].rank()/200000.
    return new_df.values

In [7]:
TARGET = np.array( list(train_df['target'].values) * 200 )

TRAIN = []
var_mean = {}
var_var  = {}
for var in features:
    tmp = var_to_feat(train_df[var], var_stats[var], int(var[4:]) )
    var_mean[var] = np.mean(tmp[:,0]) 
    var_var[var]  = np.var(tmp[:,0])
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    TRAIN.append( tmp )
TRAIN = np.vstack( TRAIN )

del train_df
_=gc.collect()

print( TRAIN.shape, len( TARGET ) )

(40000000, 4) 40000000


In [8]:
model = lgb.LGBMClassifier(**{
     'learning_rate': 0.04,
     'num_leaves': 31,
     'max_bin': 1023,
     'min_child_samples': 1000,
     'reg_alpha': 0.1,
     'reg_lambda': 0.2,
     'feature_fraction': 1.0,
     'bagging_freq': 1,
     'bagging_fraction': 0.85,
     'objective': 'binary',
     'n_jobs': -1,
     'n_estimators':200,})

MODELS = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=11111)
for fold_, (train_indexes, valid_indexes) in enumerate(skf.split(TRAIN, TARGET)):
    print('Fold:', fold_ )
    model = model.fit( TRAIN[train_indexes], TARGET[train_indexes],
                      eval_set = (TRAIN[valid_indexes], TARGET[valid_indexes]),
                      verbose = 10,
                      eval_metric='auc',
                      early_stopping_rounds=25,
                      categorical_feature = [2] )
    MODELS.append( model )

del TRAIN, TARGET
_=gc.collect()

Fold: 0


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325586	valid_0's auc: 0.528007
[20]	valid_0's binary_logloss: 0.325356	valid_0's auc: 0.528201
[30]	valid_0's binary_logloss: 0.325253	valid_0's auc: 0.528269
[40]	valid_0's binary_logloss: 0.325203	valid_0's auc: 0.528356
[50]	valid_0's binary_logloss: 0.325177	valid_0's auc: 0.528396
[60]	valid_0's binary_logloss: 0.325163	valid_0's auc: 0.528405
[70]	valid_0's binary_logloss: 0.325156	valid_0's auc: 0.528382
[80]	valid_0's binary_logloss: 0.325151	valid_0's auc: 0.528378
Early stopping, best iteration is:
[55]	valid_0's binary_logloss: 0.325169	valid_0's auc: 0.528421
Fold: 1


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325591	valid_0's auc: 0.526874
[20]	valid_0's binary_logloss: 0.325363	valid_0's auc: 0.527129
[30]	valid_0's binary_logloss: 0.325261	valid_0's auc: 0.527225
[40]	valid_0's binary_logloss: 0.325213	valid_0's auc: 0.527317
[50]	valid_0's binary_logloss: 0.325188	valid_0's auc: 0.527387
[60]	valid_0's binary_logloss: 0.325174	valid_0's auc: 0.527431
[70]	valid_0's binary_logloss: 0.325167	valid_0's auc: 0.527434
[80]	valid_0's binary_logloss: 0.325162	valid_0's auc: 0.527446
[90]	valid_0's binary_logloss: 0.325159	valid_0's auc: 0.527446
[100]	valid_0's binary_logloss: 0.325158	valid_0's auc: 0.527424
Early stopping, best iteration is:
[84]	valid_0's binary_logloss: 0.32516	valid_0's auc: 0.527457
Fold: 2


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325583	valid_0's auc: 0.528154
[20]	valid_0's binary_logloss: 0.325349	valid_0's auc: 0.528495
[30]	valid_0's binary_logloss: 0.325244	valid_0's auc: 0.52855
[40]	valid_0's binary_logloss: 0.325194	valid_0's auc: 0.528604
[50]	valid_0's binary_logloss: 0.325168	valid_0's auc: 0.528586
[60]	valid_0's binary_logloss: 0.325155	valid_0's auc: 0.528563
Early stopping, best iteration is:
[41]	valid_0's binary_logloss: 0.32519	valid_0's auc: 0.528611
Fold: 3


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325586	valid_0's auc: 0.527289
[20]	valid_0's binary_logloss: 0.325357	valid_0's auc: 0.52764
[30]	valid_0's binary_logloss: 0.325255	valid_0's auc: 0.527809
[40]	valid_0's binary_logloss: 0.325207	valid_0's auc: 0.527853
[50]	valid_0's binary_logloss: 0.325183	valid_0's auc: 0.527873
[60]	valid_0's binary_logloss: 0.32517	valid_0's auc: 0.527876
[70]	valid_0's binary_logloss: 0.325162	valid_0's auc: 0.527907
[80]	valid_0's binary_logloss: 0.325158	valid_0's auc: 0.527897
[90]	valid_0's binary_logloss: 0.325156	valid_0's auc: 0.527918
[100]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.527923
[110]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.527921
[120]	valid_0's binary_logloss: 0.325153	valid_0's auc: 0.52791
[130]	valid_0's binary_logloss: 0.325153	valid_0's auc: 0.527893
Early stopping, best iteration is:
[106]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.527927
Fold:

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325579	valid_0's auc: 0.528635
[20]	valid_0's binary_logloss: 0.325341	valid_0's auc: 0.528838
[30]	valid_0's binary_logloss: 0.325233	valid_0's auc: 0.52902
[40]	valid_0's binary_logloss: 0.325181	valid_0's auc: 0.529106
[50]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.529139
[60]	valid_0's binary_logloss: 0.325139	valid_0's auc: 0.529153
[70]	valid_0's binary_logloss: 0.32513	valid_0's auc: 0.52915
[80]	valid_0's binary_logloss: 0.325125	valid_0's auc: 0.529171
[90]	valid_0's binary_logloss: 0.325122	valid_0's auc: 0.529169
[100]	valid_0's binary_logloss: 0.32512	valid_0's auc: 0.529176
[110]	valid_0's binary_logloss: 0.325119	valid_0's auc: 0.529166
[120]	valid_0's binary_logloss: 0.325118	valid_0's auc: 0.52918
[130]	valid_0's binary_logloss: 0.325118	valid_0's auc: 0.529186
[140]	valid_0's binary_logloss: 0.325117	valid_0's auc: 0.529202
[150]	valid_0's binary_logloss: 0.325117	v

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325597	valid_0's auc: 0.528034
[20]	valid_0's binary_logloss: 0.325369	valid_0's auc: 0.528367
[30]	valid_0's binary_logloss: 0.325268	valid_0's auc: 0.528518
[40]	valid_0's binary_logloss: 0.32522	valid_0's auc: 0.528574
[50]	valid_0's binary_logloss: 0.325195	valid_0's auc: 0.528628
[60]	valid_0's binary_logloss: 0.325181	valid_0's auc: 0.528692
[70]	valid_0's binary_logloss: 0.325174	valid_0's auc: 0.528702
[80]	valid_0's binary_logloss: 0.32517	valid_0's auc: 0.528713
[90]	valid_0's binary_logloss: 0.325167	valid_0's auc: 0.52872
[100]	valid_0's binary_logloss: 0.325165	valid_0's auc: 0.528755
[110]	valid_0's binary_logloss: 0.325165	valid_0's auc: 0.528742
[120]	valid_0's binary_logloss: 0.325164	valid_0's auc: 0.528752
Early stopping, best iteration is:
[100]	valid_0's binary_logloss: 0.325165	valid_0's auc: 0.528755
Fold: 6


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325576	valid_0's auc: 0.528573
[20]	valid_0's binary_logloss: 0.325339	valid_0's auc: 0.528827
[30]	valid_0's binary_logloss: 0.325232	valid_0's auc: 0.528901
[40]	valid_0's binary_logloss: 0.32518	valid_0's auc: 0.528978
[50]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.52902
[60]	valid_0's binary_logloss: 0.325139	valid_0's auc: 0.529047
[70]	valid_0's binary_logloss: 0.32513	valid_0's auc: 0.529033
[80]	valid_0's binary_logloss: 0.325125	valid_0's auc: 0.529023
Early stopping, best iteration is:
[59]	valid_0's binary_logloss: 0.32514	valid_0's auc: 0.529053
Fold: 7


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325589	valid_0's auc: 0.528384
[20]	valid_0's binary_logloss: 0.325359	valid_0's auc: 0.528671
[30]	valid_0's binary_logloss: 0.325257	valid_0's auc: 0.528747
[40]	valid_0's binary_logloss: 0.325207	valid_0's auc: 0.528807
[50]	valid_0's binary_logloss: 0.325183	valid_0's auc: 0.528839
[60]	valid_0's binary_logloss: 0.325169	valid_0's auc: 0.528865
[70]	valid_0's binary_logloss: 0.325162	valid_0's auc: 0.528869
[80]	valid_0's binary_logloss: 0.325157	valid_0's auc: 0.528885
[90]	valid_0's binary_logloss: 0.325155	valid_0's auc: 0.528879
[100]	valid_0's binary_logloss: 0.325152	valid_0's auc: 0.52889
[110]	valid_0's binary_logloss: 0.325152	valid_0's auc: 0.528882
[120]	valid_0's binary_logloss: 0.32515	valid_0's auc: 0.528886
Early stopping, best iteration is:
[103]	valid_0's binary_logloss: 0.325152	valid_0's auc: 0.528893
Fold: 8


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325588	valid_0's auc: 0.527704
[20]	valid_0's binary_logloss: 0.325357	valid_0's auc: 0.528011
[30]	valid_0's binary_logloss: 0.325253	valid_0's auc: 0.528141
[40]	valid_0's binary_logloss: 0.325204	valid_0's auc: 0.528205
[50]	valid_0's binary_logloss: 0.325179	valid_0's auc: 0.528267
[60]	valid_0's binary_logloss: 0.325165	valid_0's auc: 0.528317
[70]	valid_0's binary_logloss: 0.325158	valid_0's auc: 0.528325
[80]	valid_0's binary_logloss: 0.325153	valid_0's auc: 0.528322
[90]	valid_0's binary_logloss: 0.325151	valid_0's auc: 0.528323
Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.325159	valid_0's auc: 0.528335
Fold: 9


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325586	valid_0's auc: 0.528262
[20]	valid_0's binary_logloss: 0.325356	valid_0's auc: 0.528341
[30]	valid_0's binary_logloss: 0.325253	valid_0's auc: 0.528393
[40]	valid_0's binary_logloss: 0.325204	valid_0's auc: 0.528546
[50]	valid_0's binary_logloss: 0.325177	valid_0's auc: 0.528629
[60]	valid_0's binary_logloss: 0.325164	valid_0's auc: 0.528676
[70]	valid_0's binary_logloss: 0.325157	valid_0's auc: 0.528662
[80]	valid_0's binary_logloss: 0.325151	valid_0's auc: 0.528719
[90]	valid_0's binary_logloss: 0.325149	valid_0's auc: 0.528723
[100]	valid_0's binary_logloss: 0.325147	valid_0's auc: 0.52873
[110]	valid_0's binary_logloss: 0.325145	valid_0's auc: 0.528738
[120]	valid_0's binary_logloss: 0.325145	valid_0's auc: 0.52873
[130]	valid_0's binary_logloss: 0.325144	valid_0's auc: 0.528735
Early stopping, best iteration is:
[107]	valid_0's binary_logloss: 0.325146	valid_0's auc: 0.528746


In [9]:
ypred = np.zeros( (200000,200) )
for feat,var in enumerate(features):
    tmp = var_to_feat(test_df[var], var_stats[var], int(var[4:]) )
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    for model_id in range(10):
        model = MODELS[model_id]
        ypred[:,feat] += model.predict_proba( tmp )[:,1] / 10.
ypred = np.mean( logit(ypred), axis=1 )

sub = test_df[['ID_code']]
sub['target'] = ypred
sub['target'] = sub['target'].rank() / 200000.
sub.to_csv('golden_sub.csv', index=False)
print( sub.head(10) )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


  ID_code    target
0  test_0  0.809585
1  test_1  0.867265
2  test_2  0.873530
3  test_3  0.861945
4  test_4  0.753875
5  test_5  0.062715
6  test_6  0.103695
7  test_7  0.639420
8  test_8  0.048835
9  test_9  0.169635
