In [4]:
import pandas as pd
import numpy as np
import gc
import random
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier


In [6]:

def binarize(columnName, df, features=None):
    df[columnName] = df[columnName].astype(str)
    if features is None:
        features = np.unique(df[columnName].values)
    for x in features:
        df[columnName+'_' + x] = df[columnName].map(lambda y: 1 if y == x else 0)
    return df, features


def add_bernulli(train, test, features):
    for col in features:
        train, binfeatures = binarize(col, train)
        test, _ = binarize(col, test, binfeatures)
        nb = BernoulliNB()
        nb.fit(train[col + '_' + binfeatures].values, y_train)
        train['naive_' + col] = nb.predict_proba(train[col + '_' + binfeatures].values)[:, 1]
        test['naive_' + col] = nb.predict_proba(test[col + '_' + binfeatures].values)[:, 1]
        train.drop(col + '_' + binfeatures, inplace=True, axis=1)
        test.drop(col + '_' + binfeatures, inplace=True, axis=1)
    return train, test


def factorize(train, test):
    for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(), test.iteritems()):
        if train_series.dtype == 'O':
            train[train_name], tmp_indexer = pd.factorize(train[train_name])
            test[test_name] = tmp_indexer.get_indexer(test[test_name])
        else:
            tmp_len = len(train[train_series.isnull()])
            if tmp_len > 0:
                train.loc[train_series.isnull(), train_name] = -2
            tmp_len = len(test[test_series.isnull()])
            if tmp_len > 0:
                test.loc[test_series.isnull(), test_name] = -2
    return train, test


def stacking(X_train, X_test, y_train, skf, clfs):
    meta_train = np.zeros((X_train.shape[0], len(clfs)))
    meta_test  = np.zeros((X_test.shape[0],  len(clfs)))
    
    for j, clf in enumerate(clfs):
        print('Clf', j+1)
        meta_test_j = np.zeros((X_test.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print('Fold', i+1)
            X_tr = X_train[train]
            y_tr = y_train[train]
            X_ts = X_train[test]
            y_ts = y_train[test]
            clf.fit(X_tr, y_tr)
            y_submission = clf.predict_proba(X_ts)[:, 1]
            meta_train[test, j] = y_submission
            meta_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
        meta_test[:, j] = meta_test_j.mean(1)
        gc.collect()
        
    return meta_train, meta_test

In [7]:
class addNearestNeighbourLinearFeatures:
    
    def __init__(self, n_neighbours=1, max_elts=None, verbose=True, random_state=None):
        self.rnd = random_state
        self.n = n_neighbours
        self.max_elts = max_elts
        self.verbose = verbose
        self.neighbours = []
        self.clfs = []
        
    def fit(self,train, y):
        if self.rnd != None:
            random.seed(rnd)
        if self.max_elts == None:
            self.max_elts = len(train.columns)
        list_vars = list(train.columns)
        random.shuffle(list_vars)
        
        lastscores = np.zeros(self.n) + 1e15

        for elt in list_vars[:self.n]:
            self.neighbours.append([elt])
        list_vars = list_vars[self.n:]
        
        for elt in list_vars:
            indice = 0
            scores = []
            for elt2 in self.neighbours:
                if len(elt2) < self.max_elts:
                    clf = LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
                    clf.fit(train[elt2 + [elt]], y)
                    scores.append(log_loss(y,clf.predict(train[elt2 + [elt]])))
                    indice = indice + 1
                else:
                    scores.append(lastscores[indice])
                    indice = indice + 1
            gains = lastscores - scores
            if gains.max() > 0:
                temp = gains.argmax()
                lastscores[temp] = scores[temp]
                self.neighbours[temp].append(elt)

        indice = 0
        for elt in self.neighbours:
            clf = LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
            clf.fit(train[elt], y)
            self.clfs.append(clf)
            if self.verbose:
                print(indice, lastscores[indice], elt)
            indice = indice + 1
                    
    def transform(self, train):
        indice = 0
        for elt in self.neighbours:
            train['_'.join(pd.Series(elt).sort_values().values)] = self.clfs[indice].predict(train[elt])
            indice = indice + 1
        return train
    
    def fit_transform(self, train, y):
        self.fit(train, y)
        return self.transform(train)

In [8]:
class Addfeatures(TransformerMixin, BaseEstimator):
    
    def __init__(self, n_neighbours=1, max_elts=None):
        self.n = n_neighbours
        self.max_elts = max_elts
        self.neighbours = []
        self.clfs = []
        
    def fit(self,X, y):
        if self.max_elts == None:
            self.max_elts = len(X.columns)
        list_vars = list(X.columns)
        random.shuffle(list_vars)
        
        lastscores = np.zeros(self.n) + 1e15

        for elt in list_vars[:self.n]:
            self.neighbours.append([elt])
        list_vars = list_vars[self.n:]
        
        for elt in list_vars:
            indice = 0
            scores = []
            for elt2 in self.neighbours:
                if len(elt2) < self.max_elts:
                    clf = LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
                    clf.fit(train[elt2 + [elt]], y)
                    scores.append(log_loss(y,clf.predict(train[elt2 + [elt]])))
                    indice = indice + 1
                else:
                    scores.append(lastscores[indice])
                    indice = indice + 1
            gains = lastscores - scores
            if gains.max() > 0:
                temp = gains.argmax()
                lastscores[temp] = scores[temp]
                self.neighbours[temp].append(elt)

        indice = 0
        for elt in self.neighbours:
            clf = LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
            clf.fit(train[elt], y)
            self.clfs.append(clf)
            indice = indice + 1
                    
    def transform(self, train):
        indice = 0
        for elt in self.neighbours:
            train['_'.join(pd.Series(elt).sort_values().values)] = self.clfs[indice].predict(train[elt])
            indice = indice + 1
        return train
    
    def fit_transform(self, train, y):
        self.fit(train, y)
        return self.transform(train)

NameError: name 'TransformerMixin' is not defined

In [5]:
def preprocess_data(train, test):
    
    drop_columns = ['v8', 'v23', 'v25', 'v31', 'v36', 'v37', 'v46', 'v51', 'v53', 'v54', 'v63', 'v73', 'v75',
                    'v79', 'v81', 'v82', 'v89', 'v92', 'v95', 'v105', 'v107', 'v108', 'v109', 'v110', 'v116', 'v117',
                    'v118', 'v119', 'v123', 'v124', 'v128']
    train.drop(drop_columns, axis=1, inplace=True)
    test.drop(drop_columns, axis=1, inplace=True)
    
    naive_vars = ['v24', 'v112', 'v30', 'v91', 'v56', 'v74', 'v125', 'v71', 'v113', 'v47', 'v3', 'v66']
    
    train, test = factorize(train, test)
    train, test = add_bernulli(train, test, naive_vars)
    
    return train, test

In [6]:
def prepare_data(train, test):
    trainids = train.ID.values
    testids = test.ID.values
    targets = train['target'].values
    tokeep = [ 'v3', 'v10', 'v12', 'v14', 'v21', 'v22', 'v24', 'v30', 'v31', 'v34', 'v38', 'v40',
              'v50', 'v52', 'v56', 'v62', 'v66', 'v71', 'v72', 'v74', 'v75', 'v79', 'v91', 'v47',  
              'v112', 'v113', 'v114', 'v125', 'v129']
    features = train.columns[2:]
    todrop = list(set(features).difference(tokeep))
    train.drop(todrop, inplace=True, axis=1)
    test.drop(todrop, inplace=True, axis=1)
    features = train.columns[2:]
    for col in features:
        if((train[col].dtype == 'object')):
            train.loc[~train[col].isin(test[col]), col] = 'Orphans'
            test.loc[~test[col].isin(train[col]), col] = 'Orphans'
            train[col].fillna('Missing', inplace=True)
            test[col].fillna('Missing', inplace=True)
            train[col], tmp_indexer = pd.factorize(train[col])
            test[col] = tmp_indexer.get_indexer(test[col])
            traincounts = train[col].value_counts().reset_index()
            traincounts.rename(columns={'index': col, col: col+'_count'}, inplace=True)
            traincounts = traincounts[traincounts[col+'_count'] >= 50]
            g = train[[col, 'target']].copy().groupby(col).mean().reset_index()
            g = g[g[col].isin(traincounts[col])]
            g.rename(columns={'target': col+'_avg'}, inplace=True)
            train = train.merge(g, how='left', on=col)
            test = test.merge(g, how='left', on=col)
            h = train[[col, 'target']].copy().groupby(col).std().reset_index()
            h = h[h[col].isin(traincounts[col])]
            h.rename(columns={'target': col+'_std'}, inplace=True)
            train = train.merge(h, how='left', on=col)
            test = test.merge(h, how='left', on=col)
            train.drop(col, inplace=True, axis=1)
            test.drop(col, inplace=True, axis=1)

    features = train.columns[2:]
    train.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)
    train[features] = train[features].astype(float)
    test[features] = test[features].astype(float)
    ss = StandardScaler()
    train[features] = np.round(ss.fit_transform(train[features].values), 6)
    test[features] = np.round(ss.transform(test[features].values), 6)
    gptrain = pd.DataFrame()
    gptest = pd.DataFrame()
    gptrain.insert(0, 'ID', trainids)
    gptest.insert(0, 'ID', testids)
    gptrain = pd.merge(gptrain, train[list(['ID'])+list(features)], on='ID')
    gptest = pd.merge(gptest, test[list(['ID'])+list(features)], on='ID')
    gptrain['TARGET'] = targets
    del train
    del test
    gc.collect()
    
    return gptrain, gptest

In [7]:
y_train = pd.read_csv('train.csv')['target'].values
cv = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=115)

### Stacking: 1 level

In [8]:
skf = list(StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=115))

#### Base

In [11]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y_train = train['target'].values
id_test = test['ID'].values

train.drop(['ID', 'target'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

X_train, X_test = preprocess_data(train, test)

X_train_n = np.array(X_train, dtype=np.float32)
X_test_n = np.array(X_test, dtype=np.float32)

In [14]:
pca = PCA(n_components=50)

In [15]:
pca.fit(X_train_n)

PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [25]:
b = pca.components_.T

In [42]:
lda = LinearDiscriminantAnalysis(n_components= 50)

In [43]:
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=50, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [56]:
lda.coef_.shape

(1, 112)

In [54]:
lda.explained_variance_ratio_

array([ 1.])

In [9]:
clfs = [ExtraTreesClassifier(n_estimators=800, criterion='entropy', max_depth=37, max_features=25, 
                             min_samples_split=4, min_samples_leaf=2, n_jobs=-1, random_state=888),                  
        XGBClassifier(n_estimators=600, learning_rate=0.03, max_depth=10, colsample_bytree=0.4, 
                      min_child_weight=1, seed=88888), 
        RandomForestClassifier(n_estimators=800, criterion='gini')]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_1 = pd.DataFrame(meta_train, index=X_train.index, columns=['base_et', 'base_xgb', 'base_rf'])
meta_test_1 = pd.DataFrame(meta_test, index=X_test.index, columns=['base_et', 'base_xgb', 'base_rf'])

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 3
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


#### Base Logistic Regression

In [10]:
ss = StandardScaler()
X_train_n = ss.fit_transform(X_train_n)
X_test_n = ss.transform(X_test_n)

In [18]:
clfs = [LogisticRegression(C=1.0, penalty='l2', n_jobs=-1)]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_2 = pd.DataFrame(meta_train, index=X_train.index, columns=['base_lr'])
meta_test_2 = pd.DataFrame(meta_test, index=X_test.index, columns=['base_lr'])

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


#### One-hot encodered features only

In [19]:
values, counts = np.unique(X_train.v22, return_counts=True)
counts = {x : y for x, y in zip(values, counts)}
X_train.v22 = X_train.v22.apply(lambda x: x if counts.get(x, 0) > 50 else 0)
X_test.v22 = X_test.v22.apply(lambda x: x if counts.get(x, 0) > 50 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [20]:
cat_vars = ['v24', 'v112', 'v30', 'v91', 'v52', 'v56', 'v74', 'v125', 'v71', 'v113', 'v47', 'v3', 'v66']

In [21]:
cat_vars += ['v22']
X_train = X_train[cat_vars]
X_test = X_test[cat_vars]
gc.collect()

data = pd.concat((X_train, X_test), axis=0, ignore_index=True)
for col in cat_vars:
    data = pd.concat((data, pd.get_dummies(data[col], prefix=col)), axis=1)
    data.drop(col, axis=1, inplace=True)

X_train_n = np.array(data, dtype=np.float32)[:X_train.shape[0]]
X_test_n = np.array(data, dtype=np.float32)[X_train.shape[0]:]
gc.collect()

35

In [22]:
clfs = [ExtraTreesClassifier(n_estimators=500, criterion='entropy', max_depth=45, max_features=30, 
                              random_state=333, n_jobs=-1),
        XGBClassifier(max_depth=8, learning_rate=0.05, n_estimators=600, min_child_weight=5, 
                      subsample=0.95, colsample_bytree=0.35, seed=901),
        RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=50, max_features=20, 
                              random_state=333, n_jobs=-1),
        LogisticRegression(C=0.05, penalty='l2', n_jobs=-1)]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_3 = pd.DataFrame(meta_train, index=X_train.index, columns=['ohe_et', 'ohe_xgb', 'ohe_rf', 'ohe_lr'])
meta_test_3 = pd.DataFrame(meta_test, index=X_test.index, columns=['ohe_et', 'ohe_xgb', 'ohe_rf', 'ohe_lr'])

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 3
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 4
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


#### Neighbors linear regressions

In [29]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.drop(['ID', 'target'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

train, test = factorize(train, test)

In [27]:
# add more 4 variables from columns v22, transform LabelEnconding values of columns into Unicode values
train['v22-1'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
test['v22-1'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
train['v22-2'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
test['v22-2'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
train['v22-3'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
test['v22-3'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
train['v22-4'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))
test['v22-4'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))

drop_list=['v91','v1', 'v8', 'v10', 'v15', 'v17', 'v25', 'v29', 'v34', 'v41', 
           'v46', 'v54', 'v64', 'v67', 'v97', 'v105', 'v111', 'v122']

train = train.drop(drop_list,axis=1).fillna(-2)
test = test.drop(drop_list,axis=1).fillna(-2)

In [33]:
rnd = 12
random.seed(rnd)
n_ft = 20 # Number of features to add
max_elts = 3 # Maximum size of a group of linear features

a = addNearestNeighbourLinearFeatures(n_neighbours=n_ft, max_elts=max_elts, verbose=False, random_state=rnd)
a.fit(train, y_train)

train = a.transform(train)
test = a.transform(test)

X_train_n = np.array(train, dtype=float)
X_test_n = np.array(test, dtype=float)

In [34]:
train.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v123_v68_v76,v20_v39_v98,v118_v17_v60,v130_v23_v51,v117_v16_v63,v102_v128_v5,v43_v67_v73,v35_v58_v75,v127_v129_v87,v116_v33_v42
0,1.335739,8.727474,0,3.921026,7.915266,2.599278,3.176895,0.012941,9.999999,0.503281,...,0.692118,0.35129,0.330708,1.107842,0.584594,0.546231,0.94773,0.458984,0.432507,1.015836
1,-2.0,-2.0,0,-2.0,9.191265,-2.0,-2.0,2.30163,-2.0,1.31291,...,0.501866,0.231017,0.668665,0.393885,0.59725,0.743152,0.61985,-0.103345,0.881238,0.655353
2,0.943877,5.310079,0,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,0.765864,...,0.305699,-0.17819,0.034876,1.452979,0.311165,0.490311,0.31454,0.991893,0.917135,0.507833
3,0.797415,8.304757,0,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,6.542669,...,1.03616,0.86873,0.745239,0.28892,0.431143,1.212713,1.028347,0.483741,0.848629,1.037825
4,-2.0,-2.0,0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,1.050328,...,0.501866,0.190606,0.668665,0.393885,0.043838,0.495921,0.61985,-0.103345,0.022069,0.655353


In [26]:
clfs = [ExtraTreesClassifier(n_estimators=500, criterion='entropy', max_depth=31, max_features=50, 
                             min_samples_split=2, min_samples_leaf=2, random_state=610, n_jobs=-1),
        XGBClassifier(n_estimators=350, learning_rate=0.05, max_depth=10, min_child_weight=5, 
                      subsample=1.0, colsample_bytree=0.4, seed=120),
        RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=15, max_features=30, 
                            random_state=123, n_jobs=-1)]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_4 = pd.DataFrame(meta_train, index=X_train.index, columns=['nnlr_et', 'nnlr_xgb', 'nnlr_rf'])
meta_test_4 = pd.DataFrame(meta_test, index=X_test.index, columns=['nnlr_et', 'nnlr_xgb', 'nnlr_rf'])

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 3
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


#### Another one script

In [27]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train, test = prepare_data(train, test)

In [28]:
train.drop(['ID', 'TARGET'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

X_train_n = np.array(train, dtype=np.float32)
X_test_n = np.array(test, dtype=np.float32)

In [29]:
clfs = [ExtraTreesClassifier(n_estimators=500, criterion='entropy', max_depth=19, max_features=20, 
                              random_state=671, n_jobs=-1),
        XGBClassifier(n_estimators=280, learning_rate=0.05, max_depth=8, min_child_weight=5, 
                      subsample=0.9, colsample_bytree=0.45, seed=511),
        RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=12, max_features=15,
                           random_state=181, n_jobs=-1)]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_5 = pd.DataFrame(meta_train, index=X_train.index, columns=['small_et', 'small_xgb', 'small_rf'])
meta_test_5 = pd.DataFrame(meta_test, index=X_test.index, columns=['small_et', 'small_xgb', 'small_rf'])

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 3
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


#### Concat

In [30]:
meta_train_level1 = pd.concat((meta_train_1, meta_train_2, meta_train_3, meta_train_4, meta_train_5), axis=1)
meta_test_level1 = pd.concat((meta_test_1, meta_test_2, meta_test_3, meta_test_4, meta_test_5), axis=1)

In [32]:
meta_train_level1.to_csv('meta_train_level1.csv', index=False)
meta_test_level1.to_csv('meta_test_level1.csv', index=False)

In [2]:
meta_train_level1 = pd.read_csv('meta_train_level1.csv')
meta_test_level1 = pd.read_csv('meta_test_level1.csv')

### Stacking: 2 level

####  Base

In [34]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y_train = train['target'].values
id_test = test['ID'].values

train.drop(['ID', 'target'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

X_train, X_test = preprocess_data(train, test)

X_train_2 = pd.concat((X_train, meta_train_level1), axis=1)
X_test_2 = pd.concat((X_test, meta_test_level1), axis=1)

X_train_n = np.array(X_train_2, dtype=np.float32)
X_test_n = np.array(X_test_2, dtype=np.float32)

In [35]:
clfs = [ExtraTreesClassifier(n_estimators=1000, max_features=25, criterion='entropy', min_samples_split=2, 
                             max_depth=36, min_samples_leaf=2, n_jobs=-1, random_state=888),
        XGBClassifier(n_estimators=600, learning_rate=0.03, max_depth=10, colsample_bytree=0.4, 
                      min_child_weight=1, seed=129)]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_0 = pd.DataFrame(meta_train, index=X_train.index, columns=['main_et', 'main_xgb'])
meta_test_0 = pd.DataFrame(meta_test, index=X_test.index, columns=['main_et', 'main_xgb'])

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


#### Neighbors linear regressions

In [35]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.drop(['ID', 'target'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

train, test = factorize(train, test)

In [36]:
train['v22-1'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
test['v22-1'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
train['v22-2'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
test['v22-2'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
train['v22-3'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
test['v22-3'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
train['v22-4'] = train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))
test['v22-4'] = test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))

drop_list=['v91','v1', 'v8', 'v10', 'v15', 'v17', 'v25', 'v29', 'v34', 'v41', 
           'v46', 'v54', 'v64', 'v67', 'v97', 'v105', 'v111', 'v122']

train = train.drop(drop_list,axis=1).fillna(-2)
test = test.drop(drop_list,axis=1).fillna(-2)

In [41]:
list_vars = list(train.columns)
random.shuffle(list_vars)
neighbours = []
max_elts = 3
clfs = []
scores = []

lastscores = np.zeros(20) + 1e15

for elt in list_vars[:20]:
    neighbours.append([elt])
list_vars = list_vars[20:]

In [None]:
rnd = 12
random.seed(rnd)
n_ft = 20 # Number of features to add
max_elts = 3 # Maximum size of a group of linear features

a = addNearestNeighbourLinearFeatures(n_neighbours=n_ft, max_elts=max_elts, verbose=False, random_state=rnd)
a.fit(train, y_train)

train = a.transform(train)
test = a.transform(test)

In [None]:
X_train_2 = pd.concat((train, meta_train_level1), axis=1)
X_test_2 = pd.concat((test, meta_test_level1), axis=1)

X_train_n = np.array(X_train_2, dtype=np.float32)
X_test_n = np.array(X_test_2, dtype=np.float32)

In [None]:
clfs = [ExtraTreesClassifier(n_estimators=500, criterion='entropy', max_depth=32, max_features=50, 
                             min_samples_split=2, min_samples_leaf=2, random_state=171, n_jobs=-1),
        XGBClassifier(n_estimators=330, learning_rate=0.03, max_depth=8, colsample_bytree=0.45, 
                      min_child_weight=5, subsample=1.0, seed=666)]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_00 = pd.DataFrame(meta_train, index=train.index, columns=['main_nnlr_et', 'main_nnlr_xgb'])
meta_test_00 = pd.DataFrame(meta_test, index=test.index, columns=['main_nnlr_et', 'main_nnlr_xgb'])

In [41]:
meta_train_00 = pd.read_csv('meta_train_00.csv')
meta_test_00 = pd.read_csv('meta_test_00.csv')

#### Another one script

In [36]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train, test = prepare_data(train, test)

In [37]:
train.drop(['ID', 'TARGET'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

In [38]:
X_train_2 = pd.concat((train, meta_train_level1), axis=1)
X_test_2 = pd.concat((test, meta_test_level1), axis=1)

In [39]:
X_train_n = np.array(X_train_2, dtype=np.float32)
X_test_n = np.array(X_test_2, dtype=np.float32)

In [40]:
clfs = [ExtraTreesClassifier(n_estimators=500, criterion='entropy', max_depth=14, max_features=32, 
                             min_samples_split=2, min_samples_leaf=3, random_state=10, n_jobs=-1),
        XGBClassifier(n_estimators=650, learning_rate=0.01, max_depth=7, colsample_bytree=0.4,
              min_child_weight=5, subsample=1.0, seed=89)]

meta_train, meta_test = stacking(X_train_n, X_test_n, y_train, skf, clfs)

meta_train_000 = pd.DataFrame(meta_train, index=train.index, columns=['main_small_et', 'main_small_xgb'])
meta_test_000 = pd.DataFrame(meta_test, index=test.index, columns=['main_small_et', 'main_small_xgb'])

Clf 1
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Clf 2
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


#### Concat

In [42]:
meta_train_level2 = pd.concat((meta_train_0, meta_train_00, meta_train_000), axis=1)
meta_test_level2 = pd.concat((meta_test_0, meta_test_00, meta_test_000), axis=1)

In [43]:
meta_train_level2.to_csv('meta_train_level2.csv', index=False)
meta_test_level2.to_csv('meta_test_level2.csv', index=False)

In [3]:
meta_train_level2 = pd.read_csv('meta_train_level2.csv')
meta_test_level2 = pd.read_csv('meta_test_level2.csv')

### Final model

In [5]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train, X_test = prepare_data(train, test)

X_train.drop(['ID', 'TARGET'], axis=1, inplace=True)
X_test.drop(['ID'], axis=1, inplace=True)

NameError: name 'prepare_data' is not defined

In [None]:
id_test = test['ID'].values

In [7]:
X_train_3 = pd.concat((X_train, meta_train_level1, meta_train_level2), axis=1)
X_test_3 = pd.concat((X_test, meta_test_level1, meta_test_level2), axis=1)

In [9]:
clf = CalibratedClassifierCV(ExtraTreesClassifier(n_estimators=2000, max_features=30, criterion='entropy', 
                                                  min_samples_split=2, max_depth=14, min_samples_leaf=2, 
                                                  n_jobs=-1, random_state=190), method='isotonic', cv= cv)

clf.fit(X_train_3, y_train) 
y_pred = clf.predict_proba(X_test_3)[:, 1]

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred}).to_csv('bnp_predict_6.csv', index=False)

NameError: name 'id_test' is not defined

In [11]:
pd.DataFrame({"ID": id_test, "PredictedProb": y_pred}).to_csv('bnp_predict_6.csv', index=False)