In [292]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import KFold,StratifiedKFold

In [293]:
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [294]:
ADD_FEATURES = True
USE_STACKING = False

X_test = test_df.drop('ID_code',axis=1)
X = train_df.drop(['ID_code','target'],axis=1)
y = train_df['target']

In [295]:
def add_feature(o, r, f, s, t):
    if s == t:
        return
    if s > t:
        (s, t) = (t, s)
    name = f + "_" + str(s) + "_" + str(t)
    if f == "mean":
        o[name] = r.iloc[:,[s,t]].mean(axis=1)
    if f == "max":
        o[name] = r.iloc[:,[s,t]].max(axis=1)
    if f == "min":
        o[name] = r.iloc[:,[s,t]].min(axis=1)
    if f == "std":
        o[name] = r.iloc[:,[s,t]].std(axis=1)
    if f == "mean_dev_std":
        o[name] = r.iloc[:,[s,t]].mean(axis=1) / r.iloc[:,[s,t]].std(axis=1)

def add_features_both(f, s, t):
    add_feature(test_df, X_test, f, s, t)
    add_feature(train_df, X, f, s, t)

def add_features_all(s, t):
    if t >= 199:
        t = 199
    add_features_both("mean", s, t)
    add_features_both("max", s, t)
    add_features_both("min", s, t)
    add_features_both("std", s, t)
    add_features_both("mean_dev_std", s, t)

In [296]:
if ADD_FEATURES:
    add_features_all(0, 199)
    for i in range(0,181,30):
        add_features_all(i, i + 29)
    

In [297]:
test_df.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,mean_150_179,max_150_179,min_150_179,std_150_179,mean_dev_std_150_179,mean_180_199,max_180_199,min_180_199,std_180_199,mean_dev_std_180_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,10.53335,18.7741,2.2926,11.65418,0.903826,-6.62775,-4.5358,-8.7197,2.958464,-2.240267
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,11.9782,18.7454,5.211,9.570266,1.251606,-10.28365,0.4087,-20.976,15.121266,-0.680079
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,9.4521,14.4431,4.4611,7.05834,1.339139,-13.13145,-3.0835,-23.1794,14.209947,-0.924103
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,7.40625,13.7106,1.1019,8.915697,0.830698,2.6241,9.459,-4.2108,9.666008,0.271477
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,13.06955,20.0502,6.0889,9.87213,1.323884,-5.2733,-1.362,-9.1846,5.531414,-0.953337


In [298]:
train_df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,mean_150_179,max_150_179,min_150_179,std_150_179,mean_dev_std_150_179,mean_180_199,max_180_199,min_180_199,std_180_199,mean_dev_std_180_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,10.11895,18.5177,1.7202,11.877626,0.851934,-4.74825,-1.0914,-8.4051,5.171567,-0.918145
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,15.8523,20.6294,11.0752,6.75584,2.346459,-1.82095,1.9518,-5.5937,5.335474,-0.341291
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,8.36715,14.333,2.4013,8.436986,0.991723,-1.9485,0.3965,-4.2935,3.316331,-0.587547
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,11.89725,18.0984,5.6961,8.76975,1.356624,-6.3771,-3.7546,-8.9996,3.708775,-1.719463
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,12.6602,20.1461,5.1743,10.586661,1.195863,-8.2521,-7.6938,-8.8104,0.789555,-10.451578


In [299]:
X_tophalf, _, y_tophalf, _ = train_test_split(X, y, test_size=0.8, random_state=10)

In [300]:
if USE_STACKING:
    pred_cv = np.zeros(len(X))
    pred = np.zeros(len(X_test))

In [301]:
if USE_STACKING:
    # results from other models
    models = [
        ('lg', SGDClassifier(loss='log', max_iter=5000, tol=1e-7, alpha=0.3)),
        ('mlp', MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(5, 2), random_state=1)),
        ('rf', RandomForestClassifier(n_estimators=10, criterion='entropy')),
        ('gnb', GaussianNB()),
        ('qda', QuadraticDiscriminantAnalysis(tol=1e-12)),
    ]

    train_models_out = pd.DataFrame(index=train_df.iloc[:,0])
    test_models_out = pd.DataFrame(index=test_df.iloc[:,0])

    for model in models:
        name = model[0]
        cls = model[1]
        print('working on ' + name)

        # k-fold
        n_splits = 5
        folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        pred_cv = np.zeros(len(X))
        pred = np.zeros(len(X_test))

        for fold_n, (train_index, valid_index) in enumerate(folds.split(X_tophalf,y_tophalf)):
            print("fold: " + str(fold_n))
            X_train, X_valid = X_tophalf.iloc[train_index], X_tophalf.iloc[valid_index]
            y_train, y_valid = y_tophalf.iloc[train_index], y_tophalf.iloc[valid_index]

            cls.fit(X_train, y_train)
            pred_cv += cls.predict_proba(X)[:,1] / n_splits
            pred += cls.predict_proba(X_test)[:,1] / n_splits

        print(pred_cv)

        train_models_out[name] = pred_cv
        test_models_out[name] = pred

In [302]:
if USE_STACKING:
    # add statistics
    num = len(models);
    train_models_out['models_mean'] = train_models_out.iloc[:,[0,num-1]].mean(axis=1)
    train_models_out['models_min'] = train_models_out.iloc[:,[0,num-1]].min(axis=1)
    train_models_out['models_max'] = train_models_out.iloc[:,[0,num-1]].max(axis=1)
    test_models_out['models_mean'] = test_models_out.iloc[:,[0,num-1]].mean(axis=1)
    test_models_out['models_min'] = test_models_out.iloc[:,[0,num-1]].min(axis=1)
    test_models_out['models_max'] = test_models_out.iloc[:,[0,num-1]].max(axis=1)

In [303]:
if USE_STACKING:
    train_models_out.head()

In [304]:
if USE_STACKING:
    test_models_out.head()

In [305]:
if USE_STACKING:
    # add to data
    for col in train_models_out:
        train_df[col] = train_models_out[col].values
        test_df[col] = test_models_out[col].values

In [306]:
train_df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,mean_150_179,max_150_179,min_150_179,std_150_179,mean_dev_std_150_179,mean_180_199,max_180_199,min_180_199,std_180_199,mean_dev_std_180_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,10.11895,18.5177,1.7202,11.877626,0.851934,-4.74825,-1.0914,-8.4051,5.171567,-0.918145
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,15.8523,20.6294,11.0752,6.75584,2.346459,-1.82095,1.9518,-5.5937,5.335474,-0.341291
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,8.36715,14.333,2.4013,8.436986,0.991723,-1.9485,0.3965,-4.2935,3.316331,-0.587547
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,11.89725,18.0984,5.6961,8.76975,1.356624,-6.3771,-3.7546,-8.9996,3.708775,-1.719463
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,12.6602,20.1461,5.1743,10.586661,1.195863,-8.2521,-7.6938,-8.8104,0.789555,-10.451578


In [307]:
test_df.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,mean_150_179,max_150_179,min_150_179,std_150_179,mean_dev_std_150_179,mean_180_199,max_180_199,min_180_199,std_180_199,mean_dev_std_180_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,10.53335,18.7741,2.2926,11.65418,0.903826,-6.62775,-4.5358,-8.7197,2.958464,-2.240267
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,11.9782,18.7454,5.211,9.570266,1.251606,-10.28365,0.4087,-20.976,15.121266,-0.680079
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,9.4521,14.4431,4.4611,7.05834,1.339139,-13.13145,-3.0835,-23.1794,14.209947,-0.924103
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,7.40625,13.7106,1.1019,8.915697,0.830698,2.6241,9.459,-4.2108,9.666008,0.271477
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,13.06955,20.0502,6.0889,9.87213,1.323884,-5.2733,-1.362,-9.1846,5.531414,-0.953337


In [308]:
train_df.to_csv("input/train_more_features.csv", index=False)
test_df.to_csv("input/test_more_features.csv", index=False)