In [198]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import time
import csv

In [199]:
from sklearn.model_selection import KFold
def cross_validation(folds, data, labels, error_function, model, **model_args):
    kf = KFold(n_splits=folds, random_state=None, shuffle = True)
    score = []
    y = labels
    if model == LogisticRegression:
        M = model(max_iter = 20**9)
    else:
        M = model()
    predicted_lable = pd.DataFrame(columns = ['prediction'])
    for train_index,test_index in kf.split(data):
        x_training_set = data.iloc[train_index]
        y_training_set = y.iloc[train_index]
        x_test_set = data.iloc[test_index]
        y_test_set = y.iloc[test_index]
        M.fit(np.array(x_training_set), np.array(y_training_set).T[0])
        y_pred = M.predict_proba(np.array(x_test_set))
        y_pred = [x[1] for x in y_pred]
        score.append(error_function(y_test_set,y_pred)) 
        print('.',end=' ')
    average_error = round(sum(score)/folds,4)
    return average_error

In [200]:
def level_1_predictor(t_train,y, model):
    leng = int(len(t_train)/2)
    t_a, t_b= t_train[:leng], t_train[leng:]
    tg_a, tg_b = y[:leng], y[leng:]
    if model == LogisticRegression:
        m1 = model(max_iter = 20**9)
        m2 = model(max_iter = 20**9)
    else:
        m1 = model()
        m2 = model()

    m1.fit(t_a,tg_a)
    tp_b = [i[1] for i in m1.predict_proba(t_b)]
    m2.fit(t_b,tg_b)
    tp_a = [i[1] for i in m2.predict_proba(t_a)]
    tp = [[x] for x in np.hstack((tp_a,tp_b))]
    
    return tp

In [201]:
def get_important_feature(model,n):
    file = pd.read_csv('important_l1.csv')
    feature_name = file.columns
    if model == GradientBoostingClassifier:
        importance = file.iloc[0]
    if model == LogisticRegression:
        importance = file.iloc[1]
    itr = 0
    important_feature = []
    unimportant_feature = []
    for x in importance.iloc[0]:
        if x>n:
            important_feature.append(feature_name[itr])
        else:
            unimportant_feature.append(feature_name[itr])
        itr+=1
    return important_feature, unimportant_feature

In [202]:
def stacker(feature,feature_sets):
    column_name = list(feature_sets.keys())
    r = pd.read_csv('model_for_stacking_l1.csv')
    stack_feature = pd.DataFrame(columns = column_name)
    for x in column_name:
        print(x, end='-->')
        best_stacking_model = r[x].value_counts().keys()[0]
        stacking_feature = np.array(feature[feature_sets[x]])
        stack_feature[x] = level_1_predictor(stacking_feature,np_label.T[0], model_dic[best_stacking_model])
        print(best_stacking_model)
    return stack_feature

In [203]:
def categorize_important_feature(n):
    important_feature, unimportant_feature = get_important_feature(n)
    delinquency=[]
    spend = []
    payment = []
    balance = []
    risk = []
    std = []
    mean = []
    _25 = []
    _50 = []
    _75 = []
    _min = []
    _max = []

    for x in important_feature:
        if x[0]=='D':
            delinquency.append(x)
        if x[0]=='S':
            spend.append(x)
        if x[0]=='P':
            payment.append(x)
        if x[0]=='B':
            balance.append(x)
        if x[0]=='R':
            risk.append(x)
        if x[-3:]=='std':
            std.append(x)
        elif x[-3:]=='25%':
            _25.append(x)
        elif x[-3:]=='75%':
            _75.append(x)
        elif x[-3:]=='50%':
            _50.append(x)
        elif x[-3:]=='min':
            _min.append(x)
        elif x[-3:]=='max':
            _max.append(x)
        else:
            mean.append(x)  
    feature_sets = {'delinquency':delinquency,'spend':spend,'payment':payment,'balance':balance,'risk':risk,'std':std,
                    'mean':mean,'_25':_25,'_50':_50,'_75':_75,'_min':_min,'_max':_max, 'unimportant_fe':unimportant_feature}
    return feature_sets

In [204]:
df= pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_9.csv')

In [205]:
feature = df.iloc[:,1:-1]
label = pd.DataFrame(df['target'], columns=['target'])
np_label = np.array(label)
feature_name = feature.columns

In [206]:
important_feature, unimportant_feature = get_important_feature(0)

In [207]:
feature_sets = categorize_important_feature(0)

In [208]:
models = [GradientBoostingClassifier,RandomForestClassifier,LogisticRegression]

In [209]:
model_dic ={}
for x in models:
    model_dic[x.__name__] = x

In [210]:
for x in feature_name:
    feature[x]=feature[x].fillna(feature[x].mean())

In [96]:
stack_feature = stacker(feature, feature_sets)

delinquency-->GradientBoostingClassifier
spend-->GradientBoostingClassifier
payment-->LogisticRegression
balance-->RandomForestClassifier
risk-->RandomForestClassifier
std-->GradientBoostingClassifier
mean-->GradientBoostingClassifier
_25-->GradientBoostingClassifier
_50-->GradientBoostingClassifier
_75-->GradientBoostingClassifier
_min-->GradientBoostingClassifier
_max-->LogisticRegression
unimportant_fe-->GradientBoostingClassifier


In [97]:
imp,unimp = get_important_feature(5)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(feature[important_feature],label, test_size=0.2, random_state=11)
for x in models:
    print(x.__name__,cross_validation(5,X_train, y_train, roc_auc_score,x))

. . . . . GradientBoostingClassifier 0.9463
. . . . . RandomForestClassifier 0.9426
. . . . . LogisticRegression 0.9374


In [175]:
i = pd.read_csv('important_l1.csv')

In [179]:
i.index

RangeIndex(start=0, stop=2, step=1)

In [145]:
i.iloc[0]

P_2_mean     4
D_39_mean    6
B_1_mean     4
B_2_mean     2
R_1_mean     3
            ..
D_141_max    0
D_142_max    1
D_143_max    1
D_144_max    1
D_145_max    1
Name: GradientBoostingClassifier, Length: 1302, dtype: int64

In [190]:
j = pd.DataFrame(index=['GradientBoostingClassifier','LogisticRegression'],columns = [i.columns])

In [191]:
j.iloc[0] = i.iloc[0]

In [192]:
j = j.iloc[:,1:]

In [193]:
j

Unnamed: 0,P_2_mean,D_39_mean,B_1_mean,B_2_mean,R_1_mean,S_3_mean,D_41_mean,B_3_mean,D_42_mean,D_43_mean,...,D_136_max,D_137_max,D_138_max,D_139_max,D_140_max,D_141_max,D_142_max,D_143_max,D_144_max,D_145_max
GradientBoostingClassifier,4.0,6.0,4.0,2.0,3.0,4.0,0.0,1.0,6.0,5.0,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
LogisticRegression,,,,,,,,,,,...,,,,,,,,,,


In [173]:
j.iloc[0] = i.iloc[0]

IndexError: iloc cannot enlarge its target object

In [148]:
for x in i.columns:
    j[x].iloc[1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  j[x].iloc[1] = 0


KeyboardInterrupt: 

In [169]:
j['P_2_mean'].iloc[1] =[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  j['P_2_mean'].iloc[1] =[0]


In [194]:
j.iloc[1] = [0 for x in j.columns]

In [195]:
j.to_csv('important_l1.csv', index = True)

In [196]:
j

Unnamed: 0,P_2_mean,D_39_mean,B_1_mean,B_2_mean,R_1_mean,S_3_mean,D_41_mean,B_3_mean,D_42_mean,D_43_mean,...,D_136_max,D_137_max,D_138_max,D_139_max,D_140_max,D_141_max,D_142_max,D_143_max,D_144_max,D_145_max
GradientBoostingClassifier,4,6,4,2,3,4,0,1,6,5,...,0,2,2,0,0,0,1,1,1,1
LogisticRegression,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
