In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import time
import csv

In [2]:
def level_1_predictor(t_train,y, model):
    """Produce cross-predicted positive-class scores from a two-way split.

    The rows are cut at the midpoint, in their given order. One estimator is
    fitted on the first part and scores the second part; a second estimator is
    fitted on the second part and scores the first part.

    Parameters
    ----------
    t_train : sliceable, row-indexed features (anything supporting ``len`` and
        ``[:k]`` slicing that the estimator accepts).
    y : sliceable labels aligned row-for-row with ``t_train``.
    model : estimator class. It is instantiated here and must provide ``fit``
        and ``predict_proba``.

    Returns
    -------
    list
        One single-element list per input row, in the original row order,
        holding column 1 of ``predict_proba`` for that row.
    """
    split_at = int(len(t_train)/2)
    rows_a, rows_b = t_train[:split_at], t_train[split_at:]
    labels_a, labels_b = y[:split_at], y[split_at:]

    # LogisticRegression is built with a very large iteration cap; every other
    # estimator class is built with its defaults.
    ctor_kwargs = {'max_iter': 20**9} if model == LogisticRegression else {}
    fold_a_model = model(**ctor_kwargs)
    fold_b_model = model(**ctor_kwargs)

    # Fit on one part, score the other; the fit/predict order is kept as-is.
    fold_a_model.fit(rows_a, labels_a)
    scores_b = [proba[1] for proba in fold_a_model.predict_proba(rows_b)]
    fold_b_model.fit(rows_b, labels_b)
    scores_a = [proba[1] for proba in fold_b_model.predict_proba(rows_a)]

    # First-part scores go first so the output lines up with the input rows.
    return [[score] for score in np.hstack((scores_a, scores_b))]

In [16]:
def get_important_feature(model,s=1):
    """Split the columns of ``important_l1.csv`` by an importance threshold.

    Row 0 of the file is used for ``GradientBoostingClassifier`` and row 1 for
    ``LogisticRegression``. A column whose value in that row is strictly
    greater than ``s`` is reported as important; every other column is
    reported as unimportant.

    Parameters
    ----------
    model : estimator class selecting which row of the file to read.
    s : threshold a value must exceed to count as important (default 1).

    Returns
    -------
    tuple
        ``(important_feature, unimportant_feature)`` — two lists of column
        names, each in the file's column order.

    Raises
    ------
    ValueError
        If ``model`` is neither ``GradientBoostingClassifier`` nor
        ``LogisticRegression``.
    """
    # Resolve the row up front so an unsupported model fails with a clear
    # message instead of an unbound-variable error further down.
    if model == GradientBoostingClassifier:
        row = 0
    elif model == LogisticRegression:
        row = 1
    else:
        raise ValueError(
            'no importance row for model %r; expected GradientBoostingClassifier '
            'or LogisticRegression' % (getattr(model, '__name__', model),)
        )

    file = pd.read_csv('important_l1.csv')
    importance = file.iloc[row]

    important_feature = []
    unimportant_feature = []
    # Column names and the selected row share the same order, so pair them up.
    for name, value in zip(file.columns, importance):
        if value > s:
            important_feature.append(name)
        else:
            unimportant_feature.append(name)
    return important_feature, unimportant_feature

In [5]:
def categorize_important_feature(model):
    """Group the important features for ``model`` by name prefix and suffix.

    Each important feature name is filed twice:

    * by its first character — ``D`` delinquency, ``S`` spend, ``P`` payment,
      ``B`` balance, ``R`` risk (names with any other first character get no
      prefix group);
    * by its last three characters — ``std``, ``25%``, ``75%``, ``50%``,
      ``min``, ``max``; anything else is filed under ``mean``.

    Parameters
    ----------
    model : passed straight through to ``get_important_feature``.

    Returns
    -------
    dict
        Maps group name to a list of feature names, with the unimportant
        features under ``'unimportant_fe'`` as the final key.
    """
    important_feature, unimportant_feature = get_important_feature(model)

    by_first_char = {'D': 'delinquency', 'S': 'spend', 'P': 'payment',
                     'B': 'balance', 'R': 'risk'}
    by_last_three = {'std': 'std', '25%': '_25', '75%': '_75',
                     '50%': '_50', 'min': '_min', 'max': '_max'}

    # Key order is part of the result: callers iterate the dict in this order.
    group_order = ('delinquency', 'spend', 'payment', 'balance', 'risk', 'std',
                   'mean', '_25', '_50', '_75', '_min', '_max')
    feature_sets = {group: [] for group in group_order}

    for name in important_feature:
        prefix_group = by_first_char.get(name[0])
        if prefix_group is not None:
            feature_sets[prefix_group].append(name)
        # Unrecognised endings fall back to the 'mean' group.
        feature_sets[by_last_three.get(name[-3:], 'mean')].append(name)

    feature_sets['unimportant_fe'] = unimportant_feature
    return feature_sets

In [6]:
# For each feature group, keep the candidate model whose level-1 predictions score the highest ROC AUC; that model is the one used for stacking.
def training_stacker(feature, model):
    """Pick, per feature group, the candidate with the best level-1 AUC and log it.

    For every group returned by ``categorize_important_feature(model)``, each
    class in the module-level ``models`` list is run through
    ``level_1_predictor`` on that group's columns and scored with
    ``roc_auc_score`` against the module-level ``np_label``. The name of the
    top-scoring class is printed and recorded. The resulting one-row table is
    appended to ``l1_stacking_model_G.csv`` when ``model`` is
    ``GradientBoostingClassifier`` or to ``l1_stacking_model_L.csv`` when it is
    ``LogisticRegression``; the file is created if it does not exist yet.

    Parameters
    ----------
    feature : DataFrame holding (at least) every column named in the groups.
    model : estimator class used to select the feature groups and the log file.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the module-level ``models`` list is empty.
    """
    if not models:
        raise ValueError('`models` is empty; there is no candidate to compare')

    feature_sets = categorize_important_feature(model)
    dic_out = {}
    for group_name, group_columns in feature_sets.items():
        stacking_feature = np.array(feature[group_columns])
        print('--', group_name, end=' ')

        best_score = None
        model_for_stacking = None
        for candidate in models:
            l1_pred = level_1_predictor(stacking_feature, np_label.T[0], candidate)
            score = roc_auc_score(np_label, l1_pred)
            # `>=` keeps the established tie-break: on equal scores the later
            # candidate in `models` wins.
            if best_score is None or score >= best_score:
                best_score = score
                model_for_stacking = candidate.__name__

        print(model_for_stacking)
        dic_out[group_name] = [model_for_stacking]
    temp = pd.DataFrame(dic_out)

    if model == GradientBoostingClassifier:
        out_path = 'l1_stacking_model_G.csv'
    elif model == LogisticRegression:
        out_path = 'l1_stacking_model_L.csv'
    else:
        # No log file is associated with any other model.
        return

    # Append to the existing log, or start a new one on the very first run.
    try:
        history = pd.read_csv(out_path)
    except FileNotFoundError:
        history = None
    combined = temp if history is None else pd.concat((history, temp), axis = 0)
    combined.to_csv(out_path, index = False)
#GGLGRGLLLLLL
#GGLGRGLLLLGL

In [7]:
# Candidate estimator classes that training_stacker compares for every feature group.
models = [
    GradientBoostingClassifier,
    RandomForestClassifier,
    LogisticRegression,
]

In [9]:
# Run the stacking-model selection on each training shard, train_0.csv … train_5.csv.
for file in range(0,6):
    start_time = time.time()
    # NOTE(review): absolute, machine-specific path — consider a configurable data directory.
    df = pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'.format(file))
    # Predictors are every column except the first and the last.
    feature = df.iloc[:,1:-1]
    feature_name = feature.columns
    # Fill each column's missing values with that column's mean in a single
    # vectorised call; this builds a new frame instead of assigning column by
    # column into a slice of `df`.
    feature = feature.fillna(feature.mean())
    label = pd.DataFrame(df['target'], columns=['target'])
    # training_stacker reads this module-level array (one column of labels).
    np_label = np.array(label)
    training_stacker(feature,LogisticRegression)
    print(file, end=' ')
    print(" %s second " %(time.time()-start_time))

TypeError: 'numpy.int64' object is not iterable

In [17]:
# Inspect the (important, unimportant) column split used for LogisticRegression.
get_important_feature(model=LogisticRegression)

4
5
2
6
3
2
5
3
4
1
3
3
2
3
6
4
5
5
2
5
3
5
4
3
2
4
3
4
7
2
4
2
5
4
3
2
4
3
3
3
2
3
2
2
3
3
2
4
4
6
3
4
6
3
3
6
2
4
7
5
6
3
7
7
5
3
4
6
3
3
4
6
6
3
4
6
7
4
3
7
3
5
5
3
7
4
4
1
0
6
3
6
7
2
0
5
6
4
5
7
2
6
6
5
0
0
0
2
3
0
4
2
3
4
4
4
2
2
0
5
3
2
3
4
1
3
4
5
5
5
5
5
5
5
2
3
5
5
3
4
3
7
2
1
2
2
2
3
6
5
3
5
4
4
7
4
3
7
5
6
3
4
6
5
4
6
5
5
1
5
3
3
3
2
3
4
4
2
5
2
4
3
3
3
3
3
5
4
2
4
5
5
3
2
2
3
2
2
2
3
5
4
4
2
5
7
3
4
5
5
5
2
2
3
7
3
3
4
4
3
4
3
3
3
3
3
2
2
3
2
3
4
3
4
5
4
4
6
4
3
3
2
1
4
3
5
7
4
5
7
2
5
3
2
1
2
0
3
4
3
4
4
7
1
3
7
4
3
4
4
7
1
6
3
0
5
4
6
6
2
1
4
7
5
3
6
2
4
4
3
0
0
0
0
4
0
4
4
4
4
3
4
2
2
1
3
5
3
1
2
1
5
4
5
6
5
2
5
3
2
5
6
3
2
3
2
6
4
1
0
0
0
2
3
7
6
2
4
3
1
4
4
4
6
5
4
5
2
5
5
4
1
4
4
0
5
6
4
4
0
3
4
4
2
2
4
6
4
2
3
3
3
6
5
5
6
4
3
2
5
2
4
6
5
6
3
2
6
5
5
3
4
6
7
3
4
3
4
4
4
4
2
4
4
7
5
1
3
5
4
7
3
4
4
3
3
4
6
3
3
4
4
5
3
5
4
6
7
0
5
6
7
1
7
5
6
5
2
4
5
6
4
4
6
4
4
1
3
0
5
4
1
3
5
3
7
3
6
3
2
0
7
2
6
6
2
0
3
2
1
6
3
1
4
6
5
0
0
0
2
5
0
6
4
5
4
5
5
0
0
0
7
3
4
4
4
0
2
7
4


(['P_2_mean',
  'D_39_mean',
  'B_1_mean',
  'B_2_mean',
  'R_1_mean',
  'S_3_mean',
  'D_41_mean',
  'B_3_mean',
  'D_42_mean',
  'D_44_mean',
  'B_4_mean',
  'D_45_mean',
  'B_5_mean',
  'R_2_mean',
  'D_46_mean',
  'D_47_mean',
  'D_48_mean',
  'D_49_mean',
  'B_6_mean',
  'B_7_mean',
  'B_8_mean',
  'D_50_mean',
  'D_51_mean',
  'B_9_mean',
  'R_3_mean',
  'D_52_mean',
  'P_3_mean',
  'B_10_mean',
  'D_53_mean',
  'S_5_mean',
  'B_11_mean',
  'S_6_mean',
  'D_54_mean',
  'R_4_mean',
  'S_7_mean',
  'B_12_mean',
  'S_8_mean',
  'D_55_mean',
  'D_56_mean',
  'B_13_mean',
  'R_5_mean',
  'D_58_mean',
  'S_9_mean',
  'B_14_mean',
  'D_59_mean',
  'D_60_mean',
  'D_61_mean',
  'B_15_mean',
  'S_11_mean',
  'D_62_mean',
  'D_65_mean',
  'B_16_mean',
  'B_17_mean',
  'B_18_mean',
  'B_19_mean',
  'D_66_mean',
  'B_20_mean',
  'D_68_mean',
  'S_12_mean',
  'R_6_mean',
  'S_13_mean',
  'B_21_mean',
  'D_69_mean',
  'B_22_mean',
  'D_70_mean',
  'D_71_mean',
  'D_72_mean',
  'S_15_mean',
  '

In [13]:
# NOTE(review): `importance` is not assigned at module level by any cell shown here
# (inside get_important_feature it is only a local name), and the recorded run of this
# cell ended in a NameError. Define it before this line or drop the cell — TODO confirm intent.
importance.iloc[0]

NameError: name 'importance' is not defined

In [142]:
# NOTE(review): displays a module-level `importance` that no cell shown here defines.
# The recorded output is a table, so it was presumably a DataFrame in that session —
# TODO confirm where it is loaded so the cell survives a fresh kernel.
importance

Unnamed: 0,P_2_mean,D_39_mean,B_1_mean,B_2_mean,R_1_mean,S_3_mean,D_41_mean,B_3_mean,D_42_mean,D_43_mean,...,D_136_max,D_137_max,D_138_max,D_139_max,D_140_max,D_141_max,D_142_max,D_143_max,D_144_max,D_145_max
0,0,1,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
