In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import time
import csv

In [3]:
def level_1_predictor(t_train,y, model):
    """Score every row with a model that was fitted on the other half of the data.

    The rows are cut at the midpoint. One instance of ``model`` is fitted on
    the first half and scores the second half; a second instance is fitted on
    the second half and scores the first half.

    Parameters
    ----------
    t_train : sliceable feature matrix, one row per sample.
    y : sliceable labels aligned with ``t_train``.
    model : estimator class. It is instantiated with no arguments, except
        ``LogisticRegression`` which receives ``max_iter=20**9``. Instances
        must provide ``fit`` and ``predict_proba``.

    Returns
    -------
    list
        One single-element list per row, in the original row order, holding
        the second column of ``predict_proba`` for that row.
    """
    midpoint = int(len(t_train) / 2)
    first_x, second_x = t_train[:midpoint], t_train[midpoint:]
    first_y, second_y = y[:midpoint], y[midpoint:]

    # Only LogisticRegression is given an explicit (very large) iteration cap.
    init_kwargs = {'max_iter': 20**9} if model == LogisticRegression else {}
    fitted_on_first = model(**init_kwargs)
    fitted_on_second = model(**init_kwargs)

    fitted_on_first.fit(first_x, first_y)
    second_scores = [row[1] for row in fitted_on_first.predict_proba(second_x)]
    fitted_on_second.fit(second_x, second_y)
    first_scores = [row[1] for row in fitted_on_second.predict_proba(first_x)]

    # Re-join the halves in row order and wrap each score in its own list.
    return [[score] for score in np.hstack((first_scores, second_scores))]

In [4]:
def retrieve_important_feature(model,s=0):
    """Split the columns of ``important_l1.csv`` by an importance threshold.

    The CSV is read from the current working directory. Its header row gives
    the feature names; data row 0 is used for ``GradientBoostingClassifier``
    and data row 1 for ``LogisticRegression``.

    Parameters
    ----------
    model : estimator class selecting which row of the file to use.
    s : threshold; a feature is "important" when its value is strictly
        greater than ``s`` (default 0).

    Returns
    -------
    (important_feature, unimportant_feature) : tuple of two lists of column
        names, each in file order.

    Raises
    ------
    ValueError
        If ``model`` is neither of the two supported classes. Previously this
        surfaced as an ``UnboundLocalError`` further down.
    """
    # Resolve the row first so an unsupported model fails with a clear message.
    if model == GradientBoostingClassifier:
        row = 0
    elif model == LogisticRegression:
        row = 1
    else:
        raise ValueError(
            'retrieve_important_feature supports only GradientBoostingClassifier '
            'and LogisticRegression, got %r' % (getattr(model, '__name__', model),)
        )

    file = pd.read_csv('important_l1.csv')
    importance = file.iloc[row]

    # NOTE(review): if row 1 stores signed coefficients, negative values end up
    # in the "unimportant" list with the default s=0 -- confirm this is intended.
    important_feature = []
    unimportant_feature = []
    for name, value in zip(file.columns, importance):
        if value > s:
            important_feature.append(name)
        else:
            unimportant_feature.append(name)
    return important_feature, unimportant_feature

In [5]:
def categorize_important_feature(model):
    """Group the important features for ``model`` into named feature sets.

    Feature names come from ``retrieve_important_feature(model)``. Each
    important feature is filed twice:

    * by its first character -- 'D', 'S', 'P', 'B' or 'R' -- into
      delinquency / spend / payment / balance / risk (any other first
      character is not filed in this family);
    * by its last three characters -- 'std', '25%', '75%', '50%', 'min',
      'max' -- into the matching statistic set, with everything else going
      to 'mean'.

    Returns
    -------
    dict
        Maps set name to a list of feature names. The key order is fixed, and
        the last key, 'unimportant_fe', holds the unimportant features.
    """
    important_feature, unimportant_feature = retrieve_important_feature(model)

    group_by_first_char = {
        'D': 'delinquency',
        'S': 'spend',
        'P': 'payment',
        'B': 'balance',
        'R': 'risk',
    }
    group_by_suffix = {
        'std': 'std',
        '25%': '_25',
        '75%': '_75',
        '50%': '_50',
        'min': '_min',
        'max': '_max',
    }

    # Create every bucket up front so the key order of the result is fixed.
    ordered_keys = ('delinquency', 'spend', 'payment', 'balance', 'risk', 'std',
                    'mean', '_25', '_50', '_75', '_min', '_max')
    feature_sets = {key: [] for key in ordered_keys}

    for name in important_feature:
        category = group_by_first_char.get(name[0])
        if category is not None:
            feature_sets[category].append(name)
        # Names without a recognised statistic suffix count as means.
        feature_sets[group_by_suffix.get(name[-3:], 'mean')].append(name)

    feature_sets['unimportant_fe'] = unimportant_feature
    return feature_sets

In [6]:
# I use the best prediction from one of the models to stack.
def training_stacker(feature, model):
    """Pick the best level-1 model for each feature set and log the choice.

    For every feature set returned by ``categorize_important_feature(model)``,
    each class in the module-level ``models`` list is run through
    ``level_1_predictor`` and scored with ``roc_auc_score`` against the
    module-level ``np_label``. The name of the highest-scoring class is
    printed and recorded.

    The winners form one row (one column per feature set) that is appended to
    a CSV in the current working directory:

    * ``l1_stacking_model_G.csv`` when ``model`` is GradientBoostingClassifier
    * ``l1_stacking_model_L.csv`` when ``model`` is LogisticRegression

    The file is created if it does not exist yet or is empty.

    Parameters
    ----------
    feature : DataFrame holding all columns named in the feature sets.
    model : estimator class whose feature sets are used; it also selects the
        output file.

    Raises
    ------
    ValueError
        If ``model`` is not one of the two supported classes, or if ``models``
        is empty.

    Notes
    -----
    Depends on the globals ``models`` and ``np_label``; ``np_label`` is
    indexed as ``np_label.T[0]``, so it must be two-dimensional.
    """
    # Resolve the output file first so an unsupported model fails before any
    # expensive training happens.
    if model == GradientBoostingClassifier:
        history_path = 'l1_stacking_model_G.csv'
    elif model == LogisticRegression:
        history_path = 'l1_stacking_model_L.csv'
    else:
        raise ValueError(
            'training_stacker supports only GradientBoostingClassifier and '
            'LogisticRegression, got %r' % (getattr(model, '__name__', model),)
        )

    feature_sets = categorize_important_feature(model)
    flat_label = np_label.T[0]
    winners = {}
    for set_name, columns in feature_sets.items():
        stacking_feature = np.array(feature[columns])
        print('--', set_name, end=' ')
        best_score = None
        best_name = None
        for candidate in models:
            l1_pred = level_1_predictor(stacking_feature, flat_label, candidate)
            score = roc_auc_score(np_label, l1_pred)
            # '>=' keeps the later candidate on an exact tie, matching the
            # earlier dictionary-overwrite behaviour.
            if best_score is None or score >= best_score:
                best_score = score
                best_name = candidate.__name__
        if best_name is None:
            raise ValueError('models is empty; nothing to compare')
        print(best_name)
        winners[set_name] = [best_name]
    result = pd.DataFrame(winners)

    # Append to earlier runs when there are any; otherwise start a new file.
    try:
        history = pd.read_csv(history_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        history = None
    if history is not None:
        result = pd.concat((history, result), axis=0)
    result.to_csv(history_path, index=False)


In [7]:
# Candidate level-1 estimator classes; training_stacker tries each of them on
# every feature set and keeps the one with the best ROC AUC.
models = [
    GradientBoostingClassifier,
    RandomForestClassifier,
    LogisticRegression,
]

In [20]:
# Location of the training shards; '{0}' is replaced by the shard number.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
TRAIN_FILE_TEMPLATE = r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'
N_TRAIN_FILES = 15

for file in range(N_TRAIN_FILES):
    start_time = time.time()
    df = pd.read_csv(TRAIN_FILE_TEMPLATE.format(file))

    # Every column except the first and the last is used as a model input.
    # Missing values are replaced by the column mean in one vectorised call
    # that builds a new frame, instead of assigning column by column into a
    # slice of df (which triggers SettingWithCopyWarning).
    feature = df.iloc[:, 1:-1]
    feature = feature.fillna(feature.mean())

    label = df[['target']]
    # training_stacker reads the module-level np_label, so it has to be
    # refreshed for each shard before the call.
    np_label = label.to_numpy()

    training_stacker(feature, GradientBoostingClassifier)
    print(file, end=' ')
    print(" %s second " %(time.time()-start_time))

-- delinquency GradientBoostingClassifier
-- spend RandomForestClassifier
-- payment LogisticRegression
-- balance GradientBoostingClassifier
-- risk RandomForestClassifier
-- std GradientBoostingClassifier
-- mean GradientBoostingClassifier
-- _25 LogisticRegression
-- _50 GradientBoostingClassifier
-- _75 LogisticRegression
-- _min LogisticRegression
-- _max LogisticRegression
-- unimportant_fe GradientBoostingClassifier
0  851.6341094970703 second 
-- delinquency LogisticRegression
-- spend GradientBoostingClassifier
-- payment LogisticRegression
-- balance GradientBoostingClassifier
-- risk LogisticRegression
-- std GradientBoostingClassifier
-- mean GradientBoostingClassifier
-- _25 GradientBoostingClassifier
-- _50 GradientBoostingClassifier
-- _75 GradientBoostingClassifier
-- _min GradientBoostingClassifier
-- _max LogisticRegression
-- unimportant_fe GradientBoostingClassifier
1  847.9264659881592 second 
-- delinquency LogisticRegression
-- spend RandomForestClassifier
-- pay