In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import time
import csv

In [3]:
from sklearn.model_selection import KFold
def cross_validation(folds, data, labels, error_function, model, **model_args):
    """Score an estimator class with shuffled K-fold cross-validation.

    Parameters
    ----------
    folds : int
        Number of splits handed to ``KFold``.
    data : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    labels : pandas.DataFrame or pandas.Series
        Targets aligned row-for-row with ``data``. When two-dimensional,
        only the first column is used for fitting.
    error_function : callable
        Called as ``error_function(y_true, y_score)`` on each held-out fold,
        where ``y_score`` is the second column of ``predict_proba``.
    model : class
        Estimator class (not an instance) exposing ``fit`` and
        ``predict_proba``.
    **model_args
        Keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    float
        Sum of the per-fold scores divided by ``folds``, rounded to 4 decimals.
    """
    # LogisticRegression gets a very large iteration budget unless the caller
    # supplies its own ``max_iter``.
    if model == LogisticRegression:
        model_args.setdefault('max_iter', 20**9)
    estimator = model(**model_args)

    # random_state=None: the shuffle is unseeded, so splits differ between calls.
    kf = KFold(n_splits=folds, random_state=None, shuffle=True)
    scores = []
    for train_index, test_index in kf.split(data):
        x_train = np.array(data.iloc[train_index])
        x_test = np.array(data.iloc[test_index])

        # Accept both a single-column DataFrame (n, 1) and a Series (n,).
        y_train = np.asarray(labels.iloc[train_index])
        if y_train.ndim > 1:
            y_train = y_train[:, 0]

        estimator.fit(x_train, y_train)
        # Keep only the probability reported in column 1 of predict_proba.
        y_pred = [row[1] for row in estimator.predict_proba(x_test)]
        scores.append(error_function(labels.iloc[test_index], y_pred))
        print('.', end=' ')  # one progress dot per completed fold
    return round(sum(scores) / folds, 4)

In [4]:
def level_1_predictor(t_train,y, model):
    """Cross-fit ``model`` on two halves and return out-of-half probabilities.

    ``t_train`` and ``y`` are cut at the midpoint. A separate ``model``
    instance is fitted on each half and used to score the *other* half, so no
    row is scored by the instance that was fitted on it.

    Returns a list holding, for every row of ``t_train`` in its original
    order, the value from column 1 of ``predict_proba``.
    """
    midpoint = int(len(t_train) / 2)
    first_x, second_x = t_train[:midpoint], t_train[midpoint:]
    first_y, second_y = y[:midpoint], y[midpoint:]

    # LogisticRegression is built with a very large iteration cap; every other
    # class is built with its defaults.
    ctor_kwargs = {'max_iter': 20**9} if model == LogisticRegression else {}
    fitted_on_first = model(**ctor_kwargs)
    fitted_on_second = model(**ctor_kwargs)

    fitted_on_first.fit(first_x, first_y)
    second_scores = [row[1] for row in fitted_on_first.predict_proba(second_x)]

    fitted_on_second.fit(second_x, second_y)
    first_scores = [row[1] for row in fitted_on_second.predict_proba(first_x)]

    # Stitch the halves back together in the original row order.
    return list(np.hstack((first_scores, second_scores)))

In [5]:
def retrieve_important_feature(s,m):
    """Split the columns of ``important_l1.csv`` by an importance threshold.

    Parameters
    ----------
    s : number
        Threshold. A column is "important" when its score is strictly
        greater than ``s``; everything else (including NaN scores) is
        "unimportant".
    m : {'G', 'L'}
        Chooses the row that supplies the scores: ``'G'`` reads the first
        data row, ``'L'`` the second.
        NOTE(review): presumably one row per model that produced the
        importances - confirm against whatever writes the file.

    Returns
    -------
    tuple of (list, list)
        ``(important_feature, unimportant_feature)`` column names, each in
        file order.

    Raises
    ------
    ValueError
        If ``m`` is neither ``'G'`` nor ``'L'``.
    """
    if m == 'G':
        row = 0
    elif m == 'L':
        row = 1
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``importance``.
        raise ValueError("m must be 'G' or 'L', got {!r}".format(m))

    file = pd.read_csv('important_l1.csv')
    importance = file.iloc[row]

    important_feature = []
    unimportant_feature = []
    for name, value in zip(file.columns, importance):
        if value > s:
            important_feature.append(name)
        else:
            unimportant_feature.append(name)
    return important_feature, unimportant_feature

In [6]:
def categorize_important_feature(important_feature, unimportant_feature):
    """Group important feature names by leading letter and by 3-char suffix.

    Every name in ``important_feature`` is filed twice:

    * by its first character - ``D``/``S``/``P``/``B``/``R`` map to
      ``delinquency``/``spend``/``payment``/``balance``/``risk``; any other
      first character is filed under no family;
    * by its last three characters - ``std``, ``25%``, ``50%``, ``75%``,
      ``min``, ``max``; anything else lands in ``mean``.

    ``unimportant_feature`` is passed through untouched under the key
    ``'unimportant_fe'``. Returns the dict of lists.
    """
    family_of_prefix = {
        'D': 'delinquency',
        'S': 'spend',
        'P': 'payment',
        'B': 'balance',
        'R': 'risk',
    }
    bucket_of_suffix = {
        'std': 'std',
        '25%': '_25',
        '50%': '_50',
        '75%': '_75',
        'min': '_min',
        'max': '_max',
    }
    # Insertion order matters: callers derive column order from the keys.
    ordered_keys = ('delinquency', 'spend', 'payment', 'balance', 'risk', 'std',
                    'mean', '_25', '_50', '_75', '_min', '_max')
    feature_sets = {key: [] for key in ordered_keys}

    for name in important_feature:
        family = family_of_prefix.get(name[0])
        if family is not None:
            feature_sets[family].append(name)
        # Names without a recognised statistic suffix count as means.
        feature_sets[bucket_of_suffix.get(name[-3:], 'mean')].append(name)

    feature_sets['unimportant_fe'] = unimportant_feature
    return feature_sets

In [7]:
def stacker(feature,np_label,feature_sets,model_dic,m='G'):
    """Build one stacked (level-1) feature per feature set and fit its model.

    Parameters
    ----------
    feature : pandas.DataFrame
        Source features; each feature set selects a subset of its columns.
    np_label : array-like
        Targets aligned with the rows of ``feature``.
    feature_sets : dict
        Maps a set name to the list of ``feature`` columns in that set.
    model_dic : dict
        Maps a model class name to the class itself.
    m : {'G', 'L'}, default 'G'
        Chooses which table of per-set model choices to read:
        ``l1_stacking_model_G.csv`` or ``l1_stacking_model_L.csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The stacked features (one column per feature set, filled by
        ``level_1_predictor``) and a dict of estimators fitted on all rows,
        keyed by feature-set name.

    Raises
    ------
    ValueError
        If ``m`` is neither ``'G'`` nor ``'L'``.
    """
    if m == 'G':
        r = pd.read_csv('l1_stacking_model_G.csv')
    elif m == 'L':
        r = pd.read_csv('l1_stacking_model_L.csv')
    else:
        # Previously ``r`` stayed unbound and the loop failed with an
        # UnboundLocalError.
        raise ValueError("m must be 'G' or 'L', got {!r}".format(m))

    column_name = list(feature_sets.keys())
    stack_feature = pd.DataFrame(columns = column_name)
    fitted_model = {}
    for x in column_name:
        # The most frequent entry in column ``x`` names the model to use.
        best_stacking_model = model_dic[r[x].value_counts().keys()[0]]
        stacking_feature = np.array(feature[feature_sets[x]])
        stack_feature[x] = level_1_predictor(stacking_feature,np_label, best_stacking_model)

        # Also fit one instance on every row so it can be reused by the caller.
        if best_stacking_model==LogisticRegression:
            md = best_stacking_model(max_iter = 20**9)
        else:
            md = best_stacking_model()
        md.fit(stacking_feature, np_label)
        fitted_model[x]=md
        # Progress marker: first letter of the chosen model's class name.
        print(best_stacking_model.__name__[0], end = ' ')
    return stack_feature, fitted_model

In [8]:
# Candidate estimator classes compared throughout the notebook.
models = [
    LogisticRegression,
    GradientBoostingClassifier,
    RandomForestClassifier,
]
# Lookup from class name (e.g. 'LogisticRegression') to the class itself.
model_dic = {estimator.__name__: estimator for estimator in models}
# Column labels of the feature-importance table.
feature_name = pd.read_csv('important_l1.csv').columns

In [9]:
# Display the importance-table columns loaded in the previous cell instead of
# parsing important_l1.csv a second time.
feature_name

Index(['P_2_mean', 'D_39_mean', 'B_1_mean', 'B_2_mean', 'R_1_mean', 'S_3_mean',
       'D_41_mean', 'B_3_mean', 'D_42_mean', 'D_43_mean',
       ...
       'D_136_max', 'D_137_max', 'D_138_max', 'D_139_max', 'D_140_max',
       'D_141_max', 'D_142_max', 'D_143_max', 'D_144_max', 'D_145_max'],
      dtype='object', length=1302)

In [16]:
# Model-selection sweep. For every training shard this tries importance
# thresholds 0-9 (scored against the 'G' row of important_l1.csv) combined with
# every candidate model, and logs the three best (AUC, threshold, model)
# combinations. The log is used to decide how many features to keep from
# important_l1 and which model to use for the final prediction.

# Machine-specific location of the training shards; edit for your environment.
TRAIN_FILE_TEMPLATE = r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'
# History is read from and written back to the same file so that results
# accumulate across shards (it was previously written to 'performance').
PERFORMANCE_FILE = 'performance.csv'

for n in range(0,20):
    df = pd.read_csv(TRAIN_FILE_TEMPLATE.format(n))
    # Drop the first and last columns; the target is read by name below.
    feature = df.iloc[:,1:-1]
    label = df[['target']]
    np_label = np.array(label)
    # Mean-impute the columns listed in important_l1.csv. fillna returns a new
    # frame, so the slice of ``df`` is never written to.
    feature = feature.fillna(feature[feature_name].mean())

    recording = []
    for importance_level in range(0,10):
        important_feature, unimportant_feature = retrieve_important_feature(importance_level,'G')
        feature_sets = categorize_important_feature(important_feature, unimportant_feature)
        stack_feature = stacker(feature, np_label.T[0],feature_sets,model_dic)[0]
        impt_feature = feature[important_feature]
        # Final design matrix: raw important features plus stacked predictions.
        feature_u = pd.concat([impt_feature,stack_feature], axis = 1)
        for candidate in models:
            score = cross_validation(5,feature_u,label, roc_auc_score,candidate)
            recording.append((score, importance_level, candidate.__name__[0]))
        print(importance_level)

    top_3 = sorted(recording, key=lambda rec: rec[0], reverse = True)[:3]
    rd = pd.DataFrame(top_3, columns = ['AUC','important_level','model'])
    try:
        performance_history = pd.read_csv(PERFORMANCE_FILE)
    except FileNotFoundError:
        # First run: no history yet, start the log with this shard's results.
        new_performance = rd
    else:
        new_performance = pd.concat([rd,performance_history], axis = 0)
    new_performance.to_csv(PERFORMANCE_FILE, index = False)

G G L G R G G L G L G L G . . . . . . . . . . . . . . . 0
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 1
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 2
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 3
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 4
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 5
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 6
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 7
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 8
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 9
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 0
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 1
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 2
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 3
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 4
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 5
G G L G R G G L G L G L G . . . . . . . . . . . . . . . 6
G G L G R G G 

KeyboardInterrupt: 