In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import time
import csv

In [2]:
from sklearn.model_selection import KFold
def cross_validation(folds, data, labels, error_function, model, **model_args):
    """Score an estimator class with shuffled K-fold cross-validation.

    Parameters
    ----------
    folds : int
        Number of splits handed to ``KFold``.
    data : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    labels : pandas.DataFrame or pandas.Series
        Targets aligned row-for-row with ``data``. For a 2-D object only the
        first column is used for fitting.
    error_function : callable
        Called once per fold as ``error_function(y_true, y_score)`` where
        ``y_score`` is the second column of ``predict_proba``.
    model : type
        Estimator class exposing ``fit`` and ``predict_proba``.
    **model_args
        Keyword arguments forwarded to the ``model`` constructor.

    Returns
    -------
    float
        Mean of the per-fold scores, rounded to four decimals.
    """
    kf = KFold(n_splits=folds, random_state=None, shuffle=True)

    # LogisticRegression keeps its very large default iteration budget, but an
    # explicit max_iter supplied by the caller takes precedence.
    if model == LogisticRegression:
        model_args = {'max_iter': 20**9, **model_args}
    estimator = model(**model_args)

    scores = []
    for train_index, test_index in kf.split(data):
        x_train = np.array(data.iloc[train_index])
        x_test = np.array(data.iloc[test_index])
        y_test = labels.iloc[test_index]

        # Flatten the training target to 1-D: first column of a 2-D frame,
        # or the values themselves when the labels are already 1-D.
        y_train = np.array(labels.iloc[train_index])
        if y_train.ndim > 1:
            y_train = y_train[:, 0]

        estimator.fit(x_train, y_train)
        y_pred = [row[1] for row in estimator.predict_proba(x_test)]
        scores.append(error_function(y_test, y_pred))
        print('.', end=' ')

    return round(sum(scores) / len(scores), 4)

In [3]:
def level_1_predictor(t_train,y, model):
    """Produce out-of-half probability scores with a two-way split.

    The rows are cut into a first and a second half. One model is fitted on
    the first half and scores the second; another is fitted on the second
    half and scores the first. The scores are returned in the original row
    order (first half, then second half).

    Parameters
    ----------
    t_train : sliceable sequence of rows
        Feature matrix, sliced positionally.
    y : sliceable sequence
        Targets aligned with ``t_train``.
    model : type
        Estimator class exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    list
        Second-column ``predict_proba`` value for every row.
    """
    def build():
        # LogisticRegression is the only class given a custom iteration cap.
        return model(max_iter = 20**9) if model == LogisticRegression else model()

    half = len(t_train) // 2
    first_x, second_x = t_train[:half], t_train[half:]
    first_y, second_y = y[:half], y[half:]

    model_on_first = build()
    model_on_second = build()

    model_on_first.fit(first_x, first_y)
    second_scores = [row[1] for row in model_on_first.predict_proba(second_x)]

    model_on_second.fit(second_x, second_y)
    first_scores = [row[1] for row in model_on_second.predict_proba(first_x)]

    return list(np.hstack((first_scores, second_scores)))

In [4]:
def retrieve_important_feature(s,m):
    """Split the columns of ``important_l1.csv`` by an importance threshold.

    Parameters
    ----------
    s : number
        Threshold; a column counts as important when its value is strictly
        greater than ``s``.
    m : {'G', 'L'}
        Selects which row of the CSV holds the importances: row 0 for 'G',
        row 1 for 'L'. (Presumably the rows come from two different models;
        confirm against whatever writes the CSV.)

    Returns
    -------
    tuple of (list, list)
        ``(important_feature, unimportant_feature)`` column names, each in
        CSV column order.

    Raises
    ------
    ValueError
        If ``m`` is neither 'G' nor 'L'.
    """
    row_by_mode = {'G': 0, 'L': 1}
    if m not in row_by_mode:
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError("m must be 'G' or 'L', got %r" % (m,))

    file = pd.read_csv('important_l1.csv')
    importance = file.iloc[row_by_mode[m]]

    important_feature = []
    unimportant_feature = []
    for name, value in zip(file.columns, importance):
        if value > s:
            important_feature.append(name)
        else:
            unimportant_feature.append(name)
    return important_feature, unimportant_feature

In [5]:
def categorize_important_feature(n,m):
    """Group the important feature names by first letter and by suffix.

    ``retrieve_important_feature(n, m)`` supplies the important and
    unimportant names. Every important name is then placed in:

    * at most one first-letter group: 'D' -> delinquency, 'S' -> spend,
      'P' -> payment, 'B' -> balance, 'R' -> risk;
    * exactly one suffix group chosen from its last three characters
      ('std', '25%', '50%', '75%', 'min', 'max'), falling back to 'mean'.

    Returns
    -------
    dict
        Group name -> list of feature names, plus 'unimportant_fe' holding
        the unimportant names. Key order is fixed.
    """
    important_names, unimportant_names = retrieve_important_feature(n,m)

    by_first_letter = {
        'D': 'delinquency',
        'S': 'spend',
        'P': 'payment',
        'B': 'balance',
        'R': 'risk',
    }
    by_suffix = {
        'std': 'std',
        '25%': '_25',
        '75%': '_75',
        '50%': '_50',
        'min': '_min',
        'max': '_max',
    }

    # Create the groups up front so the key order never depends on the data.
    ordered_keys = ('delinquency', 'spend', 'payment', 'balance', 'risk', 'std',
                    'mean', '_25', '_50', '_75', '_min', '_max')
    feature_sets = {key: [] for key in ordered_keys}

    for name in important_names:
        letter_group = by_first_letter.get(name[0])
        if letter_group is not None:
            feature_sets[letter_group].append(name)
        # Names without a recognised suffix are collected under 'mean'.
        feature_sets[by_suffix.get(name[-3:], 'mean')].append(name)

    feature_sets['unimportant_fe'] = unimportant_names
    return feature_sets

In [8]:
# Candidate estimator classes (not instances) tried at each modelling stage.
models = [
    LogisticRegression,
    GradientBoostingClassifier,
    RandomForestClassifier,
]

In [53]:
# Load one shard of the training data.
df = pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_9.csv')

# Features are every column except the first and the last one.
# .copy() detaches the slice from df so later changes cannot trigger
# chained-assignment warnings or touch the raw frame.
feature = df.iloc[:, 1:-1].copy()
label = pd.DataFrame(df['target'], columns=['target'])
np_label = np.array(label)
feature_name = feature.columns
feature_sets = categorize_important_feature(1,'G')

# Look estimator classes up by their class name.
model_dic = {model.__name__: model for model in models}

# Replace missing values with each column's mean in a single vectorized call
# instead of assigning column by column.
feature = feature.fillna(feature.mean())

KeyboardInterrupt: 

In [14]:
# Sequence printed by a previous run of this cell (presumably the initial of
# the estimator chosen for each feature set):
#G -G -L -R -R -G -G -G -G -G -G -L -G
# stacker takes (feature, np_label, feature_sets, model_dic); np_label is an
# (n, 1) array, so it is flattened to 1-D before being used as a fit target.
stack_feature, fitted_model = stacker(feature, np_label.ravel(), feature_sets, model_dic)

G -G -L -R -R -G -G -G -G -G -G -L -G -

    4 G -- 0.945
    4 G -- 0.9461
    1 G -- 0.9461
    4 L -- 0.9514
    (4,L,0.951), (3,G,0.9506),(4,G,0.9497)  

In [None]:
# Sweep the importance threshold and cross-validate every candidate model on
# the selected raw features joined with the level-1 stacked features.
for threshold in range(7):
    print(threshold)
    important_feature, unimportant_feature = retrieve_important_feature(threshold,'G')
    impt_feature = feature[important_feature]
    feature_u = pd.concat([impt_feature,stack_feature], axis = 1)
    # A separate loop name keeps the threshold from being overwritten by the
    # model class while the sweep is still running.
    for candidate in models:
        print(candidate.__name__,cross_validation(5,feature_u,label, roc_auc_score,candidate))

In [102]:
class amex_default_prediction:
    """Two-level stacked classifier.

    Level 1 fits one estimator per feature set (via ``stacker``) and turns
    each into a single probability column. Level 2 fits a
    ``GradientBoostingClassifier`` on the selected raw features joined with
    those probability columns.
    """

    def __init__(self):
        """Read the feature-selection inputs and initialise empty state."""
        self._feature_importance = pd.read_csv('important_l1.csv')
        self._feature_name = self._feature_importance.columns
        # Raw features passed through to level 2 (threshold 4, row 'G').
        self._impt_fe= retrieve_important_feature(4,'G')[0]
        self._impt_fe_train = None
        self._impt_fe_test = None
        # Feature groups that each get their own level-1 model.
        self._feature_sets = categorize_important_feature(1,'G')
        self._models = [LogisticRegression,GradientBoostingClassifier,RandomForestClassifier]
        self._model_dic = {}
        for x in self._models:
            self._model_dic[x.__name__] = x
        self._fill_values = None
        self._train_l1 = None
        self._train_l2 = None
        self._feature_l2 = None
        self._label = None
        self._stacking_train = None
        self._fitted_models_l1 = None
        self._fitted_model_l2 = None
        self._test_l1 = None
        self._test_l2 = None
        self._stacked_test = None
        self._prediction = None

    @staticmethod
    def _as_1d(label):
        """Return ``label`` as a 1-D array (first column of a 2-D input)."""
        values = np.array(label)
        if values.ndim > 1:
            values = values[:, 0]
        return values

    def fillNa(self, data, fill_values=None):
        """Return a copy of ``data`` with missing values filled.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame to impute. It is not modified.
        fill_values : pandas.Series, optional
            Per-column replacement values. Defaults to the column means of
            ``data`` itself.

        Returns
        -------
        pandas.DataFrame
            New frame with NaNs replaced column by column.
        """
        if fill_values is None:
            fill_values = data.mean()
        return data.fillna(fill_values)

    def fit(self, train, label):
        """Fit the level-1 models and the level-2 model.

        Parameters
        ----------
        train : pandas.DataFrame
            Training features; must contain every column named in the
            feature sets and in the selected important features.
        label : pandas.DataFrame, pandas.Series or array-like
            Binary target aligned with ``train``. For 2-D input the first
            column is used.
        """
        # Remember the training means so predict() imputes with the same values.
        self._fill_values = train.mean()
        # The stacked frame built by stacker carries a fresh 0..n-1 index, so
        # the index is reset here to keep the column-wise concat row-aligned.
        self._train_l1 = self.fillNa(train, self._fill_values).reset_index(drop=True)
        self._label = self._as_1d(label)
        self._stacking_train, self._fitted_models_l1 = stacker(self._train_l1,self._label,self._feature_sets, self._model_dic)
        self._impt_fe_train = self._train_l1[self._impt_fe]
        self._train_l2 = pd.concat([self._impt_fe_train,self._stacking_train], axis = 1)
        self._fitted_model_l2 = GradientBoostingClassifier()
        self._fitted_model_l2.fit(self._train_l2, self._label)

    def predict(self, test):
        """Return the level-2 ``predict_proba`` output for ``test``.

        Parameters
        ----------
        test : pandas.DataFrame
            Features with the same columns used in ``fit``.

        Returns
        -------
        The array returned by the level-2 model's ``predict_proba``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self._fitted_model_l2 is None:
            raise RuntimeError('fit must be called before predict')
        # Impute with the training means, not with statistics of the test data.
        self._test_l1 = self.fillNa(test, self._fill_values).reset_index(drop=True)
        self._stacked_test = stacker_for_testSet(self._test_l1,self._feature_sets,self._fitted_models_l1)
        self._impt_fe_test = self._test_l1[self._impt_fe]
        self._test_l2 = pd.concat([self._impt_fe_test,self._stacked_test], axis = 1)
        self._prediction = self._fitted_model_l2.predict_proba(self._test_l2)
        return self._prediction

In [97]:
def stacker(feature,np_label,feature_sets,model_dic,m='G'):
    """Build the level-1 stacked features and fit one model per feature set.

    For every key of ``feature_sets`` the estimator class is picked as the
    most frequent entry in the matching column of the selection CSV. That
    class produces the stacked column through ``level_1_predictor`` and is
    then fitted once more on all rows for later use on unseen data.

    Parameters
    ----------
    feature : pandas.DataFrame
        Features containing every column listed in ``feature_sets``.
    np_label : array-like
        Targets aligned with ``feature``.
    feature_sets : dict
        Set name -> list of column names.
    model_dic : dict
        Estimator class name -> estimator class.
    m : {'G', 'L'}, default 'G'
        Chooses ``l1_stacking_model_G.csv`` or ``l1_stacking_model_L.csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The stacked feature frame (one column per set) and the fitted
        full-data model for each set.

    Raises
    ------
    ValueError
        If ``m`` is neither 'G' nor 'L'.
    """
    selection_files = {'G': 'l1_stacking_model_G.csv', 'L': 'l1_stacking_model_L.csv'}
    if m not in selection_files:
        # Previously an unsupported value surfaced as an UnboundLocalError.
        raise ValueError("m must be 'G' or 'L', got %r" % (m,))
    r = pd.read_csv(selection_files[m])

    column_name = list(feature_sets.keys())
    stack_feature = pd.DataFrame(columns = column_name)
    fitted_model = {}
    for x in column_name:
        # value_counts sorts by frequency, so the first label is the mode.
        best_stacking_model = model_dic[r[x].value_counts().index[0]]
        stacking_feature = np.array(feature[feature_sets[x]])
        stack_feature[x] = level_1_predictor(stacking_feature,np_label, best_stacking_model)

        # Refit on every row; this is the model applied to new data.
        if best_stacking_model==LogisticRegression:
            md = best_stacking_model(max_iter = 20**9)
        else:
            md = best_stacking_model()
        md.fit(stacking_feature, np_label)
        fitted_model[x]=md
        print(best_stacking_model.__name__[0], end = ' ')
    return stack_feature, fitted_model

In [98]:
def stacker_for_testSet(feature,feature_sets,fitted_models_l1):
    """Apply already-fitted level-1 models to new data.

    Parameters
    ----------
    feature : pandas.DataFrame
        Features containing every column listed in ``feature_sets``.
    feature_sets : dict
        Set name -> list of column names.
    fitted_models_l1 : dict
        Set name -> fitted estimator exposing ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One column per feature set holding the second ``predict_proba``
        column of the matching model.
    """
    set_names = list(feature_sets.keys())
    stacked = pd.DataFrame(columns = set_names)
    for set_name in set_names:
        block = np.array(feature[feature_sets[set_name]])
        probabilities = fitted_models_l1[set_name].predict_proba(block)
        stacked[set_name] = [row[1] for row in probabilities]
    return stacked

In [130]:
# Shard used to fit the stacked model.
training_set = pd.read_csv(
    r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_0.csv'
)
# Features: every column except the first and the last.
training_feature = training_set.iloc[:, 1:-1]
# Target kept as a one-column frame named 'target'.
training_label = training_set[['target']]

In [None]:
# Fit the full two-level model and report the wall-clock time it took.
start_time = time.time()

ame = amex_default_prediction()
ame.fit(training_feature, training_label)

elapsed_seconds = time.time() - start_time
print(elapsed_seconds, 'seconds')

In [None]:
# Score the fitted model on the held-out shards train_1.csv .. train_14.csv.
# range takes comma-separated bounds, and the shard number is now part of the
# file name, so each pass reads a different file.
for shard in range(1, 15):
    start_time = time.time()
    test_set = pd.read_csv(rf'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{shard}.csv')
    test_feature = test_set.iloc[:,1:-1]
    test_label = pd.DataFrame(test_set['target'], columns=['target'])
    prediction = ame.predict(test_feature)
    # Keep the second predict_proba column as the score passed to AUC.
    prediction_1 = [row[1] for row in prediction]
    print(roc_auc_score(test_label, prediction_1))
    print(time.time()-start_time, 'seconds')