In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score 
import time
import csv

In [2]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [3]:
def level_1_predictor(t_train,y, model):
    leng = int(len(t_train)/2)
    t_a, t_b= t_train[:leng], t_train[leng:]
    tg_a, tg_b = y[:leng], y[leng:]
    if model == LogisticRegression:
        m1 = model(max_iter = 20**9)
        m2 = model(max_iter = 20**9)
    else:
        m1 = model()
        m2 = model()

    m1.fit(t_a,tg_a)
    tp_b = [i[1] for i in m1.predict_proba(t_b)]
    m2.fit(t_b,tg_b)
    tp_a = [i[1] for i in m2.predict_proba(t_a)]
    tp = [x for x in np.hstack((tp_a,tp_b))]
    
    return tp

In [4]:
def retrieve_important_feature(s,m):
    file = pd.read_csv('important_l1.csv')
    feature_name = file.columns
    if m=='G':
        importance = file.iloc[0]
    if m=='L':
        importance = file.iloc[1]
    itr = 0
    important_feature = []
    unimportant_feature = []
    for x in importance:
        if x>s:
            important_feature.append(feature_name[itr])
        else:
            unimportant_feature.append(feature_name[itr])
        itr+=1
    return important_feature, unimportant_feature

In [5]:
def categorize_important_feature(important_feature, unimportant_feature):
    delinquency=[]
    spend = []
    payment = []
    balance = []
    risk = []
    std = []
    mean = []
    _25 = []
    _50 = []
    _75 = []
    _min = []
    _max = []

    for x in important_feature:
        if x[0]=='D':
            delinquency.append(x)
        if x[0]=='S':
            spend.append(x)
        if x[0]=='P':
            payment.append(x)
        if x[0]=='B':
            balance.append(x)
        if x[0]=='R':
            risk.append(x)
        if x[-3:]=='std':
            std.append(x)
        elif x[-3:]=='25%':
            _25.append(x)
        elif x[-3:]=='75%':
            _75.append(x)
        elif x[-3:]=='50%':
            _50.append(x)
        elif x[-3:]=='min':
            _min.append(x)
        elif x[-3:]=='max':
            _max.append(x)
        else:
            mean.append(x)
    if unimportant_feature != None:
        feature_sets = {'delinquency':delinquency,'spend':spend,'payment':payment,'balance':balance,'risk':risk,'std':std,
                        'mean':mean,'_25':_25,'_50':_50,'_75':_75,'_min':_min,'_max':_max, 'unimportant_fe':unimportant_feature}
    else:
        feature_sets = {'delinquency':delinquency,'spend':spend,'payment':payment,'balance':balance,'risk':risk,'std':std,
                        'mean':mean,'_25':_25,'_50':_50,'_75':_75,'_min':_min,'_max':_max}        
    return feature_sets

In [6]:
def stacker(feature,np_label,feature_sets,model_dic,m='G'):
    column_name = list(feature_sets.keys())
    if m=='G':
        r = pd.read_csv('l1_stacking_model_G.csv')
    if m=='L':
        r = pd.read_csv('l1_stacking_model_L.csv')
    stack_feature = pd.DataFrame(columns = column_name)
    fitted_model = {}
    for x in column_name:
        best_stacking_model = model_dic[r[x].value_counts().keys()[0]]
        if len(feature_sets[x])!=0:
            stacking_feature = np.array(feature[feature_sets[x]])
            stack_feature[x] = level_1_predictor(stacking_feature,np_label, best_stacking_model)
            if best_stacking_model==LogisticRegression:
                md = best_stacking_model(max_iter = 20**9)
                md.fit(stacking_feature, np_label)
                fitted_model[x]=md
            else:
                md = best_stacking_model()
                md.fit(stacking_feature, np_label)
                fitted_model[x]=md
        else:
            stack_feature[x] = [1 for x in range(leng)]
            fitted_model[x] = None
        print(best_stacking_model.__name__[0], end = ' ')
    return stack_feature, fitted_model

In [7]:
def stacker_for_testSet(feature,feature_sets,fitted_models_l1):
    column_name = list(feature_sets.keys())
    stack_feature = pd.DataFrame(columns = column_name)
    leng = len(feature)
    for x in column_name:
        if len(feature_sets[x])!=0:
            i = np.array(feature[feature_sets[x]])
            stack_feature[x] = [x[1] for x in fitted_models_l1[x].predict_proba(i)]
        else:
            stack_feature[x] = [1. for x in range(leng)]
    return stack_feature

In [8]:
class amex_default_prediction:
    # unimpt_l1 is the parameter to control if I add all unimpt feature as a another stacking feature 
    # impt_l2 is the parameter to control if I add impt features for the final prediction, 
    #'n' means I only used the stacking feature.
    def __init__(self,n, unimpt_l1, impt_l2, std):
        self._feature_importance = pd.read_csv('important_l1.csv')
        self._feature_name = self._feature_importance.columns
        self._impt_fe, self._unimpt_fe= retrieve_important_feature(n,'G')
        self._impt_fe_train = None
        self._std = std
        self._impt_l2 = impt_l2
        if unimpt_l1 == 'y':
            self._feature_sets = categorize_important_feature(self._impt_fe, self._unimpt_fe)
        else:
            self._feature_sets = categorize_important_feature(self._impt_fe, None)
        self._models = [LogisticRegression,GradientBoostingClassifier,RandomForestClassifier]
        self._model_dic = {}
        for x in self._models:
            self._model_dic[x.__name__] = x
        self._train_l1 = None
        self._feature_l2 = None
        self._label = None
        self._stacking_train = None
        self._fitted_models_l1 = None
        self._fitted_model_l2 = None
        self._test_l1 = None
        self._test_l2 = None
        self._stacked_test = None
        self._prediction = None
    def fillNa(self, data):
        feature = data
        for x in data.columns:
            feature[x]=feature[x].fillna(feature[x].mean())
        return feature
    def fit(self, train, label, model):
        self._train_l1 = self.fillNa(train)
        if self._std == True:
            self._train_l1 = stder(self._train_l1)
        self._label = np.array(label).T
        self._stacking_train, self._fitted_models_l1 = stacker(self._train_l1,self._label,self._feature_sets, self._model_dic)
        if self._impt_l2 =='y':         
            self._impt_fe_train = self._train_l1[self._impt_fe]
            self._train_l2 = pd.concat([self._impt_fe_train,self._stacking_train], axis = 1)        
            self._fitted_model_l2 = model()
            self._fitted_model_l2.fit(self._train_l2, self._label)
        else:
            self._train_l2 = self._stacking_train
            self._fitted_model_l2 = model()
            self._fitted_model_l2.fit(self._train_l2, self._label)
    def predict_proba(self, test):
        self._test_l1 = self.fillNa(test)
        if self._std == 'y':
            self._test_l1 = stder(self._test_l1)
        self._stacked_test = stacker_for_testSet(self._test_l1,self._feature_sets,self._fitted_models_l1)
        if self._impt_l2 == 'y':            
            self._impt_fe_test = self._test_l1[self._impt_fe]
            self._test_l2 = pd.concat([self._impt_fe_test,self._stacked_test], axis = 1)
            self._prediction = self._fitted_model_l2.predict_proba(self._test_l2)[:,1]
        else:
            self._test_l2 = self._stacked_test
            self._prediction = self._fitted_model_l2.predict_proba(self._test_l2)[:,1]
        return self._prediction 

In [9]:
def default_rate_captured(label, prediction):
    df = pd.DataFrame(columns = ['label', 'prediction'])
    df['label'] = label
    df['prediction'] = prediction
    numb_of_default = df.loc[df['label']==1]
    default_caught = numb_of_default.loc[numb_of_default['prediction']>0.5]
    return default_caught.shape[0]/numb_of_default.shape[0]
def gini(list_of_values):
    sorted_list = sorted(list(list_of_values))
    height, area = 0, 0
    for value in sorted_list:
        height += value
        area += height - value / 2.
    fair_area = height * len(list_of_values) / 2
    return (fair_area - area) / fair_area
def normalized_gini(y,y_pred):
    normalized_gini = gini(y_pred)/gini(y)
    return normalized_gini
def M(label, prediction):
    return (normalized_gini(label,prediction)+default_rate_captured(label, prediction))/2

In [10]:
from sklearn.preprocessing import StandardScaler
def stder(X):
    cols = X.columns
    stder = StandardScaler(copy=True, with_mean = True, with_std = True)
    stder.fit(X)
    return pd.DataFrame(stder.transform(X), columns = cols)

In [11]:
def get_data(a,b,n):
    shuffled_indices = np.random.permutation([x for x in range(a,b)])
    file = []
    for x in shuffled_indices[:n]:
        file.append(pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'.format(x)))
    data_set = pd.concat(file, axis = 0)
    data_set.index = [x for x in range(data_set.shape[0])]
    feature = data_set.iloc[:, 1:-1]
    label = data_set['target']
    return feature, label

In [12]:
def get_balanced_data(a,b,n):
    shuffled_indices_0 = np.random.permutation([x for x in range(a,b)])
    file = []
    for x in shuffled_indices_0[:n]:
        file.append(pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'.format(x)))
    data_set = pd.concat(file, axis = 0)
    data_set.index = [x for x in range(data_set.shape[0])]
    defualted = data_set.loc[data_set['target']==1]
    undefualted = data_set.loc[data_set['target']==0]
    shuffled_indices_1 = np.random.permutation(undefualted.shape[0])[:defualted.shape[0]]
    pick_undefualted = undefualted.iloc[shuffled_indices_1]
    temp = pd.concat([defualted, pick_undefualted], axis = 0)
    shuffled_indices_2 = np.random.permutation(temp.shape[0])
    df = temp.iloc[shuffled_indices_2]
    df.index = [x for x in range(len(df))]
    feature = df.iloc[:,1:-1]
    label = df['target']
    return feature, label

In [13]:
def record(bc, unimpt_l1, impt_l2,std, fe_impt,flz, score):
    rd = pd.read_csv('record.csv')
    i = pd.DataFrame([bc, unimpt_l1, impt_l2,std, fe_impt,flz, score]).T
    i.columns = ['balanced', 'unimpt_l1','impt_l2','std','feature_importance','file_size', 'score']
    C = False
    b = ''
    for I in np.array(i.iloc[0,:-2]):
        b+=str(I)
    for x in range(len(rd)):
        a = ''
        for R in np.array(rd.iloc[x,:-2]):
            a+=str(R)
        if a==b:
            C = True
            rd.loc[x,'score']=0.5*(rd.loc[x,'score']+score) 
            rd.to_csv('record.csv', index = False)
    if C==False:
        out = pd.concat([rd, i], axis = 0)
        out.to_csv('record.csv', index = False)

In [14]:
def zero_mean(data):
    for x in data.columns:
        m = data[x].mean()
        data[x] = data[x]-m
    return data

### test

start_tn = 1
end_tn = 21
choose_tn = 5
training_feature, training_label = get_data(start_tn, end_tn, choose_tn)
training_feature_std = stder(training_feature)

In [24]:
start_tn, end_tn= 1, 21
choose_tn = 10
training_feature, training_label = get_balanced_data(start_tn, end_tn, choose_tn)
training_feature = zero_mean(training_feature)

In [27]:
start_tt, end_tt= 21, 51
choose_tt = 15
test_feature, test_label = get_data(start_tt, end_tt, choose_tt)
test_feature = zero_mean(test_feature)

In [21]:
(training_feature_B[training_feature_B.columns[0]]-training_feature_B[training_feature_B.columns[0]].mean()).mean()

9.634477773018307e-17

#### feature importance 

In [28]:
data_balanced, unimpt_l1, impt_l2, std = 'y','n','n','n'
ame = amex_default_prediction(10,unimpt_l1, impt_l2, std)
ame.fit(training_feature, training_label, GradientBoostingClassifier)
prediction = ame.predict_proba(test_feature)
score1 = amex_metric(test_label,  pd.DataFrame(prediction, columns= ['prediction']))

G R L G R G L G G L G L 

NameError: name 'amex_metric' is not defined

In [32]:
score1 = amex_metric(pd.DataFrame(test_label),  pd.DataFrame(prediction, columns= ['prediction']))

In [33]:
score1

0.735614288262392

In [34]:
start_time = time.time()
data_balanced, unimpt_l1, impt_l2, std = 'y','n','n','n'
for x in [0,10,20,30,40,50]:
    start_time = time.time()
    ame = amex_default_prediction(x,unimpt_l1, impt_l2, std)
    ame.fit(training_feature_B, training_label_B, GradientBoostingClassifier)
    prediction1 = ame.predict_proba(test_feature)
    score1 = M(test_label, prediction1)
    record(data_balanced,unimpt_l1, impt_l2, std,x,choose_tn,score1)
    print(x, score1,time.time()-start_time)

G R L G R G L G G L G L 0 0.6236209036887282 6383.467397928238
G R L G R G L G G L G L 10 0.6408313532065635 7768.9975526332855
G R L G R G L G G L G L 20 0.6628432474365336 5409.229932308197
G R L G R G L G G L G L 30 0.6119195651577155 3049.318484067917
G R L G R G L G G L G L 40 0.6650684758781171 2511.714039325714
G 

NameError: name 'leng' is not defined

In [None]:
start_time = time.time()
data_balanced, unimpt_l1, impt_l2, std = 'y','y','n','n'
for x in [0,10,20,30,40,50]:
    start_time = time.time()
    ame = amex_default_prediction(x,unimpt_l1, impt_l2, std)
    ame.fit(training_feature_B, training_label_B, GradientBoostingClassifier)
    prediction1 = ame.predict_proba(test_feature)
    score1 = M(test_label, prediction1)
    record(data_balanced,unimpt_l1, impt_l2, std,x,choose_tn,score1)
    print(x, score1,time.time()-start_time)

In [None]:
start_time = time.time()
data_balanced, unimpt_l1, impt_l2, std = 'y','n','y','n'
for x in [0,10,20,30,40,50]:
    start_time = time.time()
    ame = amex_default_prediction(x,unimpt_l1, impt_l2, std)
    ame.fit(training_feature_B, training_label_B, GradientBoostingClassifier)
    prediction1 = ame.predict_proba(test_feature)
    score1 = M(test_label, prediction1)
    record(data_balanced,unimpt_l1, impt_l2, std,x,choose_tn,score1)
    print(x, score1,time.time()-start_time)

### predict

In [None]:
training_feature, training_label = get_balanced_data(0,40,40)
training_feature_std = stder(training_feature)

In [None]:
ame = amex_default_prediction(10,'y','n', 'n')
ame.fit(training_feature, training_label,GradientBoostingClassifier)

In [None]:
f = pd.DataFrame(columns = ['customer_ID','prediction'])
f.to_csv('prediction.csv', index = False)
for x in range(0,57):
    df_test = pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\transformed_test\test_{0}.csv'.format(x))
    feature = df_test.iloc[:,1:]
    customer_id = df_test['customer_ID']
    prediction = ame.predict_proba(feature)
    output = pd.DataFrame(columns = ['customer_ID','prediction'])
    output['customer_ID'] = customer_ida
    output['prediction'] = prediction
    old_file = pd.read_csv('prediction.csv')
    new_file = pd.concat([old_file,output], axis = 0)
    new_file.to_csv('prediction.csv', index = False)
    print(x, end= ' ')

In [None]:
i = new_file.drop_duplicates(subset='customer_ID')

In [None]:
i.to_csv('prediction.csv', index = False)

In [None]:
i

In [None]:
new_file = pd.DataFrame(columns = ['customer_ID','prediction'])
new_file.to_csv('prediction.csv', index = False)

In [None]:
i['prediction'].value_counts()