In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score 
import time
import csv

In [None]:
def gini(list_of_values):
    sorted_list = sorted(list(list_of_values))
    height, area = 0, 0
    for value in sorted_list:
        height += value
        area += height - value / 2.
    fair_area = height * len(list_of_values) / 2
    return (fair_area - area) / fair_area
  
def normalized_gini(y_pred, y):
    normalized_gini = gini(y_pred)/gini(y)
    return normalized_gini
    

In [2]:
def level_1_predictor(t_train,y, model):
    leng = int(len(t_train)/2)
    t_a, t_b= t_train[:leng], t_train[leng:]
    tg_a, tg_b = y[:leng], y[leng:]
    if model == LogisticRegression:
        m1 = model(max_iter = 20**9)
        m2 = model(max_iter = 20**9)
    else:
        m1 = model()
        m2 = model()

    m1.fit(t_a,tg_a)
    tp_b = [i[1] for i in m1.predict_proba(t_b)]
    m2.fit(t_b,tg_b)
    tp_a = [i[1] for i in m2.predict_proba(t_a)]
    tp = [x for x in np.hstack((tp_a,tp_b))]
    
    return tp

In [3]:
def retrieve_important_feature(s,m):
    file = pd.read_csv('important_l1.csv')
    feature_name = file.columns
    if m=='G':
        importance = file.iloc[0]
    if m=='L':
        importance = file.iloc[1]
    itr = 0
    important_feature = []
    unimportant_feature = []
    for x in importance:
        if x>s:
            important_feature.append(feature_name[itr])
        else:
            unimportant_feature.append(feature_name[itr])
        itr+=1
    return important_feature, unimportant_feature

In [103]:
def categorize_important_feature(important_feature, unimportant_feature):
    delinquency=[]
    spend = []
    payment = []
    balance = []
    risk = []
    std = []
    mean = []
    _25 = []
    _50 = []
    _75 = []
    _min = []
    _max = []

    for x in important_feature:
        if x[0]=='D':
            delinquency.append(x)
        if x[0]=='S':
            spend.append(x)
        if x[0]=='P':
            payment.append(x)
        if x[0]=='B':
            balance.append(x)
        if x[0]=='R':
            risk.append(x)
        if x[-3:]=='std':
            std.append(x)
        elif x[-3:]=='25%':
            _25.append(x)
        elif x[-3:]=='75%':
            _75.append(x)
        elif x[-3:]=='50%':
            _50.append(x)
        elif x[-3:]=='min':
            _min.append(x)
        elif x[-3:]=='max':
            _max.append(x)
        else:
            mean.append(x)  
    feature_sets = {'delinquency':delinquency,'spend':spend,'payment':payment,'balance':balance,'risk':risk,'std':std,
                    'mean':mean,'_25':_25,'_50':_50,'_75':_75,'_min':_min,'_max':_max, 'unimportant_fe':unimportant_feature}
    return feature_sets

In [6]:
def stacker(feature,np_label,feature_sets,model_dic,m='G'):
    column_name = list(feature_sets.keys())
    if m=='G':
        r = pd.read_csv('l1_stacking_model_G.csv')
    if m=='L':
        r = pd.read_csv('l1_stacking_model_L.csv')
    stack_feature = pd.DataFrame(columns = column_name)
    fitted_model = {}
    for x in column_name:
        best_stacking_model = model_dic[r[x].value_counts().keys()[0]]
        stacking_feature = np.array(feature[feature_sets[x]])
        stack_feature[x] = level_1_predictor(stacking_feature,np_label, best_stacking_model)
        if best_stacking_model==LogisticRegression:
            md = best_stacking_model(max_iter = 20**9)
            md.fit(stacking_feature, np_label)
            fitted_model[x]=md
        else:
            md = best_stacking_model()
            md.fit(stacking_feature, np_label)
            fitted_model[x]=md
        print(best_stacking_model.__name__[0], end = ' ')
    return stack_feature, fitted_model

In [7]:
def stacker_for_testSet(feature,feature_sets,fitted_models_l1):
    column_name = list(feature_sets.keys())
    stack_feature = pd.DataFrame(columns = column_name)
    for x in column_name:
        i = np.array(feature[feature_sets[x]])
        stack_feature[x] = [x[1] for x in fitted_models_l1[x].predict_proba(i)]
    return stack_feature

In [110]:
class amex_default_prediction:
    def __init__(self,n, unimpt_l1='n', impt_l2 = 'n', std = True):
        self._feature_importance = pd.read_csv('important_l1.csv')
        self._feature_name = self._feature_importance.columns
        self._impt_fe, self._unimpt_fe= retrieve_important_feature(n,'G')
        self._impt_fe_train = None
        self._std = std
        self._impt_l2 = impt_l2
        if unimpt_l1 == 'y':
            self._feature_sets = categorize_important_feature(self._impt_fe, self._unimpt_fe)
        else:
            self._feature_sets = categorize_important_feature(self._impt_fe, 0)
        self._models = [LogisticRegression,GradientBoostingClassifier,RandomForestClassifier]
        self._model_dic = {}
        for x in self._models:
            self._model_dic[x.__name__] = x
        self._train_l1 = None
        self._feature_l2 = None
        self._label = None
        self._stacking_train = None
        self._fitted_models_l1 = None
        self._fitted_model_l2 = None
        self._test_l1 = None
        self._test_l2 = None
        self._stacked_test = None
        self._prediction = None
    def fillNa(self, data):
        feature = data
        for x in data.columns:
            feature[x]=feature[x].fillna(feature[x].mean())
        return feature
    def fit(self, train, label, model):
        self._train_l1 = self.fillNa(train)
        if self._std == True:
            self._train_l1 = stder(self._train_l1)
        self._label = np.array(label).T
        self._stacking_train, self._fitted_models_l1 = stacker(self._train_l1,self._label,self._feature_sets, self._model_dic)
        if self._impt_l2 =='y':         
            self._impt_fe_train = self._train_l1[self._impt_fe]
            self._train_l2 = pd.concat([self._impt_fe_train,self._stacking_train], axis = 1)        
            self._fitted_model_l2 = model()
            self._fitted_model_l2.fit(self._train_l2, self._label)
        else:
            self._train_l2 = self._stacking_train
            self._fitted_model_l2 = model()
            self._fitted_model_l2.fit(self._train_l2, self._label)
    def predict_proba(self, test):
        self._test_l1 = self.fillNa(test)
        if self._std == True:
            self._test_l1 = stder(self._test_l1)
        self._stacked_test = stacker_for_testSet(self._test_l1,self._feature_sets,self._fitted_models_l1)
        if self._impt_l2 == 'y':            
            self._impt_fe_test = self._test_l1[self._impt_fe]
            self._test_l2 = pd.concat([self._impt_fe_test,self._stacked_test], axis = 1)
            self._prediction = self._fitted_model_l2.predict_proba(self._test_l2)[:,1]
        else:
            self._test_l2 = self._stacked_test
            self._prediction = self._fitted_model_l2.predict_proba(self._test_l2)[:,1]
        return self._prediction 

In [97]:
def default_rate_captured(label, prediction):
    df = pd.DataFrame(columns = ['label', 'prediction'])
    df['label'] = label
    df['prediction'] = prediction
    numb_of_default = df.loc[df['label']==1]
    default_caught = numb_of_default.loc[numb_of_default['prediction']>0.5]
    return default_caught.shape[0]/numb_of_default.shape[0]
def gini(list_of_values):
    sorted_list = sorted(list(list_of_values))
    height, area = 0, 0
    for value in sorted_list:
        height += value
        area += height - value / 2.
    fair_area = height * len(list_of_values) / 2
    return (fair_area - area) / fair_area
def normalized_gini(y,y_pred):
    normalized_gini = gini(y_pred)/gini(y)
    return normalized_gini    
def M(label, prediction):
    return (normalized_gini(label,prediction)+default_rate_captured(label, prediction))/2

### test

In [8]:
from sklearn.preprocessing import StandardScaler
def stder(X):
    cols = X.columns
    stder = StandardScaler(copy=True, with_mean = True, with_std = True)
    stder.fit(X)
    return pd.DataFrame(stder.transform(X), columns = cols)

In [9]:
def get_data(a,b,n):
    shuffled_indices = np.random.permutation([x for x in range(a,b)])
    file = []
    for x in shuffled_indices[:n]:
        file.append(pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'.format(x)))
    data_set = pd.concat(file, axis = 0)
    data_set.index = [x for x in range(data_set.shape[0])]
    feature = data_set.iloc[:, 1:-1]
    label = data_set['target']
    return feature, label

In [89]:
def get_balanced_data(a,b,n):
    shuffled_indices_0 = np.random.permutation([x for x in range(a,b)])
    file = []
    for x in shuffled_indices_0[:n]:
        file.append(pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'.format(x)))
    data_set = pd.concat(file, axis = 0)
    data_set.index = [x for x in range(data_set.shape[0])]
    defualted = data_set.loc[data_set['target']==1]
    undefualted = data_set.loc[data_set['target']==0]
    shuffled_indices_1 = np.random.permutation(undefualted.shape[0])[:defualted.shape[0]]
    pick_undefualted = undefualted.iloc[shuffled_indices_1]
    temp = pd.concat([defualted, pick_undefualted], axis = 0)
    shuffled_indices_2 = np.random.permutation(temp.shape[0])
    df = temp.iloc[shuffled_indices_2]
    df.index = [x for x in range(len(df))]
    feature = df.iloc[:,1:-1]
    label = df['target']
    return feature, label

In [111]:
training_feature, training_label = get_balanced_data(0,40,20)
training_feature_std = stder(training_feature)

In [100]:
test_feature, test_label = get_data(20,40,10)
test_feature_std = stder(test_feature)

In [104]:
start_time = time.time()
ame = amex_default_prediction(10)
ame.fit(training_feature, training_label,GradientBoostingClassifier)
print(time.time()-start_time)

G G L G R G G L G L G L G 4397.90585064888


In [106]:
start_time = time.time()
prediction1 = ame.predict_proba(test_feature)
score1 = M(test_label, prediction1[:,1])
print(score1, time.time()-start_time)

0.859388492127309 362.1442425251007


In [89]:
#0.8525062807324651
#0.8073017488850901

In [None]:
ame_std = amex_default_prediction(10)
ame_std.fit(training_feature_std, training_label,GradientBoostingClassifier )

In [None]:
start_time = time.time()
prediction2 = ame_std.predict(test_feature_std)
score2 = roc_auc_score(test_label, prediction2)
print(score2, time.time()-start_time)

In [None]:
df_score = pd.DataFrame(columns = ['G','L'])

In [None]:
for x in range(3,15):
    start_time = time.time()
    ame_i = amex_default_prediction(x)
    ame_i.fit(training_feature,training_label,GradientBoostingClassifier)
    prediction = ame_i.predict(test_feature)
    score_i = roc_auc_score(test_label, prediction)
    df_score.loc[x,'G'] = score_i
    print(x,time.time()-start_time)

### predict

In [None]:
training_feature, training_label = get_balanced_data(0,40,20)
training_feature_std = stder(training_feature)

In [None]:
ame = amex_default_prediction(9,'y','n', True)
ame.fit(training_feature, training_label,GradientBoostingClassifier)

In [None]:
f = pd.DataFrame(columns = ['customer_ID','prediction'])
f.to_csv('prediction.csv', index = False)
for x in range(0,57):
    df_test = pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\transformed_test\test_{0}.csv'.format(x))
    feature = df_test.iloc[:,1:]
    customer_id = df_test['customer_ID']
    prediction = ame.predict_proba(feature)
    output = pd.DataFrame(columns = ['customer_ID','prediction'])
    output['customer_ID'] = customer_id20
    output['prediction'] = prediction
    old_file = pd.read_csv('prediction.csv')
    new_file = pd.concat([old_file,output], axis = 0)
    new_file.to_csv('prediction.csv', index = False)
    print(x, end= ' ')

In [None]:
i = new_file.drop_duplicates(subset='customer_ID')

In [None]:
i.to_csv('prediction.csv', index = False)

In [43]:
i

Unnamed: 0,customer_ID,prediction
0,022038896e955592be9fd15a2c92280af0a6a6528b82e2...,0.007600
1,02cb0afa52e29623c2ee458f0833969ce67118f3fa079b...,0.477175
2,02cb2189daeb052be4fcd43e321729c6350baff1ddaf48...,0.059211
3,02cb3afa5b7bcf5c5a1ebba986aa6e8d3730e07b87c7ea...,0.006848
4,02cb3ff52cfc30d234d1d225be36f5ddcaf8a5135a98b9...,0.011097
...,...,...
13313,ff5077720c690e7e081bb98a27753bf2e753b493aa95d7...,0.409132
13314,ff4f8244df0510415856574029fe292cb59849bc0b1334...,0.138331
13315,fe79c3058dc562e4fa749d906d5e9896d77754eab075b5...,0.065636
13316,ff45089de47699e0400c84ef0d5c5db4e5d7c15edbd079...,0.063360


In [33]:
new_file = pd.DataFrame(columns = ['customer_ID','prediction'])
new_file.to_csv('prediction.csv', index = False)

In [24]:
i['prediction'].value_counts()

0    685425
1    239196
Name: prediction, dtype: int64