In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score 
import time
import csv

In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of two weighted ranking measures.

    Both measures rank rows by descending ``prediction`` and weight every row
    whose ``target`` is 0 with 20 and every other row with 1.

    * normalized weighted Gini: Gini of the predictions divided by the Gini
      obtained when the true targets themselves are used as predictions.
    * top-4% capture: share of all ``target == 1`` rows found among the
      highest-ranked rows whose cumulative weight stays within 4% of the
      total weight.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column; joined to ``y_true`` column-wise,
        so both frames must share the same index.

    Returns
    -------
    float
        ``0.5 * (normalized_gini + top_four_percent_capture)``.
    """

    def _weighted_ranking(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, rank best-first, and attach the row weights.
        joined = pd.concat([labels, scores], axis='columns')
        ranked = joined.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _weighted_ranking(y_true, y_pred)
        weight_budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (is_positive & within_budget).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _weighted_ranking(y_true, y_pred)
        weight = ranked['weight']
        # Cumulative share of total weight seen so far (the "random" baseline).
        random_share = (weight / weight.sum()).cumsum()
        weighted_positives = ranked['target'] * weight
        # Cumulative share of weighted positives found so far (Lorenz curve).
        lorentz = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # A perfect model predicts the target itself.
        perfect_pred = y_true.rename(columns={'target': 'prediction'})
        achieved = weighted_gini(y_true, y_pred)
        best_possible = weighted_gini(y_true, perfect_pred)
        return achieved / best_possible

    gini_component = normalized_weighted_gini(y_true, y_pred)
    capture_component = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_component + capture_component)

In [4]:
def get_data(a, b, n,
             path_template=r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'):
    """Load ``n`` randomly chosen training chunks and split features/labels.

    Chunk ids are drawn without replacement from ``range(a, b)`` using NumPy's
    global random state, so seed it beforehand for reproducible sampling.

    Parameters
    ----------
    a, b : int
        Half-open range ``[a, b)`` of chunk ids to sample from.
    n : int
        Number of chunks to read (at most ``b - a`` are available).
    path_template : str, optional
        ``str.format`` template for a chunk's CSV path; ``{0}`` is replaced by
        the chunk id. Defaults to the original local training directory.

    Returns
    -------
    feature : pd.DataFrame
        Every column except the first and the last one.
        NOTE(review): presumably these are the id and ``target`` columns;
        confirm against the chunk schema.
    label : pd.DataFrame
        Single-column frame holding ``target``.

    Raises
    ------
    ValueError
        If no chunk is selected (``n <= 0`` or an empty id range).
    """
    chunk_ids = np.random.permutation(np.arange(a, b))[:n]
    frames = [pd.read_csv(path_template.format(chunk_id)) for chunk_id in chunk_ids]
    if not frames:
        raise ValueError('No chunks selected: check a, b and n')
    # ignore_index gives the combined frame a fresh 0..N-1 index.
    data_set = pd.concat(frames, axis=0, ignore_index=True)
    feature = data_set.iloc[:, 1:-1]
    label = data_set[['target']]
    return feature, label

In [5]:
def get_balanced_data(a, b, n, r=1,
                      path_template=r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_{0}.csv'):
    """Load ``n`` random training chunks and down-sample the ``target == 0`` rows.

    All rows with ``target == 1`` are kept. Rows with ``target == 0`` are
    randomly sampled without replacement so that at most ``r`` of them remain
    per ``target == 1`` row. The combined rows are then shuffled. Sampling
    uses NumPy's global random state.

    Parameters
    ----------
    a, b : int
        Half-open range ``[a, b)`` of chunk ids to sample from.
    n : int
        Number of chunks to read (at most ``b - a`` are available).
    r : int or float, optional
        Number of ``target == 0`` rows kept per ``target == 1`` row. The
        resulting count is truncated to an integer, so fractional ratios work.
    path_template : str, optional
        ``str.format`` template for a chunk's CSV path; ``{0}`` is replaced by
        the chunk id. Defaults to the original local training directory.

    Returns
    -------
    feature : pd.DataFrame
        Every column except the first and the last one, indexed 0..N-1.
    label : pd.DataFrame
        Single-column frame holding ``target``, aligned with ``feature``.

    Raises
    ------
    ValueError
        If no chunk is selected (``n <= 0`` or an empty id range).
    """
    chunk_ids = np.random.permutation(np.arange(a, b))[:n]
    frames = [pd.read_csv(path_template.format(chunk_id)) for chunk_id in chunk_ids]
    if not frames:
        raise ValueError('No chunks selected: check a, b and n')
    data_set = pd.concat(frames, axis=0, ignore_index=True)

    defaulted = data_set.loc[data_set['target'] == 1]
    not_defaulted = data_set.loc[data_set['target'] == 0]

    # int() keeps the slice bound valid when r is fractional; if fewer
    # non-defaulted rows exist than requested, all of them are kept.
    n_keep = int(len(defaulted) * r)
    keep_positions = np.random.permutation(len(not_defaulted))[:n_keep]
    balanced = pd.concat([defaulted, not_defaulted.iloc[keep_positions]], axis=0)

    # Shuffle so the two classes are interleaved, then renumber the rows.
    shuffled = balanced.iloc[np.random.permutation(len(balanced))].reset_index(drop=True)
    feature = shuffled.iloc[:, 1:-1]
    label = shuffled[['target']]
    return feature, label

In [6]:
def zero_mean(df):
    """Centre every column of a 2-D array on zero, in place.

    Despite its name, ``df`` is indexed as ``df[:, i]``, i.e. it is treated as
    a two-dimensional NumPy array. Each column has its own mean subtracted
    from it; the array is modified in place and also returned.
    """
    for col_idx in range(df.shape[1]):
        column = df[:, col_idx]  # a view: changes write through to ``df``
        column -= column.mean()
    return df

In [7]:
def fillna(df):
    """Return a copy of ``df`` with missing values replaced by column means.

    Each column's missing entries are filled with that column's own mean.
    The input frame is left untouched: the work is done on a copy, which
    also avoids ``SettingWithCopyWarning`` when ``df`` is a slice of another
    frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns support ``mean()``.

    Returns
    -------
    pd.DataFrame
        New frame with the same shape, index and columns as ``df``.
    """
    data = df.copy()
    for column in data.columns:
        data[column] = data[column].fillna(data[column].mean())
    return data

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Shared one-hot encoder for the label column: it is fitted on the training
# labels and then reused (transform only) for the test and validation labels.
OHE = OneHotEncoder()

In [12]:
# Training split: class-balanced sample built from all 30 chunks with ids 1-30.
feature_train, label_train = get_balanced_data(1, 31, 30)
# Dense one-hot labels; this call is also what fits the shared encoder.
Y_train = OHE.fit_transform(label_train).toarray()
# Features as a NumPy array, each column centred on its own mean.
X_train = zero_mean(np.array(feature_train))

In [13]:
# Rebind the training labels from a one-column DataFrame to a NumPy array;
# this array (not the one-hot Y_train) is what model.fit receives.
label_train = np.array(label_train)

In [14]:
# Hold-out split: unbalanced sample made of all 10 chunks with ids 41-50.
x_test, y_test = get_data(41, 51, 10)
# Dense one-hot labels produced by the encoder fitted on the training labels.
y_test = OHE.transform(y_test).toarray()
# Features as a NumPy array, centred with this split's own column means.
x_test = zero_mean(np.array(x_test))

In [15]:
# Validation split: class-balanced sample built from all 10 chunks with ids 31-40.
feature_val, label_val = get_balanced_data(31, 41, 10)
# Dense one-hot labels produced by the encoder fitted on the training labels.
y_val = OHE.transform(label_val).toarray()
# Features as a NumPy array, centred with this split's own column means.
x_val = zero_mean(np.array(feature_val))

In [16]:
# Rebind the validation labels from a one-column DataFrame to a NumPy array;
# this array (not the one-hot y_val) is passed as validation data to model.fit.
label_val = np.array(label_val)

In [117]:
# Feed-forward classifier over 1302 input features, assembled layer by layer.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(1302,)))
model.add(tf.keras.layers.BatchNormalization())
# Wide ELU hidden layer, re-normalised before the narrow sigmoid layer.
model.add(tf.keras.layers.Dense(935, activation='elu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(85, activation='sigmoid'))
# Two-way softmax head: one probability per class.
model.add(tf.keras.layers.Dense(2, activation='softmax'))

In [118]:
# Adam with a small learning rate. The sparse categorical cross-entropy loss
# expects integer class labels, matching the 0/1 targets passed to fit().
# `metrics` is given as a list, the documented form; newer Keras releases
# reject a bare string here.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              metrics=["accuracy"],
              loss='sparse_categorical_crossentropy')

In [119]:
# Train for 50 epochs on the balanced training split, evaluating on the
# balanced validation split after every epoch. The integer label arrays
# (label_train / label_val) are used rather than the one-hot versions, as
# required by the sparse categorical cross-entropy loss chosen at compile time.
model.fit(X_train,label_train,
                epochs=50,
                batch_size=540,
                shuffle=True,
                validation_data=(x_val,label_val))
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_26 (Flatten)        (None, 1302)              0         
                                                                 
 batch_normalization_52 (Bat  (None, 1302)             5208      
 chNormalization)                                       

 dense_80 (Dense)            (None, 2)                 172       
                                                                 
Total params: 1,306,985
Trainable params: 1,302,511
Non-trainable params: 4,474
_________________________________________________________________


In [120]:
# Keep only column 1 of the two-way softmax output, i.e. the predicted
# probability of class 1, for every row of the hold-out split.
prediction = model.predict(x_test)[:,1]

In [121]:
# Score the hold-out predictions. y_test is one-hot encoded, so its column 1 is
# used as the 0/1 target (presumably the indicator for class 1 -- this relies on
# the encoder having seen both classes when it was fitted).
amex_metric(pd.DataFrame(y_test[:,1], columns=['target']), pd.DataFrame(prediction, columns= ['prediction']))

0.7579620644031844

### Output: write test-set predictions to `prediction.csv`

In [654]:
# Score the 57 test chunks and stream the results into prediction.csv.
# Each chunk is appended to the file instead of re-reading and rewriting the
# whole file on every iteration, so the I/O cost stays linear in the number
# of rows. The per-chunk frames are also kept in memory and combined once at
# the end into `new_file`, which the following cell uses.

# Start from a header-only file so reruns do not append to stale results.
f = pd.DataFrame(columns = ['customer_ID','prediction'])
f.to_csv('prediction.csv', index = False)

chunk_outputs = []
for x in range(0,57):
    df_test = pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\transformed_test\test_{0}.csv'.format(x))
    # Everything after the first column is treated as model input.
    feature = df_test.iloc[:,1:]
    X_test = np.array(fillna(feature))
    # NOTE(review): each chunk is centred with its own column means, not the
    # means of the training data -- confirm this is intended.
    X_test = zero_mean(X_test)
    customer_id = df_test['customer_ID']
    # Column 1 of the softmax output is the predicted probability of class 1.
    prediction = model.predict(X_test)[:,1]
    output = pd.DataFrame({'customer_ID': customer_id, 'prediction': prediction})
    output.to_csv('prediction.csv', mode='a', header=False, index=False)
    chunk_outputs.append(output)
    print(x, end= ' ')

# Single concatenation of all chunks (avoids growing a frame inside the loop).
new_file = pd.concat(chunk_outputs, axis=0, ignore_index=True)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 

In [655]:
# Keep a single row per customer_ID (drop_duplicates keeps the first occurrence
# by default) and overwrite prediction.csv with the de-duplicated table.
i = new_file.drop_duplicates(subset='customer_ID')
i.to_csv('prediction.csv', index = False)