In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense
from keras.models import Sequential, load_model
# from keras.utils import to_categorical
# from keras.callbacks import EarlyStopping

# from imblearn.over_sampling import SMOTE, RandomOverSampler

# Train model

In [111]:
GOAL = ['target']
FEATURES = ['CreditScore', 'Geography', 'Age', 'Tenure',
            'Balance', 'NumOfProducts', 'HasCrCard',
            'IsActiveMember', 'EstimatedSalary'
           ]
IGNORED_FEATURES = [] # a placeholder for irrelevant features
TRAIN_FEATURES = list(set([c for c in FEATURES if c not in IGNORED_FEATURES]))
NUMERICAL_FEATURES = ['CreditScore', 'Age', 'Tenure', 'Balance',
                      'NumOfProducts', 'HasCrCard', 'IsActiveMember',
                      'EstimatedSalary'] # requires imputation
CATEGORICAL_FEATURES = ['Geography', 'Gender'] # requires label encoding followed by imputation

over_sampling = False

In [112]:
def run_load_data():
    print ('load_data - START')
    df = pd.read_csv('../../data/data_binary_classification_train.csv')
    print ('load_data - END')
    return df

In [113]:
def run_train_test_split(df):
    print ('train_test_split - START')
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    print ('train_test_split - END')
    return train_df, test_df

In [114]:
def run_pre_processing(train_df, test_df):
    print ('pre_processing - START')  
    # Fill NULL with Mean / Median / -1
    for f in NUMERICAL_FEATURES:
        # Imputation
        imp = Imputer(missing_values='NaN', strategy='median',axis=0, copy=False)
        train_df[f] = imp.fit_transform(train_df[f].values.reshape(-1,1))
        test_df[f] = imp.fit_transform(test_df[f].values.reshape(-1,1))
    
    # Pre-processing non-numeric values using numeric encoding, followed by imputation of most_frequent value
    # Why use numeric encoding over one hot encoding:
    # https://medium.com/data-design/visiting-categorical-features-and-encoding-in-decision-trees-53400fa65931
    # Encode using .cat.codes or LabelEncoder:
    # http://pbpython.com/categorical-encoding.html
    for f in CATEGORICAL_FEATURES:
        # Numerical Encoding
        train_df[f] = train_df[f].astype('category').cat.codes
        test_df[f] = test_df[f].astype('category').cat.codes
#         lbl = LabelEncoder()
#         train_df[f] = lbl.fit_transform(train_df[f].reshape(-1,1))
#         test_df[f] = lbl.fit_transform(test_df[f].reshape(-1,1))
        # Imputation
        imp = Imputer(missing_values='NaN', strategy='most_frequent',axis=0, copy=False)
        train_df[f] = imp.fit_transform(train_df[f].values.reshape(-1,1))
        test_df[f] = imp.fit_transform(test_df[f].values.reshape(-1,1))

    for f in TRAIN_FEATURES:
        # Scaling of both numerical and categorical features
        sc = StandardScaler()
        train_df[f] = sc.fit_transform(train_df[f].values.reshape(-1,1))
        test_df[f] = sc.fit_transform(test_df[f].values.reshape(-1,1))
    print ('pre_processing - END') 
    return train_df, test_df

In [115]:
def run_x_y_split(train_df, test_df):
    print ('x_y_split - START')
    X_train = train_df[TRAIN_FEATURES]
    y_train = train_df[GOAL]
    X_test = test_df[TRAIN_FEATURES]
    y_test = test_df[GOAL]
    n_input_nodes = X_train.shape[1] # Number of input variables
    print ('x_y_split - END')
    return X_train, y_train, X_test, y_test, n_input_nodes

In [116]:
def run_over_sampling(X_train, y_train):
    print ('over_sampling - START')
#     TODO: Figure how to oversample using SMOTE 
#     X_train, y_train = SMOTE().fit_sample(X_train, y_train)
#     X_train, y_train = RandomOverSampler().fit_sample(X_train, y_train)
    print ('over_sampling - END')
    return X_train, y_train

In [117]:
def run_grid_search(X_train, y_train, n_input_nodes):
    print ('grid_search - START')

    ann_param_grid = {'batch_size': [25, 32],
                      'epochs': [20, 30],
                      'optimizer': ['adam', 'rmsprop']
                     }

    def build_classifier(optimizer):
        classifier = Sequential()
        # Rule of thumb: number of nodes for hidden layers = (n_input_nodes + 1) * 0.5
        classifier.add(Dense(units = int(round(((n_input_nodes+1)*0.5),0)), kernel_initializer = 'uniform', activation = 'relu', input_dim = n_input_nodes))
        classifier.add(Dense(units = int(round(((n_input_nodes+1)*0.5),0)), kernel_initializer = 'uniform', activation = 'relu'))
        classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
        classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
        return classifier
    
    ann = KerasClassifier(build_fn = build_classifier)

    grid = GridSearchCV(estimator = ann, param_grid = ann_param_grid,
                               scoring = 'accuracy', cv = 5)
    
    grid = grid.fit(X_train, y_train)

    # Print the best parameters & metric
    print("Best parameters found: ", grid.best_params_)
    print("Best AUC found: ", grid.best_score_)
    
    best_batch_size = grid.best_params_["batch_size"]
    best_nb_epoch = grid.best_params_["epochs"]
    best_optimizer = grid.best_params_["optimizer"]    

    print ('grid_search - END')    
    return best_batch_size, best_nb_epoch, best_optimizer

In [118]:
def run_cross_validation(X_train, y_train, n_input_nodes, best_batch_size, best_nb_epoch, best_optimizer):
    print ('cross_validation - START')
    
    def build_classifier():
        classifier = Sequential()
        # Rule of thumb: number of nodes for hidden layers = (n_input_nodes + 1) * 0.5
        classifier.add(Dense(units = int(round(((n_input_nodes+1)*0.5),0)), kernel_initializer = 'uniform', activation = 'relu', input_dim = n_input_nodes))
        classifier.add(Dense(units = int(round(((n_input_nodes+1)*0.5),0)), kernel_initializer = 'uniform', activation = 'relu'))
        classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
        classifier.compile(optimizer = best_optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
        return classifier
    
    ann = KerasClassifier(build_fn = build_classifier, batch_size = best_batch_size, epochs = best_nb_epoch)
    accuracies = cross_val_score(estimator = ann, X = X_train, y = y_train, cv = 5)
    mean = accuracies.mean()
    std = accuracies.std()
    
    print "mean of accuracies for cross-validation on train dataset: ", round(mean, 2)
    print "standard deviation of accuracies for cross-validation on train dataset: ", round(std, 2)
    print ('cross_validation - END')    

In [119]:
def run_train_model(X_train, y_train, n_input_nodes, best_batch_size, best_nb_epoch, best_optimizer):
    print ('train_model - START')
    
    # Build Model
    classifier = Sequential()
    classifier.add(Dense(units = int(round(((n_input_nodes+1)*0.5),0)), kernel_initializer = 'uniform', activation = 'relu', input_dim = n_input_nodes))
    classifier.add(Dense(units = int(round(((n_input_nodes+1)*0.5),0)), kernel_initializer = 'uniform', activation = 'relu'))
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    classifier.compile(optimizer = best_optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])    
    
    # Fit Model
    classifier.fit(X_train, y_train, batch_size = best_batch_size, nb_epoch = best_nb_epoch)
    
    print ('train_model - END')   
    return classifier

In [120]:
def run_prediction(classifier, X_test):
    print ('run_prediction - START')
    prediction_probability = classifier.predict(X_test)
    prediction = (prediction_probability > 0.5) # Higher the threshold, more conservative in prediction
    print ('run_prediction - END')  
    return prediction

In [121]:
def run_model_evaluation_on_test(prediction, y_test):
    print ('model_evaluation_on_test - START')
    
    cm = confusion_matrix(y_test, prediction)
    print "model confusion matrix: ", cm
    
    prediction = pd.DataFrame(data = prediction, index=y_test.index, columns = ['target'])
    model_evaluation_metric = accuracy_score(y_test, prediction)
    print "model accuracy on test dataset: ", model_evaluation_metric

    print ('model_evaluation_on_test - END')

In [122]:
def run_save_model(classifier):
    print ('save_model - START')
    classifier.save('trained_models/ann_binary_classification.h5')
    classifier.summary()
    print ('save_model - END')

In [123]:
def ann_binary_classification_train():
    start_time = time.time()
    df = run_load_data()
    train_df, test_df = run_train_test_split(df) # Don't have to run this if you given data is already splitted into train & test
    train_df, test_df = run_pre_processing(train_df, test_df) 
    X_train, y_train, X_test, y_test, n_input_nodes = run_x_y_split(train_df, test_df)
    if over_sampling:
        X_train, y_train = run_over_sampling(X_train, y_train)
    best_batch_size, best_nb_epoch, best_optimizer = run_grid_search(X_train, y_train, n_input_nodes)
#     # To check accuracy, run either Grid Search or Cross Validation as Cross Validation is already embedded in Grid Search
#     # Cross validation here helps to identify overfitting if mean of accuracy is high and variance of accuracy is high
    run_cross_validation(X_train, y_train, n_input_nodes, best_batch_size, best_nb_epoch, best_optimizer)
    classifier = run_train_model(X_train, y_train, n_input_nodes, best_batch_size, best_nb_epoch, best_optimizer)
    prediction = run_prediction(classifier, X_test)
    run_model_evaluation_on_test(prediction, y_test)
    run_save_model(classifier)
    print('elapsed_time (in seconds): ', time.time() - start_time)
ann_binary_classification_train()

load_data - START
load_data - END
train_test_split - START
train_test_split - END
pre_processing - START


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


pre_processing - END
x_y_split - START
x_y_split - END
grid_search - START
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
E

Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30


Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Ep

Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoc

Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoc

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
('Best parameters found: ', {'epochs': 30, 'optimizer': 'adam', 'batch_size': 32})
('Best AUC found: ', 0.842375)
grid_search - END
cross_validation - START
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/

Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
mean of accuracies for cross-validation on train dataset:  0.83
standard deviation of accuracies for cross-validation on train dataset:  0.01
cross_validation - END
train_model - START
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
train_model - END
run_prediction - START
run_predicti

# Predicting outcome using trained model

In [132]:
FEATURES = ['CreditScore', 'Geography', 'Age', 'Tenure',
            'Balance', 'NumOfProducts', 'HasCrCard',
            'IsActiveMember', 'EstimatedSalary'
           ]
IGNORED_FEATURES = [] # a placeholder for irrelevant features
TRAIN_FEATURES = list(set([c for c in FEATURES if c not in IGNORED_FEATURES]))
NUMERICAL_FEATURES = ['CreditScore', 'Age', 'Tenure', 'Balance',
                      'NumOfProducts', 'HasCrCard', 'IsActiveMember',
                      'EstimatedSalary'] # requires imputation
CATEGORICAL_FEATURES = ['Geography', 'Gender'] # requires label encoding followed by imputation

over_sampling = False

In [133]:
def run_load_data():
    print ('load_data - START')
    df = pd.read_csv('../../data/data_binary_classification_test.csv')
    print ('load_data - END')
    return df

In [146]:
def run_pre_processing(df):
    print ('pre_processing - START')  
    # Extract uuid
    uuid = pd.DataFrame(df.loc[:,'uuid'], columns = ['uuid']) # Extract uuid column
    
    # Fill NULL with Mean / Median / -1
    for f in NUMERICAL_FEATURES:
        # Imputation
        imp = Imputer(missing_values='NaN', strategy='median',axis=0, copy=False)
        df[f] = imp.fit_transform(df[f].values.reshape(-1,1))

    # Pre-processing non-numeric values using numeric encoding, followed by imputation of most_frequent value
    # Why use numeric encoding over one hot encoding:
    # https://medium.com/data-design/visiting-categorical-features-and-encoding-in-decision-trees-53400fa65931
    # Encode using .cat.codes or LabelEncoder:
    # http://pbpython.com/categorical-encoding.html
    for f in CATEGORICAL_FEATURES:
        # Numerical Encoding
        df[f] = df[f].astype('category').cat.codes
#         lbl = LabelEncoder()
#         data[f] = lbl.fit_transform(data[f].reshape(-1,1))
        # Imputation
        imp = Imputer(missing_values='NaN', strategy='most_frequent',axis=0, copy=False)
        df[f] = imp.fit_transform(df[f].values.reshape(-1,1))

    for f in TRAIN_FEATURES:
        # Scaling of both numerical and categorical features
        sc = StandardScaler()
        df[f] = sc.fit_transform(df[f].values.reshape(-1,1))
        
    print ('pre_processing - END') 
    
    return df, uuid

In [141]:
def run_x_split(df):
    print ('x_split - START')  
    X = df[TRAIN_FEATURES]
    print ('x_split - END')  
    return X

In [142]:
def run_load_model():
    print ('load_model - START')  
    classifier = load_model('trained_models/ann_binary_classification.h5')
    print ('load_model - END')  
    return classifier

In [143]:
def run_prediction(classifier, X):
    print ('prediction - START')
    prediction_probability = classifier.predict(X)
    prediction = (prediction_probability > 0.5) # Higher the threshold, more conservative in prediction
    prediction = pd.DataFrame(prediction*1, columns=['prediction']) # Multiply by 1 to change T/F to 1/0
    print ('prediction - END')  
    return prediction

In [144]:
def run_save_prediction(uuid, prediction):
    print ('save_prediction - START')
    prediction = pd.concat([uuid, prediction], axis=1)
    prediction.to_csv('output/prediction_ann_binary_classification.csv', sep=',', index=False)
    print ('save_prediction - END')

In [145]:
def ann_binary_classification_test():
    start_time = time.time()
    df = run_load_data()
    df, uuid = run_pre_processing(df)
    X = run_x_split(df)
    classifier = run_load_model()
    prediction = run_prediction(classifier, X)
    run_save_prediction(uuid, prediction)
    print('elapsed_time (in seconds): ', time.time() - start_time)
ann_binary_classification_test()

load_data - START
load_data - END
pre_processing - START
pre_processing - END
x_split - START
x_split - END
load_model - START
load_model - END
prediction - START
prediction - END
save_prediction - START
save_prediction - END
('elapsed_time (in seconds): ', 14.048269987106323)
