In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve, learning_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score, confusion_matrix, classification_report, accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.random_projection import GaussianRandomProjection

from pandas.api.types import is_string_dtype
from pandas.api.types import is_categorical_dtype

In [2]:
def pre_processing(credit):
	# preprocessing - convert data type and dummy coding
	cols = credit.columns
	isCat_Index = list()
	for col in cols:
		if is_string_dtype(credit[col]):
			credit[col] = credit[col].astype('category')
		isCat_Index.append(is_categorical_dtype(credit[col]))
	
	credit_d = pd.get_dummies(credit, columns=list(cols[isCat_Index]))
	print(credit_d.shape)
	return credit_d

In [3]:
SEED = 166

# preprocessing: scaling the data and split dataset
data = pd.read_csv("../credit.csv")
data = pre_processing(data)
predictors = data[data.columns.difference(["default"])]
scaler = MinMaxScaler()
predictors_scaled = scaler.fit_transform(predictors)
target = data[["default"]].values.ravel()

train_X, test_X, train_y, test_y = train_test_split(predictors_scaled, target, train_size=0.8, random_state=SEED, stratify=target)

(1000, 62)


In [4]:
rca = GaussianRandomProjection(n_components=34, random_state=SEED)
rca.fit(train_X)
train_transform = rca.transform(train_X)
test_transform = rca.transform(test_X)

In [5]:
kmeans = KMeans(n_clusters=2, random_state=SEED)
kmeans.fit(train_transform)

cluster_train = kmeans.predict(train_transform)
cluster_test = kmeans.predict(test_transform)

cluster_train = np.eye(2)[cluster_train]
cluster_test = np.eye(2)[cluster_test]
print(cluster_train)
print(np.sum(cluster_train, 0))


train_X = np.concatenate([train_transform, cluster_train], axis=1)
test_X = np.concatenate([test_transform, cluster_test], axis=1)

print(train_X.shape)

[[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]
[428. 372.]
(800, 36)


In [6]:
# Grid Search the best parameters
mlp_model = MLPClassifier(random_state=SEED, max_iter=1000)
nodes = [(20), (40), (60), (80), (100), (120), (140), (160),
         (10, 10), (20, 20), (30, 30), (40, 40), (50, 50), (60, 60), (70, 70), (80, 80), (90, 90), (100, 100),
         (30, 10), (50, 30), (50, 10), (70, 50), (70, 30), (70, 10), (90, 70), (90, 50), (90, 30), (90, 10)]
batch_size = [50, 100]

tuned_parameters = {'hidden_layer_sizes': nodes, 'batch_size': batch_size}
clf = GridSearchCV(mlp_model, tuned_parameters, scoring="accuracy", n_jobs=-1, cv=5)
clf.fit(train_X, train_y)
print(clf.best_score_, clf.best_params_)

0.7262500000000001 {'batch_size': 100, 'hidden_layer_sizes': 160}


In [7]:
mlp_model = MLPClassifier(batch_size=100,hidden_layer_sizes=(160), random_state=SEED, max_iter=1000)

tuned_parameters = {'learning_rate_init': np.arange(0.0005, 0.01, 0.0005),
                    'activation': ["relu", "tanh", "logistic"]}
clf = GridSearchCV(mlp_model, tuned_parameters, scoring="accuracy", n_jobs=-1, cv=5)
clf.fit(train_X, train_y)
print(clf.best_score_, clf.best_params_)

0.7499999999999999 {'activation': 'logistic', 'learning_rate_init': 0.001}


In [8]:
import time

mlp_model = MLPClassifier(batch_size=100,hidden_layer_sizes=(160), activation='logistic', learning_rate_init=0.001, random_state=SEED, max_iter=1000)

t0_clock = time.process_time()
mlp_model.fit(train_X, train_y)
pred = mlp_model.predict(test_X)  # Predict with test set
t1_clock = time.process_time()
print("The training time for final selected model is " + str(t1_clock - t0_clock) + " seconds")
print(classification_report(test_y, pred, digits=4))

The training time for final selected model is 4.015625 seconds
              precision    recall  f1-score   support

           1     0.7682    0.8286    0.7973       140
           2     0.5102    0.4167    0.4587        60

    accuracy                         0.7050       200
   macro avg     0.6392    0.6226    0.6280       200
weighted avg     0.6908    0.7050    0.6957       200

