In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score, confusion_matrix, classification_report, accuracy_score
from sklearn.mixture import GaussianMixture

from pandas.api.types import is_string_dtype
from pandas.api.types import is_categorical_dtype

In [None]:
def pre_processing(credit):
    """Dummy-code the string/categorical columns of ``credit``.

    Every column for which ``is_string_dtype`` is true is cast to the
    ``category`` dtype; all categorical columns are then expanded with
    ``pd.get_dummies``. The shape of the encoded frame is printed.

    Parameters
    ----------
    credit : pandas.DataFrame
        Input table. It is not modified; the conversion runs on a copy.

    Returns
    -------
    pandas.DataFrame
        The frame with each categorical column replaced by its dummy columns.
    """
    # Work on a copy so the caller's frame keeps its original dtypes.
    credit = credit.copy()

    categorical_cols = []
    for col in credit.columns:
        if is_string_dtype(credit[col]):
            credit[col] = credit[col].astype('category')
        # isinstance check replaces the deprecated is_categorical_dtype helper.
        if isinstance(credit[col].dtype, pd.CategoricalDtype):
            categorical_cols.append(col)

    credit_d = pd.get_dummies(credit, columns=categorical_cols)
    print(credit_d.shape)
    return credit_d

In [None]:
SEED = 166

# Load the credit table and dummy-code its categorical columns.
data = pd.read_csv("../credit.csv")
data = pre_processing(data)
predictors = data[data.columns.difference(["default"])]
target = data[["default"]].values.ravel()

# Split BEFORE scaling: the scaler must only see the training rows, otherwise
# the min/max of the held-out rows leak into the training features.
train_X_raw, test_X_raw, train_y, test_y = train_test_split(
    predictors,
    target,
    train_size=0.8,
    random_state=SEED,
    stratify=target,
)

scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_X_raw)
test_X = scaler.transform(test_X_raw)
# Retained in case other cells read the fully scaled matrix; it is produced
# with the train-fitted scaler, so it carries no test-set statistics.
predictors_scaled = scaler.transform(predictors)

# in clustering experiments, only use training set
data, label = train_X, train_y

In [None]:
# Model selection for the number of mixture components k, scored two ways:
# loss1 holds the BIC, loss2 holds the Silhouette Coefficient.
k_grid = np.arange(1, 62)
loss1 = np.zeros_like(k_grid, dtype=float)
loss2 = np.zeros_like(k_grid, dtype=float)
for slot in range(k_grid.size):
    k = k_grid[slot]
    print("k = ", k)
    GMM = GaussianMixture(n_components=k, random_state=SEED)
    pred = GMM.fit_predict(data)
    loss1[slot] = GMM.bic(data)
    # The silhouette is skipped for a single component; that slot stays 0.0.
    if k <= 1:
        continue
    loss2[slot] = silhouette_score(data, pred)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator


def _plot_k_selection(ks, scores, ylabel, title, out_path):
    """Draw one model-selection curve (score vs. k), save it and show it.

    Parameters
    ----------
    ks : array-like
        Values of k for the x axis.
    scores : array-like
        Score obtained for each entry of ``ks``.
    ylabel : str
        Label of the y axis.
    title : str
        Figure title.
    out_path : str
        File the figure is written to (300 dpi).
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(ks, scores, '-o')
    # One major tick every 2 units keeps the k axis readable.
    ax.xaxis.set_major_locator(MultipleLocator(2))
    ax.set_xlabel('k')
    ax.set_ylabel(ylabel)
    # Leave one unit of margin to the right of the largest k.
    ax.set_xlim(0, np.max(ks) + 1)
    ax.set_title(title)
    ax.grid()
    fig.savefig(out_path, dpi=300)
    plt.show()


# Plot loss vs k to find best k
_plot_k_selection(
    k_grid,
    loss1,
    ylabel='BIC',
    title='Select K for credit dataset:  Bayesian Information Criterion',
    out_path='../plots/gmm_credit_loss_1.png',
)

# The k = 1 entry is dropped: no silhouette was computed for it.
_plot_k_selection(
    k_grid[1:],
    loss2[1:],
    ylabel='Silhouette Coefficient',
    title='Select K for credit dataset: Silhouette Coefficient',
    out_path='../plots/gmm_credit_loss_2.png',
)

In [None]:
# Refit with two components and keep the cluster assignment of every sample.
GMM = GaussianMixture(n_components=2, random_state=SEED)
pred = GMM.fit_predict(data)

# Pearson correlation between each feature column and the cluster assignment.
correlation = [
    np.corrcoef(data[:, feature], pred)[0, 1]
    for feature in range(data.shape[1])
]
print(correlation)

# Feature indices ordered by ascending correlation, then the sorted values.
correlation_arr = np.array(correlation)
print(np.argsort(correlation_arr))
print(np.sort(correlation_arr))

In [None]:
# Number of training samples assigned to each GMM cluster.
# The cluster count comes from the fitted model, so the bins and ticks follow
# whatever n_components the GMM above was built with.
n_clusters = GMM.n_components

plt.figure()
# Bin edges sit on half-integers so each integer cluster id gets its own bar.
plt.hist(pred, bins=np.arange(0, n_clusters + 1) - 0.5, rwidth=0.5, zorder=2)
plt.xticks(np.arange(0, n_clusters))
plt.xlabel('Cluster label')
plt.ylabel('Number of samples')
plt.title('GMM - clustering for credit dataset')
plt.grid()
plt.savefig('../plots/gmm_credit_cluster.png', dpi=300)
plt.show()

In [None]:
# Agreement between the GMM clusters and the true labels.
print(completeness_score(label, pred), homogeneity_score(label, pred))

# Cluster ids are arbitrary, so comparing them directly with the class labels
# is meaningless. Map every cluster to its majority class first; the accuracy
# of the mapped labels is the cluster purity.
cluster_to_class = pd.crosstab(pred, label).idxmax(axis=1)
pred_as_class = pd.Series(pred).map(cluster_to_class).to_numpy()
print(accuracy_score(label, pred_as_class))

print(silhouette_score(data, pred))