In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the "3d" projection on older Matplotlib)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score, confusion_matrix, classification_report, accuracy_score
from sklearn.cluster import KMeans

In [None]:
SEED = 77

# Preprocessing: load the table, split it, then scale using training statistics only.
wine_df = pd.read_csv("../winequality.csv")

# NOTE: Index.difference returns the remaining column labels *sorted*, so the
# predictor columns end up in alphabetical order, not in CSV order. Any
# positional indexing into the scaled arrays below must use this order
# (see `predictors.columns`).
predictors = wine_df[wine_df.columns.difference(["label"])]
target = wine_df["label"].to_numpy()

# Split BEFORE fitting the scaler: fitting MinMaxScaler on all rows would let the
# held-out rows influence the min/max used to scale the training data (leakage).
# The row partition depends only on the number of rows, `stratify` and
# `random_state`, so it is the same partition as splitting pre-scaled data.
train_raw, test_raw, train_y, test_y = train_test_split(
    predictors,
    target,
    train_size=0.8,
    random_state=SEED,
    stratify=target,
)

scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_raw)
test_X = scaler.transform(test_raw)

# Kept for backward compatibility: the full predictor matrix, scaled with the
# training-set min/max (held-out rows may therefore fall slightly outside [0, 1]).
predictors_scaled = scaler.transform(predictors)

# Cheap sanity checks on the split.
assert train_X.shape[0] + test_X.shape[0] == len(predictors)
assert train_X.shape[1] == predictors.shape[1]

# In the clustering experiments below, only the training set is used.
data, label = train_X, train_y

In [None]:
# Sweep the number of clusters and record two model-selection metrics per k:
#   loss1 -> KMeans inertia
#   loss2 -> silhouette coefficient (left at its initial 0 for k = 1, where it
#            is not computed)
k_grid = np.arange(1, 11)
n_candidates = k_grid.size
loss1 = np.zeros(n_candidates)
loss2 = np.zeros(n_candidates)
for idx in range(n_candidates):
    k = k_grid[idx]
    print ("k = ", k)
    kmeans = KMeans(n_clusters=k, random_state=SEED).fit(data)
    loss1[idx] = kmeans.inertia_
    if k <= 1:
        # A single cluster has no silhouette score; keep the placeholder.
        continue
    loss2[idx] = silhouette_score(data, kmeans.labels_)

In [None]:
def plot_k_selection(k_values, scores, ylabel, title, out_path, xticks=None):
    """Plot a model-selection metric against k, save the figure, and show it.

    Parameters
    ----------
    k_values : array-like
        Cluster counts placed on the x axis.
    scores : array-like
        Metric value for each entry of ``k_values``.
    ylabel : str
        Label of the y axis (name of the metric).
    title : str
        Figure title.
    out_path : str
        File the figure is written to (300 dpi). Its parent directory is
        created if it does not exist yet.
    xticks : array-like, optional
        Tick positions on the x axis; defaults to ``k_values``.
    """
    # savefig does not create missing directories, so make sure it exists.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(k_values, scores, '-o')
    ax.set_xticks(k_values if xticks is None else xticks)
    ax.set_xlabel('k')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True)
    fig.savefig(out_path, dpi=300)
    plt.show()


# Plot loss vs k to find best k
plot_k_selection(
    k_grid,
    loss1,
    ylabel='Inertia',
    title='Select K for wine dataset: Loss Function',
    out_path='../plots/kmeans_wine_loss_1.png',
)

# The silhouette score is only computed for k > 1, so the first entry is
# skipped; the ticks still span the whole k grid, as in the inertia plot.
plot_k_selection(
    k_grid[1:],
    loss2[1:],
    ylabel='Silhouette Coefficient',
    title='Select K for wine dataset: Silhouette Coefficient',
    out_path='../plots/kmeans_wine_loss_2.png',
    xticks=k_grid,
)

In [None]:
# Fit the final 2-cluster model on the training data and report its silhouette score.
kmeans = KMeans(n_clusters=2, random_state=SEED)
kmeans.fit(data)
print(silhouette_score(data, kmeans.labels_))

# Pearson correlation between each feature column and the cluster id, used to
# see which features separate the two clusters most strongly.
correlation = [
    np.corrcoef(data[:, col], kmeans.labels_)[0, 1]
    for col in range(data.shape[1])
]

# Print the values labelled by feature name: the columns of `data` follow
# `predictors.columns` (alphabetical, because of Index.difference), so a bare
# positional list is easy to misread against the CSV column order.
print(pd.Series(correlation, index=predictors.columns, name="corr_with_cluster"))

In [None]:
# 3-D scatter of the 2-cluster solution over three columns of the scaled training data.

# Column positions (x, y, z) into `data`. `data` has the same column order as
# `predictors`, so the axis labels are looked up from `predictors.columns`
# instead of being hard-coded: the predictor columns are alphabetically sorted
# (Index.difference), which hard-coded CSV-order names would not match.
FEATURE_AXES = (0, 7, 3)
# (cluster id, marker colour, legend entry)
CLUSTER_STYLES = (
    (0, 'blue', "wine - cluster 1"),
    (1, 'red', "wine - cluster 2"),
)
CLUSTER_PLOT_PATH = '../plots/kmeans_wine_cluster.png'

x_col, y_col, z_col = FEATURE_AXES
x_name, y_name, z_name = (str(predictors.columns[col]) for col in FEATURE_AXES)

fig = plt.figure()
# Create the 3-D axes through the figure so it is actually attached to it;
# constructing Axes3D(fig, ...) directly no longer adds the axes on current
# Matplotlib releases and leaves the figure empty.
ax = fig.add_axes([0, 0, 0.95, 1], projection="3d", elev=48, azim=134)

for cluster_id, color, legend_label in CLUSTER_STYLES:
    members = kmeans.labels_ == cluster_id
    ax.scatter(
        data[members, x_col],
        data[members, y_col],
        data[members, z_col],
        c=color,
        edgecolor="k",
        label=legend_label,
    )

ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)
ax.legend(loc='best')

# savefig does not create missing directories, so make sure it exists.
Path(CLUSTER_PLOT_PATH).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(CLUSTER_PLOT_PATH, dpi=300)
# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Compare the 2-cluster assignment with the true labels.
# KMeans cluster ids are arbitrary (cluster 0 is not tied to class 0), so both
# possible id -> class mappings are tried and the one that agrees better with
# `label` is kept. Ties keep the flipped mapping.
# NOTE(review): `1 - labels_` assumes `label` is coded as 0/1 — confirm against the CSV.
cluster_ids = kmeans.labels_
flipped_ids = 1 - cluster_ids
if accuracy_score(label, flipped_ids) >= accuracy_score(label, cluster_ids):
    predicted = flipped_ids
else:
    predicted = cluster_ids

print(label)
print(predicted)
# Completeness and homogeneity do not depend on how cluster ids are numbered,
# so the raw cluster ids are used here.
print(completeness_score(label, cluster_ids), homogeneity_score(label, cluster_ids))
print(accuracy_score(label, predicted))