# 7b) Clustering

In [1]:
import sklearn
from sklearn.cluster import KMeans 
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
# imported explicitly: a bare `import sklearn` does not guarantee the preprocessing submodule is loaded
from sklearn.preprocessing import scale

#use pandas to load csv file
import pandas as pandas
#use numpy to calculate stuff
import numpy as np

### Data preparation

In [2]:
# Load both wine-quality tables (semicolon-separated CSV) and stack them into one frame.
# Red rows come first, white rows second; ignore_index=True renumbers the rows 0..n-1.
redwinedata = pandas.read_csv('data/winequality-red.csv', sep=';')
whitewinedata = pandas.read_csv('data/winequality-white.csv', sep=';')
# DataFrame.append was removed in pandas 2.0; pandas.concat performs the same row-wise stacking
# and is available in old and new pandas versions alike.
concat_data = pandas.concat([redwinedata, whitewinedata], ignore_index=True)

In [3]:
# Remove the 'quality' label so that only the measured features are clustered.
# errors='ignore' keeps this cell re-runnable: concat_data is overwritten here, so on a
# second run the column is already gone and a plain drop would raise a KeyError.
concat_data = concat_data.drop('quality', axis=1, errors='ignore')
winearray = concat_data.values
# Standardize the features (scale's defaults: zero mean, unit variance per column) so that
# no single feature dominates the distance computations of the clustering algorithms.
winearray_norm = scale(winearray)

### K-Means (with/without minibatch)

In [4]:
# Silhouette analysis for mini-batch K-Means, following
# http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
# Each entry is [number of clusters, mean silhouette score over all samples].
batchkmeans_sil_scores = []

for nr_clusters in range(2, 8):
    # Fixed seed (same value as the plain K-Means cell) so the scores are reproducible;
    # without it the initialisation and the mini-batch sampling differ on every run.
    clusterer = MiniBatchKMeans(n_clusters=nr_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(winearray_norm)
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    batchkmeans_sil_scores.append([nr_clusters, silhouette_avg])

print(batchkmeans_sil_scores)

[[2, 0.2554064489554629], [3, 0.23387486968415436], [4, 0.22123871017610744], [5, 0.1243876329737778], [6, 0.179467005043885], [7, 0.17701427984357385]]


In [5]:
# Silhouette analysis for plain K-Means, adapted from the scikit-learn example at
# http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
# Result rows have the form [cluster count, mean silhouette score].
kmeans_sil_scores = []

cluster_counts = range(2, 8)
for nr_clusters in cluster_counts:
    # random_state pins the centroid initialisation so repeated runs give the same labels
    clusterer = KMeans(random_state=10, n_clusters=nr_clusters)
    cluster_labels = clusterer.fit_predict(winearray_norm)
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    kmeans_sil_scores += [[nr_clusters, silhouette_avg]]

print(kmeans_sil_scores)

[[2, 0.2766102466709183], [3, 0.23507051305667923], [4, 0.24752476415510705], [5, 0.18031988651384204], [6, 0.18626179598840933], [7, 0.1742929153287071]]


### Average Linkage Agglomerative Clustering

In [23]:
# Average-linkage agglomerative clustering with Manhattan (cityblock) distances.
# Each entry of average_sil_scores is [number of clusters, mean silhouette score].
average_sil_scores = []

# scikit-learn 1.2 renamed the `affinity` argument to `metric` and 1.4 removed the old name.
# Pick whichever keyword the installed version accepts so the cell runs on old and new releases.
distance_kwarg = "metric" if "metric" in AgglomerativeClustering().get_params() else "affinity"

for nr_clusters in range(2, 8):
    clusterer = AgglomerativeClustering(linkage="average", n_clusters=nr_clusters, **{distance_kwarg: "cityblock"})
    cluster_labels = clusterer.fit_predict(winearray_norm)
    # The cluster sizes are printed as a sanity check: a high silhouette score is
    # meaningless if almost every sample ends up in the same cluster.
    print("Nr of samples per Cluster: "+str(np.bincount(cluster_labels)))
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    average_sil_scores.append([nr_clusters, silhouette_avg])

print(average_sil_scores)

Nr of samples per Cluster: [6496    1]
Nr of samples per Cluster: [6495    1    1]
Nr of samples per Cluster: [  29 6466    1    1]
Nr of samples per Cluster: [  28 6466    1    1    1]
Nr of samples per Cluster: [6466    7   21    1    1    1]
Nr of samples per Cluster: [6464    7   21    1    1    1    2]
[[2, 0.7827338879667882], [3, 0.7264973604547182], [4, 0.6177452770210612], [5, 0.6069273341713797], [6, 0.5192470945503151], [7, 0.4475012710042327]]


These scores look suspiciously good, which is why the number of samples per cluster is also printed above. It indeed becomes clear that the algorithm just puts nearly all samples into a single cluster, which is not the intended behaviour. With other affinity types, e.g. cosine, the results are very similar to K-Means:

In [24]:
# Average-linkage agglomerative clustering with cosine distances.
# This replaces the cityblock results stored in average_sil_scores; the summary table
# further down therefore reports these cosine-based scores.
average_sil_scores = []

# scikit-learn 1.2 renamed the `affinity` argument to `metric` and 1.4 removed the old name.
# Pick whichever keyword the installed version accepts so the cell runs on old and new releases.
distance_kwarg = "metric" if "metric" in AgglomerativeClustering().get_params() else "affinity"

for nr_clusters in range(2, 8):
    clusterer = AgglomerativeClustering(linkage="average", n_clusters=nr_clusters, **{distance_kwarg: "cosine"})
    cluster_labels = clusterer.fit_predict(winearray_norm)
    # Cluster sizes are printed to verify that the samples are not collapsed into one cluster.
    print("Nr of samples per Cluster: "+str(np.bincount(cluster_labels)))
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    average_sil_scores.append([nr_clusters, silhouette_avg])

print(average_sil_scores)

Nr of samples per Cluster: [4846 1651]
Nr of samples per Cluster: [2327 1651 2519]
Nr of samples per Cluster: [2519 1651 2052  275]
Nr of samples per Cluster: [2052 1651 2482  275   37]
Nr of samples per Cluster: [1679 1651 2482  275   37  373]
Nr of samples per Cluster: [2482 1651  373  275   37 1561  118]
[[2, 0.2662649268569845], [3, 0.19405970145838983], [4, 0.11613055600380129], [5, 0.08760391368250511], [6, 0.07933900330106827], [7, 0.09020528542502969]]


### Ward Linkage Agglomerative Clustering

In [25]:
# Ward-linkage agglomerative clustering for 2 to 7 clusters.
# Each entry of ward_sil_scores is [number of clusters, mean silhouette score].
ward_sil_scores = []

for nr_clusters in range(2, 8):
    # The original chained assignment also bound an unused `clustering` alias; only
    # `clusterer` is needed.
    clusterer = AgglomerativeClustering(linkage='ward', n_clusters=nr_clusters)
    cluster_labels = clusterer.fit_predict(winearray_norm)
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    ward_sil_scores.append([nr_clusters, silhouette_avg])

print(ward_sil_scores)

[[2, 0.2668673064992951], [3, 0.20047387790440627], [4, 0.20673579244071638], [5, 0.13432744813769548], [6, 0.1439585385849316], [7, 0.12321328875640453]]


### Summarize the results in a table

In [26]:
# Every *_sil_scores list holds [n_clusters, score] pairs for the same cluster counts.
# The counts are taken once (from the K-Means list); the scores become one column per algorithm.
table_index = [pair[0] for pair in kmeans_sil_scores]
table_kmeans = [pair[1] for pair in kmeans_sil_scores]
table_batchkmeans = [pair[1] for pair in batchkmeans_sil_scores]
table_ward = [pair[1] for pair in ward_sil_scores]
table_average = [pair[1] for pair in average_sil_scores]

score_columns = {
    'Clusters': table_index,
    'K-Means': table_kmeans,
    'Minibatch-K-Mean': table_batchkmeans,
    'Aggl. Ward': table_ward,
    'Aggl. Average': table_average,
}
df = pandas.DataFrame(score_columns).round(3)

# Show the columns in presentation order (average linkage before ward).
display_order = ['Clusters', 'K-Means', 'Minibatch-K-Mean', 'Aggl. Average', 'Aggl. Ward']
print(df[display_order])


   Clusters  K-Means  Minibatch-K-Mean  Aggl. Average  Aggl. Ward
0         2    0.277             0.255          0.266       0.267
1         3    0.235             0.234          0.194       0.200
2         4    0.248             0.221          0.116       0.207
3         5    0.180             0.124          0.088       0.134
4         6    0.186             0.179          0.079       0.144
5         7    0.174             0.177          0.090       0.123


Surprisingly, the silhouette scores were overall rather low and very similar across the algorithms, with K-Means having the best outcome while also being very efficient.

All the algorithms score best with two clusters, and the silhouette score tends to drop as more clusters are requested. This hints at the fact that the algorithms are able to pick up the difference between white and red wines.

### Basic statistics for K-Means
As the red wine and white wine data were appended to each other and never reordered, it is easy to check whether the clustering picked up the difference between red and white wines: split the resulting label array according to the sample sizes of the red and white wines.

The result shows very clearly that the clustering picked up the difference between red and white wines, because nearly all of the first 1599 samples (the sample size of the red wines) went into one cluster and nearly all of the rest into the second cluster.

In [35]:
# Re-fit K-Means with two clusters and compare the labels against the known wine colour.
clusterer = KMeans(n_clusters=2, random_state=10)
cluster_labels = clusterer.fit_predict(winearray_norm)
# Not used in this cell; kept in case later cells read it.
silhouette_avg = silhouette_score(winearray_norm, cluster_labels)

# The red rows were stacked before the white rows and never shuffled, so the first
# len(redwinedata) labels belong to red wines and the remainder to white wines.
nr_red_samples = len(redwinedata)
cluster_red = cluster_labels[:nr_red_samples]
cluster_white = cluster_labels[nr_red_samples:]

# minlength keeps one count per cluster even when a cluster does not occur in a slice.
nr_of_clusters = clusterer.n_clusters
print("Overall number of samples per Cluster: "+str(np.bincount(cluster_labels, minlength=nr_of_clusters)))
print("Number of samples per Cluster in Redwine only: "+str(np.bincount(cluster_red, minlength=nr_of_clusters)))
print("Number of samples per Cluster in Whitewine only: "+str(np.bincount(cluster_white, minlength=nr_of_clusters)))

Overall number of samples per Cluster: [1641 4856]
Number of samples per Cluster in Redwine only: [1575   24]
Number of samples per Cluster in Whitewine only: [  66 4832]


Here are some additional statistics about the two clusters. Nearly all of the features are distributed very differently in the two clusters.

In [34]:
# Put the cluster labels into their own one-column frame and attach it, column-wise,
# to the normalized feature matrix (which is wrapped in a DataFrame first).
cluster_labels_dataframe = pandas.DataFrame({'cluster_nr': cluster_labels})
concat_with_cluster_nr = pandas.concat(
    [pandas.DataFrame(winearray_norm), cluster_labels_dataframe],
    axis=1,
)


def _summarize_cluster(cluster_frame):
    """Return count/mean/std/min/max per feature, rounded to 2 decimals, without the label column."""
    stats = cluster_frame.describe().round(2)
    stats = stats.drop(['25%', '50%', '75%'])
    return stats.drop('cluster_nr', axis=1)


cluster1_data = concat_with_cluster_nr[concat_with_cluster_nr['cluster_nr'] == 0]
print("first cluster:")
print(_summarize_cluster(cluster1_data))

cluster2_data = concat_with_cluster_nr[concat_with_cluster_nr['cluster_nr'] == 1]
print("")
print("second cluster:")
print(_summarize_cluster(cluster2_data))

first cluster:
            0        1        2        3        4        5        6        7   \
count  1641.00  1641.00  1641.00  1641.00  1641.00  1641.00  1641.00  1641.00   
mean      0.83     1.17    -0.34    -0.59     0.92    -0.83    -1.19     0.68   
std       1.34     1.08     1.33     0.46     1.40     0.58     0.63     0.72   
min      -2.02    -1.33    -2.19    -0.98    -1.23    -1.66    -1.94    -1.63   
max       6.70     7.53     4.69    12.69    15.84     2.73     1.97    14.77   

            8        9        10  
count  1641.00  1641.00  1641.00  
mean      0.57     0.84    -0.07  
std       0.97     1.14     0.90  
min      -2.98    -1.42    -1.75  
max       4.92     9.87     3.70  

second cluster:
            0        1        2        3        4        5        6        7   \
count  4856.00  4856.00  4856.00  4856.00  4856.00  4856.00  4856.00  4856.00   
mean     -0.28    -0.40     0.11     0.20    -0.31     0.28     0.40    -0.23   
std       0.65     0.57     