# 7b) Clustering

In [66]:
import sklearn
#import plotly.plotly as py
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#from plotly import figure_factory as figureFact
from sklearn.cluster import KMeans 
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

#use pandas to load csv file
import pandas as pandas
#use numpy to calculate stuff
import numpy as np

# init offline plotlib
#init_notebook_mode(connected=True)

### Data preparation

In [67]:
#load data and merge both tables to one, ignore_index to reindex
redwinedata = pandas.read_csv('data/winequality-red.csv', sep =';')
whitewinedata = pandas.read_csv('data/winequality-white.csv', sep =';')

#simplified data for testing
#redwinedata = pandas.read_csv('data/red_onlysugar.csv', sep =';')
#whitewinedata = pandas.read_csv('data/white_onlysugar.csv', sep =';')

concat_data = redwinedata.append(whitewinedata, ignore_index=True)

In [68]:
# drop the quality label and data
concat_data = concat_data.drop('quality', axis=1)

In [69]:
#show the head of data
# concat_data.head()

In [70]:
winearray = concat_data.values
# print(winearray)

In [71]:
# normalize the data 
winearray_norm = sklearn.preprocessing.scale(winearray)

### K-Means

In [72]:
# http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

kmeans_sil_scores = []

for nr_clusters in range(2, 5):
    clusterer = KMeans(n_clusters=nr_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(winearray_norm)
    #print(cluster_labels)
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    #print("For n_clusters =", nr_clusters,"The average silhouette_score is :", silhouette_avg)
    kmeans_sil_scores.append([nr_clusters, silhouette_avg])

print(kmeans_sil_scores)

[[2, 0.9627657604238751], [3, 0.7111175344744713], [4, 0.6039490901736749]]


### Spectral Clustering

In [73]:
spectral_sil_scores = []

for nr_clusters in range(2, 5):
    clusterer = SpectralClustering(n_clusters=nr_clusters, eigen_solver=None)
    #clusterer = SpectralClustering(n_clusters=nr_clusters, eigen_solver='arpack')
    cluster_labels = clusterer.fit_predict(winearray_norm)
    #print(cluster_labels)
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    print("For n_clusters =", nr_clusters,"The average silhouette_score is :", silhouette_avg)
    spectral_sil_scores.append([nr_clusters, silhouette_avg])

print(spectral_sil_scores)


For n_clusters = 2 The average silhouette_score is : 0.9627657604238751
For n_clusters = 3 The average silhouette_score is : 0.7111175344744713
For n_clusters = 4 The average silhouette_score is : 0.2991327300150591
[[2, 0.9627657604238751], [3, 0.7111175344744713], [4, 0.2991327300150591]]


### Ward Agglomerative Clustering

In [74]:
ward_sil_scores = []

for nr_clusters in range(2, 5):
    clusterer =  clustering = AgglomerativeClustering(linkage='ward', n_clusters=nr_clusters)
    cluster_labels = clusterer.fit_predict(winearray_norm)
    #print(cluster_labels)
    silhouette_avg = silhouette_score(winearray_norm, cluster_labels)
    #print("For n_clusters =", nr_clusters,"The average silhouette_score is :", silhouette_avg)
    ward_sil_scores.append([nr_clusters, silhouette_avg])

print(ward_sil_scores)

[[2, 0.9627657604238751], [3, 0.7111175344744713], [4, 0.6039490901736749]]


### Summarize the results in a table

In [75]:
table_index = [];
table_kmeans = [];
for member in kmeans_sil_scores:
    table_index.append(member[0])
    table_kmeans.append(member[1])

    
table_spectral = [];
for member in spectral_sil_scores:
    table_spectral.append(member[1])  
    
table_ward = [];
for member in ward_sil_scores:
    table_ward.append(member[1])    
        

print(pandas.DataFrame({'clusters':table_index, 'k-means':table_kmeans, 'ward':table_ward, 'spectral':table_spectral}))


   clusters   k-means  spectral      ward
0         2  0.962766  0.962766  0.962766
1         3  0.711118  0.711118  0.711118
2         4  0.603949  0.299133  0.603949


Surprisingly, the quickest and simplest algorithm "k-means" was the best performing, although the performance of all three was very similar.

### Basic statistics for K-Mean Clusters

In [76]:
clusterer = KMeans(n_clusters=2, random_state=10)
cluster_labels = clusterer.fit_predict(winearray_norm)

#convert nparray to dataframe and merge with normalized data (converted to dataframe) 
cluster_labels_dataframe = pandas.DataFrame({'cluster_nr':cluster_labels})
concat_with_cluster_nr = pandas.concat([pandas.DataFrame(winearray_norm), cluster_labels_dataframe], axis=1)

cluster1_data = concat_with_cluster_nr[concat_with_cluster_nr.cluster_nr == 1]
print("first cluster:")
print(cluster1_data.describe().round(2).drop(['count', '25%', '50%', '75%']).drop('cluster_nr', axis=1))

cluster2_data = concat_with_cluster_nr[concat_with_cluster_nr.cluster_nr == 0]
print("")
print("second cluster:")
print(cluster2_data.describe().round(2).drop(['count', '25%', '50%', '75%']).drop('cluster_nr', axis=1))

first cluster:
         0
mean  1.00
std   0.10
min   0.89
max   1.13

second cluster:
         0
mean -1.00
std   0.01
min  -1.01
max  -0.98


More or less all of the features have very different data in both clusters. I assume it found out the difference between red and whitewine and separated it according to this.