Let's first setup our imports

In [64]:
import pandas as pd
import numpy as np
from sklearn import cluster
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import pyplot

## Step 1: Import and format the data

Sklearn already has the Iris dataset built in, so all we have to do is import it!

In [65]:
iris = datasets.load_iris()

Define the "x" and "y" variables. *Remember*, y is our classifer, and x is our attributes.

In [66]:
x = pd.DataFrame(iris.data)
x.columns = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

## Step 2: Plot the data to estimate clusters

Plot the data using scatterplots - take a look at all the combinations of variables to get a feel for how the data is distributed. 

In [67]:
x.plot(kind='scatter',x='Sepal_Length',y='Petal_Length')

<matplotlib.axes._subplots.AxesSubplot at 0x11942cb10>

In [68]:
x.plot(kind='scatter',x='Sepal_Width',y='Petal_Length')

<matplotlib.axes._subplots.AxesSubplot at 0x1194e1bd0>

In [69]:
x.plot(kind='scatter',x='Sepal_Length',y='Sepal_Width')

<matplotlib.axes._subplots.AxesSubplot at 0x1195a5e90>

## Step 3: Cluster the data

Run the clustering analysis using scikit-learn.

*Hint*: Estimate the number of clusters, k, based on your visual examination of the distributions.

In [70]:
k = 3
kmeans = cluster.KMeans(n_clusters=k)
kmeans.fit(x)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

Compute the labels and centroids

In [71]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

## Step 4: Evaluate the clusters

First, go ahead and plot the results of your clustering analysis

Plot the clusters

In [72]:
for i in range(k):
    ds = iris[np.where(labels==i)]
    plt.plot(ds[:,0],ds[:,1],'o')
    lines = plt.plot(centroids[i,0],centroids[i,1],'kx')
    plt.setp(lines,ms=15.0)
    plt.setp(lines,mew=2.0)
plt.show()

TypeError: unhashable type: 'numpy.ndarray'

Plot the predicted vs actual classifcations to see how our clustering analysis compares

In [73]:
colormap = np.array(['red', 'blue', 'yellow'])

In [74]:
plt.subplot(1, 2, 1)
plt.scatter(x.Petal_Length, x.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Actual Classification')
 
plt.subplot(1, 2, 2)
plt.scatter(x.Petal_Length, x.Petal_Width, c=colormap[labels], s=40)
plt.title('K-Means Classification')

<matplotlib.text.Text at 0x119719490>

Check the centroids to see where each cluster is lying 

In [75]:
print(centroids)

[[ 5.9016129   2.7483871   4.39354839  1.43387097]
 [ 5.006       3.418       1.464       0.244     ]
 [ 6.85        3.07368421  5.74210526  2.07105263]]


Check the labels of the clusters

In [76]:
print(labels)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [77]:
predY = np.choose(labels, [1, 0, 2]).astype(np.int64)

Compute the accuracy score using scikit to see how accurate our analysis is

In [78]:
metrics.accuracy_score(y, predY)

0.89333333333333331

Compute the silhoutte coefficient to see how consistent our data is within the clusters

In [79]:
metrics.silhouette_score(y, predY, metric='euclidean')

0.70172421600536772

Calculate the Precision, Recall, and F - Score to see the test's accuracy

In [80]:
print(metrics.classification_report(y, predY))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.77      0.96      0.86        50
          2       0.95      0.72      0.82        50

avg / total       0.91      0.89      0.89       150



Compute the Confusion Matrix to test the performance of the clustering analysis

In [81]:
print(metrics.confusion_matrix(y, predY))

[[50  0  0]
 [ 0 48  2]
 [ 0 14 36]]
