In [167]:
%matplotlib notebook

from sklearn import datasets
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd



In [None]:
### Let's generate some synthetic data

In [170]:
X, y = datasets.make_blobs(n_samples=1000, n_features=2, centers=5, cluster_std=[0.5, 0.5, 0.5, 1, 1],random_state=111)

In [171]:
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], s=2)
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

<IPython.core.display.Javascript object>

In [172]:
from sklearn.cluster import KMeans


k=5
kmeans = KMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)

Let's take a look at cluster centers

In [173]:
all_centers=kmeans.cluster_centers_
print(all_centers)


[[-4.05351568 -6.90941348]
 [-1.34304201  5.43215273]
 [-5.16949597 -3.17397257]
 [ 2.19461578 -6.62978549]
 [-9.52314013 -1.56755724]]


In [174]:
## Visualize data points and cluster centers
def plot_cluster_centers(X,all_centers,k,label):
    plt.figure(figsize = (8, 8))
    plt.scatter(X[:, 0], X[:, 1], s=3,label='data points')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.title(label)
    plt.scatter(all_centers[:, 0], all_centers[:, 1], color = 'red', s = 200, label = 'centroids')
    for i in range(k):
        plt.annotate('cluster '+str(i),(all_centers[i,0],all_centers[i,1]),fontsize=15)
    plt.show()

In [175]:
plot_cluster_centers(X,all_centers,k,'An optimal clustering')

<IPython.core.display.Javascript object>

Now that we have a trained clustering model, let's try to predict the cluster(class) of a new data point.

In [176]:
X_new = np.array([[0, 5], [0, 0], [-3, 3], [-3, 2.5]])
kmeans.predict(X_new)

array([1, 1, 1, 1])

### Let us plot the decision boundaries for all the clusters obtained.

In [133]:
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),np.arange(y_min, y_max, 0.1))

Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(8, 8))
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=20, edgecolor='k')
plt.show()

<IPython.core.display.Javascript object>

### Hard Clustering : Assigning each instance to single cluster.

### Soft Clustering : A data point can belong to multiple clusters. Can you think of use cases where soft clustering would be more helpful than hard clustering? (Hint: think of information retrieval on web)

In [134]:
kmeans.transform(X_new) ## measures distance between the data points and clusters. In case of hard clustering, nearest one is chosen.

array([[11.5681894 ,  9.67147952,  1.41085712, 11.83504326, 12.58034653],
       [ 9.65129181,  6.06611824,  5.59571667,  6.98358032,  8.01067937],
       [ 7.96328678,  6.54405456,  2.94293675, 10.94151734,  9.96525819],
       [ 7.68741693,  6.07459278,  3.36794142, 10.50414281,  9.46820771]])

## Do you think centroid initialization can affect the quality of clusters ?

In [177]:
bad_init = np.array([[-4,-2], [-4,-2.1], [-4.1,-2], [-4,-2.1], [-4.05,-2]])
kmeans_bad_init = KMeans(n_clusters=5, init=bad_init, n_init=1)

kmeans_bad_init.fit(X)


plot_cluster_centers(X,kmeans_bad_init.cluster_centers_,k,'Bad Initialization of centroids')

<IPython.core.display.Javascript object>

### How do we decide if a clustering good or bad without visualizing the locations ?
#### Answer : Inertia -> Mean squared distance between each instance and its closest centroid. We look for clustering with lower inertia.

In [178]:
print('Good Initialization inertia score : ',kmeans.inertia_)
print('Bad Initialization inertia score : ',kmeans_bad_init.inertia_)

Good Initialization inertia score :  1066.710561741862
Bad Initialization inertia score :  2482.3524752049966


### What do you think was bad about above initialization ? (Hint: think of mean of data distribution)

### Good news: KMeans by default uses K-Means++ , a smart initialization technique that tends to select centroids that are distant from one another.

### Do you think number of clusters can affect the optimality of clustering?

#### Let's try with k=3 and k=8

In [179]:
kmeans_3=KMeans(n_clusters=3)
kmeans_3.fit(X)


plot_cluster_centers(X,kmeans_3.cluster_centers_,3,'k is too small')

<IPython.core.display.Javascript object>

In [180]:
kmeans_8=KMeans(n_clusters=8)
kmeans_8.fit(X)


plot_cluster_centers(X,kmeans_8.cluster_centers_,8,'k is too large')

<IPython.core.display.Javascript object>

### Can we decide correct number of clusters by comparing inertia ? Think about it!

#### Let's plot inertia as a function of number of clusters

In [182]:
candidate_clusters=[2,3,4,5,6,7,8]
inertia_scores=[]
for cluster in candidate_clusters:
    kmeans=KMeans(cluster)
    kmeans.fit(X)
    inertia_scores.append(kmeans.inertia_)

plt.figure(figsize=(6, 6))
plt.plot(candidate_clusters,inertia_scores,marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

<IPython.core.display.Javascript object>

### The above graph looks like an arm where the elbow looks like a good choice. Any lower values would be dramatic and higher values wouldn't help too much. This is called elbow method.

### Task: Explore Silhouette method for choosing number of clusters.

### Let's use KMeans to perform clustering on a real world dataset.

In [183]:
from sklearn.datasets import load_wine

wine_data=load_wine()

X = pd.DataFrame(wine_data.data, columns = wine_data.feature_names)
y = pd.DataFrame(wine_data.target, columns = ['target'])

In [184]:
## Take a look at data and target
X.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [185]:
y.tail()

Unnamed: 0,target
173,2
174,2
175,2
176,2
177,2


In [186]:
print('Features are : ',X.columns)
print('Classes are : ',wine_data.target_names)

Features are :  Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'],
      dtype='object')
Classes are :  ['class_0' 'class_1' 'class_2']


In [187]:
## Visualizing 2 features - Total Phenol and Alcohol

phenol_alcohol_data=X[['total_phenols','alcohol']]
phenol_alcohol_data['target']=y['target']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenol_alcohol_data['target']=y['target']


In [189]:
plt.figure(figsize = (6, 6))

plt.scatter(phenol_alcohol_data['total_phenols'],phenol_alcohol_data['alcohol'],c=phenol_alcohol_data['target'])

plt.xlabel('Total phenol')
plt.ylabel('Alcohol');

<IPython.core.display.Javascript object>

In [190]:
## Visualizing 13 features is difficult. Let's visualize any 3 of them.

al_malic_ash_data=X[['alcohol','malic_acid','ash']]
al_malic_ash_data['target']=y['target']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  al_malic_ash_data['target']=y['target']


In [191]:


fig = plt.figure()
ax = Axes3D(fig)
labels=['class 0','class 1','class 2']
ax.scatter(al_malic_ash_data['alcohol'],al_malic_ash_data['malic_acid'],al_malic_ash_data['ash'],c=al_malic_ash_data['target'])
ax.set_xlabel('Alcohol')
ax.set_ylabel('Malic acid')
ax.set_zlabel('Ash')

plt.show()

<IPython.core.display.Javascript object>

  ax = Axes3D(fig)


### Clustering dataset using 2 features - Total Phenol and Alcohol (Visualized above)

In [193]:
from sklearn.preprocessing import StandardScaler

phenol_alcohol_data_wo_target=phenol_alcohol_data.drop('target',axis='columns',inplace=False)
std_scaler_phenol_alcohol=StandardScaler()
kmeans_phenol_alcohol=KMeans(n_clusters=3)
phenol_alcohol_data_transformed=std_scaler_phenol_alcohol.fit_transform(phenol_alcohol_data_wo_target)
kmeans_phenol_alcohol.fit(phenol_alcohol_data_transformed)



all_centers=kmeans_phenol_alcohol.cluster_centers_
plt.figure(figsize = (8, 8))
plt.scatter(phenol_alcohol_data_transformed[:,0], phenol_alcohol_data_transformed[:,1],c=phenol_alcohol_data['target'], s=8,label='data points')
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Wine clusters')
plt.scatter(all_centers[:, 0], all_centers[:, 1], color = 'red', s = 200, label = 'centroids')
for i in range(3):
    plt.annotate('cluster '+str(i),(all_centers[i,0],all_centers[i,1]),fontsize=15)
plt.show()


<IPython.core.display.Javascript object>

### Predict cluster for a test instance

In [194]:
test_data=[[19,15],[20,1]] 
kmeans_phenol_alcohol.predict(std_scaler_phenol_alcohol.transform(test_data))

array([1, 2])

### Clustering dataset using 3 features - Alcohol, Malic acid and Ash (Visualized above)

In [195]:
al_malic_ash_data_wo_target=al_malic_ash_data.drop('target',axis='columns',inplace=False)
std_scaler_al_malic_ash=StandardScaler()
kmeans_al_malic_ash=KMeans(n_clusters=3)
al_malic_ash_data_transformed=std_scaler_al_malic_ash.fit_transform(al_malic_ash_data_wo_target)
kmeans_al_malic_ash.fit(al_malic_ash_data_transformed)

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(al_malic_ash_data_transformed[:,0], al_malic_ash_data_transformed[:,1],al_malic_ash_data_transformed[:,2],
           c=al_malic_ash_data['target'], s=8,label='data points')
ax.set_xlabel('Alcohol')
ax.set_ylabel('Malic acid')
ax.set_zlabel('Ash')

all_centers=kmeans_al_malic_ash.cluster_centers_

ax.scatter(all_centers[:, 0], all_centers[:, 1],all_centers[:, 2], color = 'red', s = 200, label = 'centroids')
for i in range(3):
    ax.text(all_centers[i,0],all_centers[i,1],all_centers[i,2],  'Cluster '+str(i), size=10, zorder=1,  
    color='k')
plt.show()

<IPython.core.display.Javascript object>

  ax = Axes3D(fig)


### Task : Perform clustering using all the features of dataset and find correctness using predict().