# Scikit-learn Unsupervised learning

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs, load_iris

## Clustering

In [None]:
# Synthetic toy data: 50 two-feature samples around 4 blob centres.
# random_state is fixed, so the same points are generated on every run.
data_x, data_y = make_blobs(
    n_samples=50,
    n_features=2,
    centers=4,
    random_state=99,
)

In [None]:
# Peek at the first six samples (all feature columns).
data_x[:6]

In [None]:
# Generated labels of the first six samples.
data_y[0:6]

In [None]:
# Ground truth: colour each point by the label returned by make_blobs.
# Title and axis labels let the figure stand alone; plt.show() keeps the
# PathCollection repr out of the cell output.
plt.scatter(data_x[:, 0], data_x[:, 1], c=data_y)
plt.title("Generated blobs (true labels)")
plt.xlabel("feature 0")
plt.ylabel("feature 1")
plt.show()

---

### KMeans

In [None]:
# Fit k-means with k = 4.
# random_state pins the random centroid initialisation so the labels are the
# same on every Restart & Run All; n_init is spelled out because its default
# changed across scikit-learn releases (10 restarts was the historical default).
kmeans_fit = KMeans(n_clusters=4, n_init=10, random_state=99).fit(data_x)

cluster_labels = kmeans_fit.labels_

# First six assigned cluster ids. The ids are arbitrary, so they need not
# coincide numerically with the generated labels in data_y.
cluster_labels[:6]

In [None]:
# Colour each point by the cluster id currently held in cluster_labels.
# Title and axis labels make the figure distinguishable from the other scatter plots.
plt.scatter(data_x[:, 0], data_x[:, 1], c=cluster_labels)
plt.title("KMeans cluster assignments")
plt.xlabel("feature 0")
plt.ylabel("feature 1")
plt.show()

## Practice

Try different numbers of clusters (`n_clusters`) and see how the result changes.

### Advanced - how to decide the number of clusters

In [None]:
# 4-cluster k-means fit, seeded (random_state) and with an explicit number of
# restarts (n_init) so the result is reproducible across runs and
# scikit-learn versions.
kmeans_fit = KMeans(n_clusters=4, n_init=10, random_state=99).fit(data_x)

In [None]:
# Extension: how to decide the number of clusters.
# Fit k-means for several candidate k and compare the mean silhouette score
# (between -1 and 1; higher means tighter, better-separated clusters).

from sklearn.metrics import silhouette_score

ks = range(2, 6)  # candidate cluster counts: 2, 3, 4, 5

# One seeded fit per candidate k. The fits live only inside the comprehension,
# so this cell no longer overwrites the kmeans_fit / cluster_labels variables
# that other cells read. float() turns the numpy scalar into a plain float so
# the printed list stays readable.
silhouette_avgs = [
    float(
        silhouette_score(
            data_x,
            KMeans(n_clusters=k, n_init=10, random_state=99).fit_predict(data_x),
        )
    )
    for k in ks
]

plt.bar(list(ks), silhouette_avgs)
plt.xticks(list(ks))  # integer ticks only: k is a count
plt.title("Mean silhouette score per number of clusters")
plt.xlabel("number of clusters (k)")
plt.ylabel("mean silhouette score")
plt.show()
print(silhouette_avgs)

---

### Hierarchical Clustering

In [None]:
# Agglomerative (bottom-up) clustering, cut at 4 clusters.
# The distance metric is left at its default (Euclidean): the old `affinity=`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so passing it
# raises TypeError on current releases, while omitting it behaves the same on
# old and new versions alike.
hclust = AgglomerativeClustering(n_clusters=4)

hclust.fit(data_x)
cluster_labels = hclust.labels_

# First six assigned cluster ids.
cluster_labels[:6]

In [None]:
# Colour each point by the cluster id currently held in cluster_labels.
# Title and axis labels make the figure distinguishable from the other scatter plots.
plt.scatter(data_x[:, 0], data_x[:, 1], c=cluster_labels)
plt.title("Agglomerative clustering assignments")
plt.xlabel("feature 0")
plt.ylabel("feature 1")
plt.show()

In [None]:
# Display the fitted model's children_ attribute.
# NOTE(review): per the scikit-learn docs this lists the pair of nodes merged
# at each step of the hierarchy — confirm against the installed version.
hclust.children_

---

### DBSCAN

In [None]:
# Density-based clustering with a neighbourhood radius of eps = 3.
# No number of clusters is passed: DBSCAN derives it from the data.
dbscan_model = DBSCAN(eps=3)

# Fit and fetch the per-sample labels in one call, then colour the points by them.
# NOTE(review): scikit-learn documents label -1 as "noise" — confirm if any appear.
cluster_labels = dbscan_model.fit_predict(data_x)
plt.scatter(data_x[:, 0], data_x[:, 1], c=cluster_labels)
plt.title("DBSCAN cluster assignments (eps = 3)")
plt.xlabel("feature 0")
plt.ylabel("feature 1")
plt.show()

## Practice

Use different clustering algorithms on the following data.

In [None]:
from sklearn.datasets import make_circles

# Practice data: 200 noisy points on two concentric circles (inner circle at
# half the outer radius). random_state is fixed, like the make_blobs call, so
# everyone works on the same points on every run.
n_data_x, n_data_y = make_circles(
    n_samples=200,
    noise=0.05,
    factor=0.5,
    random_state=99,
)

# Colour each point by the circle it was generated on.
plt.scatter(n_data_x[:, 0], n_data_x[:, 1], c=n_data_y)
plt.title("Two concentric circles (true labels)")
plt.xlabel("feature 0")
plt.ylabel("feature 1")
plt.show()

In [None]:
# your code starts from here


---

## Dimension Reduction

### Principal Component Analysis

In [None]:
# Load the iris data set and rebind data_x / data_y to its feature matrix and
# class labels.

iris = load_iris()
data_x, data_y = iris.data, iris.target

# (rows, columns) of the feature matrix.
data_x.shape

In [None]:
# Fit a PCA with default settings on the feature matrix.
pca = PCA()
pca.fit(X=data_x)

In [None]:
# Inspect the fitted PCA's explained_variance_ratio_
# (presumably one entry per principal component, in component order — confirm).
pca.explained_variance_ratio_

In [None]:
# Bar chart of the explained-variance ratio, with components numbered from 1.
# range(1, n + 1) replaces the hand-rolled [i+1 for i in range(n)] list.
component_ids = list(range(1, len(pca.explained_variance_ratio_) + 1))

plt.bar(component_ids, pca.explained_variance_ratio_)
plt.xticks(component_ids)  # integer ticks only: component numbers are counts
plt.title("Explained variance ratio per principal component")
plt.xlabel("principal component")
plt.ylabel("explained variance ratio")
plt.show()

In [None]:
# Transform the samples with the fitted PCA and plot the first two output
# columns against each other, coloured by the label in data_y.
result = pca.transform(data_x)
plt.scatter(result[:, 0], result[:, 1], c=data_y)
plt.title("Samples projected onto the first two principal components")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()