## Clustering

#### Clustering Using K-Means

In [718]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load data
# `load_iris` is the name imported at the top of this cell; `datasets` is never
# imported, so `datasets.load_iris()` would raise NameError on a fresh kernel.
iris = load_iris()
features = iris.data

# Standardize features so every column contributes equally to the
# euclidean distances k-means relies on
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Create the k-means model
# - n_init is pinned explicitly because its default differs between
#   scikit-learn releases; 10 keeps results stable across versions.
# - n_jobs is intentionally not passed: KMeans no longer accepts it in
#   current scikit-learn releases (threading is handled internally).
cluster = KMeans(n_clusters=3, random_state=0, n_init=10)

# Train on the standardized features (not the raw ones), otherwise the
# scaling step above has no effect on the model
cluster.fit(features_std)

# NOTE: within a single notebook cell only the final expression is rendered;
# move the expressions below into separate cells (or wrap them in display())
# to see each result.

# Cluster assigned to each training observation
cluster.labels_

# True class of each observation, for comparison
iris.target

# Predict the cluster of a new observation
# NOTE(review): the values are passed as-is, i.e. treated as already being on
# the standardized scale; apply scaler.transform(...) first for raw measurements.
new_observation = [[0.8, 0.8, 0.8, 0.8]]
cluster.predict(new_observation)

# View cluster centers (expressed in standardized feature space)
cluster.cluster_centers_

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

#### Speeding Up K-Means Clustering

In [714]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

# Load data
# `load_iris` is the name imported at the top of this cell; `datasets` is never
# imported, so `datasets.load_iris()` would raise NameError on a fresh kernel.
iris = load_iris()
features = iris.data

# Standardize features
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Create the mini-batch k-means model
# - batch_size: number of observations randomly sampled for each batch
# - n_init is pinned explicitly because its default differs between
#   scikit-learn releases; 3 keeps results stable across versions.
cluster = MiniBatchKMeans(n_clusters=3, random_state=0,
                          batch_size=100, n_init=3)

# Train model on the standardized features
cluster.fit(features_std)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=3, n_init=3, random_state=0, reassignment_ratio=0.01,
                tol=0.0, verbose=0)

#### Clustering Using Mean Shift: no need to set the number of clusters beforehand

In [715]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MeanShift

# Load data
# `load_iris` is the name imported at the top of this cell; `datasets` is never
# imported, so `datasets.load_iris()` would raise NameError on a fresh kernel.
iris = load_iris()
features = iris.data

# Standardize features
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Create mean shift object; n_jobs=-1 uses all available cores
cluster = MeanShift(n_jobs=-1)

# Train model
model = cluster.fit(features_std)

# Parameters worth knowing:
# - bandwidth: radius of the kernel (the area each observation "looks at");
#   estimated automatically when not supplied
# - cluster_all=False: observations not covered by any kernel are left
#   unclustered and labelled -1 instead of being forced into the nearest cluster

#### Clustering Using DBSCAN

In [716]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Fetch the iris dataset and keep its feature matrix
iris = load_iris()
features = iris.data

# Rescale every feature column before clustering
features_std = StandardScaler().fit(features).transform(features)

# Build the DBSCAN model and fit it in one step (fit returns the estimator)
# - eps: maximum distance between two observations for them to count as neighbors
# - min_samples: minimum number of observations within eps of a point for that
#   point to be treated as a core observation
dbscan = DBSCAN(metric='euclidean', n_jobs=-1).fit(features_std)

# Cluster membership per observation; -1 marks outliers (noise)
dbscan.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
        1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1, -1,  1,  1, -1, -1,
       -1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1],
      dtype=int64)

#### Clustering Using Hierarchical Merging

In [717]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Load data
# `load_iris` is the name imported at the top of this cell; `datasets` is never
# imported, so `datasets.load_iris()` would raise NameError on a fresh kernel.
iris = load_iris()
features = iris.data

# Standardize features
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Create agglomerative clustering object
# Ward linkage only works with euclidean distances, so combining it with
# 'minkowski' raises ValueError. Euclidean is the default distance, and the
# keyword for choosing it was renamed (affinity -> metric) between
# scikit-learn releases, so the argument is simply omitted here.
cluster = AgglomerativeClustering(n_clusters=3,
                                  linkage='ward')

# Train model
model = cluster.fit(features_std)

# All observations start as their own cluster and are merged together
# step by step according to the chosen criteria:
# linkage (what is minimized when merging two clusters):
#   - ward: variance of the merged clusters
#   - average: average distance between observations of pairs of clusters
#   - complete: maximum distance between observations of pairs of clusters
# distance metric (must be euclidean for ward; other linkages also accept
#   e.g. manhattan or minkowski)