In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Other Clustering Methods

- [DBSCAN](#1.-DBSCAN)
- [Agglomerative clustering](#2.-Agglomerative-clustering)
- [Gaussian Mixtures](#3.-Gaussian-Mixtures)

## 1. DBSCAN

Density-based spatial clustering of applications with noise (DBSCAN) is a clustering algorithm that works well if all the clusters are dense enough and if they are well separated by low-density regions.

There are two parameters to the algorithm, `min_samples` and `eps`, which define formally what we mean when we say dense.

DBSCAN:
    
- $\epsilon$-neighborhoods: For each data point, DBSCAN counts how many points are located within a distance `eps`.
- Core data points: If a data point has at least `min_samples` points in its $\epsilon$-neighborhood, then it is considered a **core point**.
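- Clusters: all points in the $\epsilon$-neighborhood of a core point belong to the same cluster as that core point. Points that are neither core points nor in the neighborhood of a core point are considered anomalies (label `-1`).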


In [None]:
from sklearn.datasets import make_moons
# Fix the seed so the generated moons (and every DBSCAN result below) are
# identical after Restart Kernel -> Run All.
X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)

# Trailing semicolon hides the PathCollection repr in the cell output.
plt.scatter(X[:, 0], X[:, 1]);

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Fit DBSCAN with a small neighborhood radius; fit() hands back the estimator,
# so construction and fitting can be chained.
dbscan = DBSCAN(min_samples=5, eps=0.05).fit(X)
# Per-sample cluster labels (label = -1 means the point is considered an anomaly).
labels = dbscan.labels_
# Core points found during fitting.
cores = dbscan.components_

In [None]:
plt.figure(figsize=(10, 7))
# One scatter call per label so each group gets its own color and legend entry.
for cluster_id in np.unique(labels):
    in_group = labels == cluster_id
    # label -1 is the anomaly group
    legend_text = 'anomaly' if cluster_id == -1 else 'cluster ' + str(cluster_id)
    plt.scatter(X[in_group, 0], X[in_group, 1], label=legend_text)
plt.legend()

In [None]:
plt.figure(figsize=(10, 7))
# All samples first ...
plt.scatter(X[:, 0], X[:, 1])
# ... then the core points on top, drawn as large red crosses.
core_x, core_y = cores[:, 0], cores[:, 1]
plt.scatter(core_x, core_y, marker='x', s=100, color='red')

In [None]:
# Same density threshold as before, but with a larger neighborhood radius.
dbscan = DBSCAN(eps=0.2, min_samples=5)
# fit() returns the fitted estimator, so the labels can be read off directly
# (label = -1 means the point is considered an anomaly).
labels = dbscan.fit(X).labels_
# Core points of the refitted model.
cores = dbscan.components_

In [None]:
plt.figure(figsize=(10, 7))
# Draw each label group separately; -1 is shown as the anomaly group.
for i in np.unique(labels):
    members = X[labels == i]
    name = f'cluster {i}' if i != -1 else 'anomaly'
    plt.scatter(members[:, 0], members[:, 1], label=name)
plt.legend()

**DBSCAN pros and cons**:

**Pros:**
- Does not require specifying the number of clusters beforehand.
- Performs well with clusters of arbitrary shape.
- Is robust to outliers and is able to detect them.

**Cons:**
- Determining an appropriate neighborhood distance (`eps`) is not easy and often requires domain knowledge.
- It does not generalize well to clusters with very different densities.

## 2. Agglomerative clustering

A hierarchy of clusters is built from the bottom up.
 Each data point is assumed to be a separate cluster at first. Then the similar clusters are iteratively combined.
 There are 4 different methods implemented in scikit-learn to measure the similarity:
- Ward’s linkage: Merges the pair of clusters that yields the smallest increase in total within-cluster variance (the variance around the cluster centroids).
- Average linkage: Average distance between all pairs of data points, one from each of the two clusters.
- Complete (maximum) linkage: Maximum distance between any two data points, one from each of the two clusters.
- Single (minimum) linkage: Minimum distance between any two data points, one from each of the two clusters.

In [None]:
from sklearn.datasets import make_circles
# Seeded so the two concentric rings are identical on every run.
X, y = make_circles(n_samples=1000, noise=0.05, factor=0.5, random_state=42)
# Trailing semicolon hides the PathCollection repr in the cell output.
plt.scatter(X[:, 0], X[:, 1]);

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Number of clusters to keep.
k = 2
# linkage can be 'ward', 'complete', 'average', or 'single'
aggclt = AgglomerativeClustering(linkage='single', n_clusters=k).fit(X)
# cluster labels
labels = aggclt.labels_
# plot clusters: one scatter call per cluster so each gets its own color
for cluster_id in range(k):
    members = X[labels == cluster_id]
    plt.scatter(members[:, 0], members[:, 1])

## Hierarchical Clustering Dendrogram

It’s possible to visualize the tree representing the hierarchical merging of clusters as a dendrogram. Visual inspection can often be useful for understanding the structure of the data, though more so in the case of small sample sizes.

In [None]:
import pandas as pd

In [None]:
# Raw CSV of the iris data, read straight from GitHub.
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/iris.csv'
iris = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows.
iris.head(n=5)

In [None]:
# Feature matrix: every column except the species column.
X = iris.drop(columns='species')

In [None]:
# How many rows there are per species.
iris['species'].value_counts()

In [None]:
# Three clusters, merged with average linkage.
k = 3
aggclt = AgglomerativeClustering(linkage='average', n_clusters=k)

In [None]:
# fit() returns the fitted estimator, so the labels can be read in one step.
labels = aggclt.fit(X).labels_

In [None]:
# Do the clusters line up with the species?
# Rows are cluster labels, columns are species.
pd.crosstab(index=labels, columns=iris['species'])

In [None]:
# The function below plots the dendrogram of a hierarchical clustering fitted
# with AgglomerativeClustering, using the dendrogram function available in scipy.

def plot_dendrogram(model, **kwargs):
    """Plot the dendrogram of a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix (children, merge distance, number of
    samples under each merge) from the model and passes it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``children_``, ``distances_`` and ``labels_``, e.g. an
        ``AgglomerativeClustering`` fitted with ``distance_threshold`` set.
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``
        (for example ``truncate_mode``, ``p`` or ``ax``).

    Raises
    ------
    AttributeError
        If ``model`` has no ``distances_`` attribute.
    """
    from scipy.cluster.hierarchy import dendrogram

    # Fail early with an actionable message instead of a bare attribute error
    # after the counting loop has already run.
    if not hasattr(model, "distances_"):
        raise AttributeError(
            "model has no distances_ attribute; fit AgglomerativeClustering with "
            "distance_threshold set (and n_clusters=None) or with "
            "compute_distances=True before plotting the dendrogram"
        )

    n_samples = len(model.labels_)
    # counts[i] = number of original samples contained in the i-th merge
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # indices >= n_samples refer to an earlier merge
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [None]:
# No fixed cluster count here: n_clusters=None together with distance_threshold=0.
# plot_dendrogram reads distances_ from this fitted model (per the scikit-learn
# docs, that attribute is stored when distance_threshold is used).
aggclt = AgglomerativeClustering(
    distance_threshold=0,
    n_clusters=None,
    linkage='average',
)
aggclt.fit(X)

In [None]:
# Large square canvas for the dendrogram of the full merge tree.
plt.figure(figsize=(12, 12))
plot_dendrogram(model=aggclt)

## 3. Gaussian Mixtures

A Gaussian mixture model assumes that the data points were generated from a mixture of several Gaussian distributions, one per cluster.

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Two blobs, then a fixed 2x2 linear map applied to their coordinates.
X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)
transform = np.array([[0.374, 0.95], [0.732, 0.598]])
X1 = X1 @ transform
# A third, smaller blob, shifted by a constant offset.
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + np.array([6, -8])
# Stack both groups into a single dataset.
X = np.concatenate([X1, X2], axis=0)
# NOTE(review): y2 presumably contains only label 0 (single center), so label 0
# is shared with one of the blobs in y1 - confirm before using y as a blob id.
y = np.concatenate([y1, y2])

plt.scatter(X[:, 0], X[:, 1])

In [None]:
from sklearn.mixture import GaussianMixture
# The fit starts from a random initialization; fix the seed so the fitted
# components are the same on every run.
gm = GaussianMixture(n_components=3, random_state=42)
gm.fit(X)

Now let's plot the resulting decision boundaries (dashed lines) and density contours:

In [None]:
def plot_gaussian_mixture(model, X, resolution=1000):
    """Plot density contours, cluster boundaries and centroids of a 2-D mixture model.

    Draws on the current matplotlib figure: filled and black line contours of
    ``-model.score_samples`` over a grid covering the data, red dashed contours
    of ``model.predict`` over the same grid (the boundaries between predicted
    components), the data points, and ``model.means_`` as red crosses on white
    discs.

    Parameters
    ----------
    model : fitted mixture model
        Must provide ``score_samples``, ``predict`` and ``means_`` (e.g. a
        fitted ``sklearn.mixture.GaussianMixture``).
    X : array-like or DataFrame of shape (n_samples, 2)
        The data to plot. If ``X`` has a ``columns`` attribute, its first two
        entries are used as the axis labels.
    resolution : int, default=1000
        Number of grid points per axis. The model is evaluated on
        ``resolution ** 2`` points, so lower it for a quicker preview.

    Raises
    ------
    ValueError
        If ``X`` is empty or is not two-dimensional with exactly two columns.
    """
    # Grab the column labels before the data is converted to a plain array.
    columns = getattr(X, 'columns', None)
    feature_names = ['feature #1', 'feature #2'] if columns is None else columns

    # Validate up front: the grid below is strictly two-dimensional, so any
    # other shape would only fail later with a confusing error.
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != 2 or X.shape[0] == 0:
        raise ValueError(
            f"X must be a non-empty array of shape (n_samples, 2); got shape {X.shape}"
        )

    from matplotlib.colors import LogNorm

    # create a mesh grid slightly larger than the data's bounding box
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    # the same grid points feed both the density and the prediction plots
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    contour_levels = np.logspace(0, 2, 12)

    # density: negated score_samples, shown on a log color scale
    Z = -model.score_samples(grid_points)
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z,
                 norm=LogNorm(vmin=1.0, vmax=30.0),
                 levels=contour_levels)
    plt.contour(xx, yy, Z,
                norm=LogNorm(vmin=1.0, vmax=30.0),
                levels=contour_levels,
                linewidths=1, colors='k')

    # boundaries between predicted components: contours of the predicted label
    Z = model.predict(grid_points)
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z,
                linewidths=2, colors='r', linestyles='dashed')

    # plot data
    plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)

    # plot centroids: white disc underneath, red cross on top
    centroids = model.means_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='o', s=20, linewidths=8,
                color='w', zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=50, linewidths=2,
                color='r', zorder=11, alpha=1)

    plt.xlabel(feature_names[0], fontsize=15)
    plt.ylabel(feature_names[1], fontsize=15)
   

In [None]:
# Draw the fitted mixture over the data it was trained on.
plot_gaussian_mixture(model=gm, X=X)