# Clustering

In [1]:
# packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram

# helper modules
# Execute helper_functions.py in this notebook's global namespace so that the
# names it defines are available to the cells below. The file is opened in a
# `with` block so the handle is closed as soon as its source has been read.
# NOTE(review): exec() runs whatever the file contains -- only use it with a
# trusted, local helper file.
with open('helper_functions.py') as helper_file:
    exec(helper_file.read())

# load our previously processed well data
unscaled_well_data = pd.read_csv('../data/unscaled_well_data.csv')
scaled_well_data = pd.read_csv('../data/scaled_well_data.csv')

## K-Means Clustering
We will use the scaled well completion dataset from here on.
Say we want to cluster a section of wells into three type curves instead of one.
We can do this with k-means.

In [None]:
# Fit k-means with two clusters on the scaled well data and plot the result.
kmeans_2 = KMeans(n_clusters=2, random_state=0).fit(scaled_well_data)
plot_cluster_results(kmeans_2, scaled_well_data)
# within cluster sum of squared error (i.e. withinness)
# Printed explicitly: a notebook cell only echoes its final expression, so a
# bare attribute access in the middle of the cell would never be displayed.
print(kmeans_2.inertia_)
# Elbow plot on the same data the model was fitted on.
# NOTE(review): presumably plots inertia against the number of clusters --
# confirm against kmeans_elbowplot in helper_functions.py.
kmeans_elbowplot(scaled_well_data)

In [None]:
# Fit k-means with three clusters on the scaled well data and plot the result.
# The model is named after its cluster count (n_clusters=3) so that the
# variable fitted here is the same one passed to the plotting helper.
kmeans_3 = KMeans(n_clusters=3, random_state=0).fit(scaled_well_data)
plot_cluster_results(kmeans_3, scaled_well_data)
# within cluster sum of squared error (i.e. withinness)
# Printed explicitly: a notebook cell only echoes its final expression, so a
# bare attribute access in the middle of the cell would never be displayed.
print(kmeans_3.inertia_)
# Elbow plot on the same data the model was fitted on.
# NOTE(review): presumably plots inertia against the number of clusters --
# confirm against kmeans_elbowplot in helper_functions.py.
kmeans_elbowplot(scaled_well_data)

In [None]:
# Three-cluster k-means again, this time spelling out the initialisation
# settings: k-means++ seeding, at most 300 iterations per run, and 10
# differently-seeded runs (n_init).
kmeans_3plus = KMeans(n_clusters=3, init='k-means++', max_iter=300,
                      n_init=10, random_state=0).fit(scaled_well_data)
plot_cluster_results(kmeans_3plus, scaled_well_data)
# within cluster sum of squared error (i.e. withinness)
# Printed explicitly: a notebook cell only echoes its final expression, so a
# bare attribute access in the middle of the cell would never be displayed.
print(kmeans_3plus.inertia_)
# Elbow plot on the same data the model was fitted on.
# NOTE(review): presumably plots inertia against the number of clusters --
# confirm against kmeans_elbowplot in helper_functions.py.
kmeans_elbowplot(scaled_well_data)

Now we look at another type of clustering to see how the sklearn framework is relatively consistent and convenient for doing unsupervised learning.

In [None]:
# TODO: Put a good dbscan example in
# Density-based clustering, fitted on the same scaled well data that is passed
# to the plotting helper so the fitted model and the plotted points line up.
# NOTE(review): eps=20 and min_samples=10 are placeholder values; eps looks
# large if the features are standardised -- tune before relying on the result.
db = DBSCAN(eps=20, min_samples=10).fit(scaled_well_data)
plot_cluster_results(db, scaled_well_data)


## HIERARCHICAL CLUSTERING

In [None]:
# "ward" minimizes the variance of the clusters being merged.
# Fit a three-cluster agglomerative model on the scaled well data, then draw
# the dendrogram for the same data and the same linkage method.
# NOTE(review): plot_dendrogram comes from helper_functions.py; it is assumed
# to take the data followed by the linkage method name -- confirm.
ward_hclust = AgglomerativeClustering(n_clusters=3, linkage='ward').fit(scaled_well_data)
plot_dendrogram(scaled_well_data, method='ward')

In [None]:
# "average" uses the average of the distances of each observation of the two sets.
# Fit a three-cluster agglomerative model on the scaled well data, then draw
# the dendrogram for the same data and the same linkage method.
# NOTE(review): plot_dendrogram comes from helper_functions.py; it is assumed
# to take the data followed by the linkage method name -- confirm.
avg_hclust = AgglomerativeClustering(n_clusters=3, linkage='average').fit(scaled_well_data)
plot_dendrogram(scaled_well_data, method='average')

In [None]:
# "complete" or maximum linkage uses the maximum distances between all observations.
# Fit a three-cluster agglomerative model on the scaled well data, then draw
# the dendrogram for the same data and the same linkage method.
# NOTE(review): plot_dendrogram comes from helper_functions.py; it is assumed
# to take the data followed by the linkage method name -- confirm.
comp_hclust = AgglomerativeClustering(n_clusters=3, linkage='complete').fit(scaled_well_data)
plot_dendrogram(scaled_well_data, method='complete')