In [None]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Global plot styling for the whole notebook.
# The rcParams overrides are kept after `sns.set_style`, matching the original
# order, so the style call cannot overwrite them.
sns.set_style("darkgrid")
plt.rcParams.update(
    {
        "font.size": 8,
        "figure.figsize": (4, 3),
        # 8-digit hex colour with a zero alpha channel: transparent figure background.
        "figure.facecolor": "#00000000",
    }
)

## Clustering

In [None]:
# Load the iris dataset through seaborn and preview its first five rows.
iris_df = sns.load_dataset(name="iris")
iris_df.head(n=5)

In [None]:
# Reference view: the two plotted features coloured by the true species label.
# `data` is passed by keyword because positional `data` is only accepted by
# seaborn >= 0.12; the keyword form works on older releases too.
sns.scatterplot(data=iris_df, x="sepal_length", y="petal_length", hue="species")
# The trailing semicolon suppresses the text repr of the returned object.
plt.title("Iris: sepal length vs. petal length by species");

In [None]:
# Numeric feature columns used as model input. Their order matters: the K-Means
# plotting cell indexes `cluster_centers_` by column position.
num_cols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]
# Feature matrix: every row of the dataset, restricted to the numeric columns.
x = iris_df.loc[:, num_cols]

## K-Means Clustering

In [None]:
# Fit K-Means with three clusters on the numeric iris features.
# `n_init` is pinned explicitly: scikit-learn changed its default (10 -> "auto"),
# so leaving it unset gives version-dependent results and a FutureWarning on
# some releases. `random_state` makes the centroid initialisation repeatable.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
model.fit(x)

In [None]:
# Centroid coordinates of the fitted K-Means model: one row per cluster and one
# column per feature the model was fitted on (the columns of `x`, in order).
model.cluster_centers_

In [None]:
# Assign every sample to its nearest centroid and plot the result.
preds = model.predict(x)

# `data` is passed by keyword: positional `data` is only accepted by seaborn >= 0.12.
sns.scatterplot(data=x, x="sepal_length", y="petal_length", hue=preds)
# Look the centroid columns up by feature name instead of hard-coding 0 and 2,
# so the markers stay correct if the feature order in `num_cols` changes.
centers_x = model.cluster_centers_[:, num_cols.index("sepal_length")]
centers_y = model.cluster_centers_[:, num_cols.index("petal_length")]
# "xb" draws the centroids as blue crosses with no connecting line.
plt.plot(centers_x, centers_y, "xb")
plt.title(f"K-Means clusters (k={model.n_clusters})");

In [None]:
# Inertia: sum of squared distances of samples to their closest cluster center.
# Lower means tighter clusters; compare it across values of k (see the elbow
# plot further down) rather than reading a single value in isolation.
model.inertia_

In [None]:
# Same data, deliberately more clusters (k=6), for comparison with the k=3 fit.
# Note: this rebinds the notebook-level `model` and `preds`.
# `n_init` is pinned because its scikit-learn default changed (10 -> "auto"),
# which would otherwise make the result depend on the installed version.
model = KMeans(n_clusters=6, n_init=10, random_state=42).fit(x)
preds = model.predict(x)
# `data` is passed by keyword: positional `data` is only accepted by seaborn >= 0.12.
sns.scatterplot(data=x, x="sepal_length", y="petal_length", hue=preds)
plt.title(f"K-Means clusters (k={model.n_clusters})");

In [None]:
# Elbow method: fit K-Means for several values of k and plot the inertia of
# each fit, to see where adding clusters stops paying off.
options = range(2,6)  # candidate cluster counts: 2, 3, 4, 5
inercias = []  # inertia of each fit, in the same order as `options`

for n_clusters in options:
    # `n_clusters` is passed by keyword for clarity. `n_init` is pinned because
    # its scikit-learn default changed (10 -> "auto"); fixing it keeps the curve
    # identical across library versions.
    # Note: this rebinds the notebook-level `model`; after the loop it holds
    # the fit for the last value in `options`.
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(x)
    inercias.append(model.inertia_)

plt.title("N of clusters vs Inertia")
plt.plot(options, inercias, "-o")
plt.xlabel("N of cluster (K)")
plt.ylabel("Inertia");

## DBSCAN (Density-Based Spatial Clustering of Applications with Noise): two parameters, `eps` (epsilon) and `min_samples`

In [None]:
# Density-based clustering on the same features. This rebinds the notebook-level
# `model` to the fitted DBSCAN estimator (`fit` returns the estimator itself).
model = DBSCAN(min_samples=4, eps=1.1).fit(x)
# Cluster label assigned to each sample; scikit-learn labels noise points as -1.
model.labels_

In [None]:
# Plot the DBSCAN assignment on the same two features used for the K-Means plots.
# `data` is passed by keyword: positional `data` is only accepted by seaborn >= 0.12.
sns.scatterplot(data=x, x="sepal_length", y="petal_length", hue=model.labels_)
# The title reads the parameters from the fitted estimator so it cannot go stale.
plt.title(f"DBSCAN clusters (eps={model.eps}, min_samples={model.min_samples})");

![DBSCAN vs. K-Means clustering comparison](./dataset/DBSCAN_vs_KMean.png "DBSCAN vs. K-Means")

# Dimensionality Reduction

In [None]:
# PCA: project the four numeric features onto the first two principal components.
pca = PCA(n_components=2)
pca.fit(iris_df[num_cols])
transformed = pca.transform(iris_df[num_cols])
sns.scatterplot(x=transformed[:,0], y=transformed[:,1], hue=iris_df["species"])
# The coordinates are plain arrays, so seaborn has no column names to use as
# axis labels; set them explicitly so the figure is readable on its own.
plt.title("PCA projection of the iris features")
plt.xlabel("PC1")
plt.ylabel("PC2");

In [None]:
# t-SNE (t-Distributed Stochastic Neighbor Embedding)
# t-SNE is stochastic: without a fixed seed the embedding changes on every run.
# `random_state=42` matches the seed used for the K-Means cells, so
# "Restart & Run All" reproduces the same figure.
tsne = TSNE(n_components=2, random_state=42)
transformed = tsne.fit_transform(iris_df[num_cols])
sns.scatterplot(x=transformed[:,0], y=transformed[:,1], hue=iris_df["species"])
# The coordinates are plain arrays, so the axes need explicit labels.
plt.title("t-SNE embedding of the iris features")
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2");