In [None]:
import warnings
from collections import Counter

import gdown
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from matplotlib import style
from matplotlib.patches import Ellipse
from PIL import Image, ImageOps
from sklearn import datasets
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, scale

style.use('ggplot') or plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive inside the runtime at /content/drive.
# NOTE(review): `google.colab` is presumably only importable on a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# GMM (Librería)

In [None]:
# Gaussian mixture with 7 full-covariance components (fixed seed),
# fitted on `pca_df`.
modelo_gmm = GaussianMixture(
    random_state=123,
    covariance_type='full',
    n_components=7,
)
modelo_gmm.fit(pca_df)

In [None]:
# Mean vector of each mixture component.
# NOTE(review): this bare expression is not the last statement of its cell,
# so the notebook will presumably not render it — wrap it in display() or
# move it to its own cell if the values are meant to be shown.
modelo_gmm.means_

# Ellipses
def make_ellipses(gmm, ax):
    """Draw 1x and 2x covariance ellipses for every component of a mixture.

    Ellipses are drawn in the plane of the first two feature dimensions, so
    models fitted on more than two features are projected onto the same two
    axes a 2-D scatter of the data would use.

    Parameters
    ----------
    gmm : object
        Fitted mixture exposing ``n_components``, ``covariance_type``,
        ``means_`` (2-D array) and ``covariances_``.
    ax : matplotlib.axes.Axes
        Axes that receives the ellipse patches.

    Raises
    ------
    ValueError
        If ``gmm.covariance_type`` is not one of ``'full'``, ``'tied'``,
        ``'diag'`` or ``'spherical'``.
    """
    for n in range(gmm.n_components):
        # Build the 2x2 covariance of the two plotted dimensions.
        if gmm.covariance_type == 'full':
            covariances = gmm.covariances_[n][:2, :2]
        elif gmm.covariance_type == 'tied':
            covariances = gmm.covariances_[:2, :2]
        elif gmm.covariance_type == 'diag':
            covariances = np.diag(gmm.covariances_[n][:2])
        elif gmm.covariance_type == 'spherical':
            covariances = (np.eye(gmm.means_.shape[1]) * gmm.covariances_[n])[:2, :2]
        else:
            raise ValueError(
                f"Unsupported covariance_type: {gmm.covariance_type!r}"
            )

        v, w = np.linalg.eigh(covariances)
        # eigh returns eigenvectors as COLUMNS: w[:, 0] belongs to v[0], which
        # is used as the ellipse width, so the rotation must follow that column
        # (taking the row w[0] mirrors the ellipse whenever w is a rotation).
        u = w[:, 0] / np.linalg.norm(w[:, 0])
        angle = np.degrees(np.arctan2(u[1], u[0]))
        v = 2. * np.sqrt(2.) * np.sqrt(v)

        for i in range(1, 3):
            # `angle` is keyword-only in recent matplotlib releases.
            ell = mpl.patches.Ellipse(gmm.means_[n, :2], i * v[0], i * v[1],
                                      angle=180 + angle, color="blue")
            ell.set_clip_box(ax.bbox)
            ell.set_alpha(0.1)
            ax.add_artist(ell)
        

fig, axs = plt.subplots(1, 1, figsize=(10, 6))

# Scatter of the first two columns of `pca_df`, one series per label found in
# `clasificacion`, each drawn in its own colour from the active style cycle.
cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
for i in np.unique(clasificacion):
    in_component = clasificacion == i
    axs.scatter(
        x=pca_df.iloc[in_component, 0],
        y=pca_df.iloc[in_component, 1],
        # A single colour goes through `color`; `c` + `cmap` is meant for
        # per-point values and the colormap would be ignored here.
        # The modulo keeps the lookup valid if there are more labels than
        # colours in the cycle.
        color=cycle_colors[i % len(cycle_colors)],
        marker='o',
        edgecolor='white',
        label=f"Componente {i}"
    )

# Overlay the covariance ellipses of every mixture component.
make_ellipses(modelo_gmm, ax=axs)
axs.set_title('Distribución de probabilidad de cada componente')
axs.legend()

In [None]:
fig, axs = plt.subplots(1, 1, figsize=(10, 6))

# Density of the whole mixture, evaluated on a 1000 x 1000 grid spanning the
# range of the first two columns of `pca_df`.
xs = np.linspace(pca_df.iloc[:, 0].min(), pca_df.iloc[:, 0].max(), 1000)
ys = np.linspace(pca_df.iloc[:, 1].min(), pca_df.iloc[:, 1].max(), 1000)
xx, yy = np.meshgrid(xs, ys)
scores = modelo_gmm.score_samples(np.c_[xx.ravel(), yy.ravel()])
# `c` already carries explicit RGBA colours, so no colormap is passed
# (it would be ignored).
axs.scatter(pca_df.iloc[:, 0], pca_df.iloc[:, 1], s=5, alpha=.6,
            c=plt.cm.tab10(clasificacion))
scores = np.exp(scores)  # score_samples returns values in log scale
# contour() requires strictly increasing levels; np.unique drops repeated
# percentiles (e.g. when many grid densities underflow to the same value).
contour_levels = np.unique(
    np.percentile(scores, np.linspace(0, 100, 10))[1:-1]
)
axs.contour(
    xx, yy, scores.reshape(xx.shape),
    levels=contour_levels
)
axs.set_title('Distribución de probabilidad del modelo completo')

# K-means (Librería)

In [None]:
# K-means with 7 clusters fitted on `y`.
# The seed is fixed so the centroid initialisation, and therefore the cluster
# assignment, is reproducible across kernel restarts.
model_kmeans = KMeans(n_clusters=7, random_state=123)
model_kmeans.fit(y)
# Cluster index assigned to every row of `y`.
y_kmeans = model_kmeans.predict(y)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# One scatter series per cluster, restricted to that cluster's members, so
# every legend entry matches the colour of the points it describes (plotting
# all points on every iteration would draw the data once per cluster).
cluster_ids = np.unique(y_kmeans)
palette = plt.cm.viridis(np.linspace(0, 1, len(cluster_ids)))
for cluster_id, rgba in zip(cluster_ids, palette):
    members = y_kmeans == cluster_id
    ax.scatter(
        x=y[members, 0],
        y=y[members, 1],
        color=[rgba],
        s=50,
        label=f"Cluster {cluster_id}"
    )

# Cluster centroids drawn on top as large translucent black markers.
centers = model_kmeans.cluster_centers_

ax.scatter(
    x=centers[:, 0],
    y=centers[:, 1],
    c='black',
    s=200,
    alpha=0.5
)
ax.set_title('Cluster generados por kmeans')
ax.legend()

# DBSCAN (Librería)

In [None]:
# Neighbourhood size handed to DBSCAN as `min_samples` further down.
# NOTE(review): written as 2*2, presumably the "2 x number of dimensions"
# rule of thumb for 2-D data — confirm against the shape displayed below.
min_points = 2*2
# Shape of the data being clustered.
pca_df.shape

In [None]:
# k-distance plot used to choose DBSCAN's eps.
# kneighbors() is queried with the training data itself, so column 0 is each
# point's zero distance to itself; min_points + 1 neighbours are requested and
# column `min_points` holds the distance to the min_points-th other point.
nbrs = NearestNeighbors(n_neighbors=min_points + 1).fit(pca_df)
neigh_dist, neigh_ind = nbrs.kneighbors(pca_df)
# Sort every column independently so the chosen column is in ascending order.
sort_neigh_dist = np.sort(neigh_dist, axis=0)
k_dist = sort_neigh_dist[:, min_points]
plt.plot(k_dist)
# NOTE(review): the reference line marks a distance of 30, while the DBSCAN
# cell that follows uses eps=6 — confirm which value is intended.
plt.axhline(y=30, linewidth=1, linestyle='dashed', color='k')
plt.ylabel("KNN Distance")
plt.xlabel("Sorted neighbors distance")
plt.title("Distancia Maxima Optima - Epsilon Optimo")
plt.show()

In [None]:
# DBSCAN on `pca_df` with Euclidean distance.
modelo_dbscan2 = DBSCAN(eps=6, min_samples=min_points, metric='euclidean')
# fit_predict returns the label of every sample; -1 marks noise.
labels2 = modelo_dbscan2.fit_predict(pca_df)
# Count distinct labels, excluding the noise label of THIS model (the check
# must look at `labels2`, not at a label array from another run).
n_clusters2 = len(set(labels2)) - (1 if -1 in labels2 else 0)
print(f'Número de clusters encontrados: {n_clusters2}')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# All samples, coloured by their DBSCAN label through the viridis colormap.
ax.scatter(
    x=pca_df.iloc[:, 0],
    y=pca_df.iloc[:, 1],
    c=labels2,
    marker='o',
    edgecolor='black',
    cmap='viridis'
)

# Outliers carry the label -1; redraw them in white on top of the first layer.
# A constant colour goes through `color` (a colormap would be ignored here).
is_noise = labels2 == -1
ax.scatter(
    x=pca_df.iloc[is_noise, 0],
    y=pca_df.iloc[is_noise, 1],
    color='white',
    marker='o',
    edgecolor='black',
    label='noise'
)

ax.legend()
ax.set_title('Clusterings para los 7 tejidos generados por DBSCAN')