# <p style="text-align:center;"> No supervisado </p>

**Todo ejercicio debe tener un análisis fundamentado en la teoría vista en la materia; dicho análisis será parte del informe a entregar en PDF.**

- <span style="color:blue">*Se recomienda hacer uso de las herramientas vistas en los demos de la materia.*</span>

- <span style="color:red">**Usar lo hecho en el práctico Análisis Exploratorio y Curación de Datos**.</span>


**Objetivos:** 
- Implementar modelos de clustering, variando el número de clusters.
- Usar embeddings: PCA, correlación y t-distributed stochastic neighbor embedding (t-SNE).

Implementar dos modelos de clustering, con y sin embeddings; uno de ellos debe ser k-means.

Realizar un análisis de lo obtenido. 
- Es muy recomendable integrar indicadores de mala calidad como por ejemplo "hay un cluster muy grande y el resto son muy chicos", lo cual indica que en el espacio no se distinguen bien grupos separados y hay que usar otro espacio 
- Evaluar con Silhouette y pureza con algunos datos etiquetados.

**NOTA:** Es de suma importancia usar el conocimiento del experto en este práctico. 

**Librerías**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import pandas as pd
 
%matplotlib inline

In [None]:
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (10.0, 8.0)

**Funciones**

In [None]:
def galaxy_morf(row):
    """Return the one-letter morphology code for a catalogue row.

    The result is "E" when ``row["elliptical"]`` is truthy, otherwise "S"
    when ``row["spiral"]`` is truthy, and "I" in every other case.
    """
    if row["elliptical"]:
        return "E"
    return "S" if row["spiral"] else "I"
    
def distribution_per_type(df, col_name="", bins=20):
    """Overlay the distribution of ``col_name`` for each morphology flag.

    On the current axes, one ``sns.distplot`` is drawn for the rows whose
    ``elliptical``, ``spiral`` and ``uncertain`` flag equals 1 (the last
    group is labelled "irregular"), and a legend is added.
    """
    flag_to_label = (
        ("elliptical", "elliptical"),
        ("spiral", "spiral"),
        ("uncertain", "irregular"),
    )
    for flag, label in flag_to_label:
        selected = df[df[flag] == 1][col_name]
        sns.distplot(selected, label=label, bins=bins)
    plt.legend()
    
def exploratory_plots(df, col_name="", boxplot_kwargs=None):
    """Draw a three-panel exploratory figure for ``col_name``.

    Panels, top to bottom on the current figure: the per-type distributions
    (via ``distribution_per_type``), a boxplot of ``col_name`` per ``type``,
    and the same boxplot with outliers hidden.

    Parameters
    ----------
    df : DataFrame
        Must contain ``col_name``, a ``type`` column and the flag columns
        read by ``distribution_per_type``.
    col_name : str
        Column to plot.
    boxplot_kwargs : dict, optional
        Extra keyword arguments forwarded to ``sns.boxplot``. When omitted,
        a module-level ``box_params`` is used if one exists; otherwise no
        extra arguments are passed.
    """
    if boxplot_kwargs is None:
        try:
            boxplot_kwargs = box_params
        except NameError:
            # No global ``box_params`` available: use seaborn's defaults
            # instead of failing.
            boxplot_kwargs = {}

    plt.subplot(3, 1, 1)
    distribution_per_type(df, col_name)
    plt.subplot(3, 1, 2)
    sns.boxplot(x="type", y=col_name, data=df, **boxplot_kwargs)
    plt.subplot(3, 1, 3)
    sns.boxplot(x="type", y=col_name, data=df, showfliers=False,
                **boxplot_kwargs)

    plt.tight_layout()
    
def num_type(row):
    """Encode the morphology class stored in ``row["type"]`` as an integer.

    "I" maps to 1, "S" maps to 2 and any other value maps to 3.
    """
    kind = row["type"]
    if kind == "I":
        return 1
    return 2 if kind == "S" else 3

**Load**

In [None]:
# Load the galaxy catalogue from the CSV file in the working directory.
data = pd.read_csv('galaxias_1.csv')
# Preview the first two rows and the (rows, columns) shape.
display(data.head(2))
display (data.shape)
# Summary statistics; as the last expression of the cell it is rendered by the notebook.
data.describe()

**Valores faltantes y Tipo de los Features**

In [None]:
# ``DataFrame.info`` prints its report itself and returns None, so it is
# called directly (wrapping it in ``display`` would also render "None").
data.info()
# Heatmap of the boolean missing-value mask, without row tick labels.
sns.heatmap(data.isna(), yticklabels=False)

**Eliminación de Valores Duplicados**


In [None]:
# Use the object identifier as the index so duplicates can be detected on it.
data.set_index("objID", inplace=True)

# Report the duplicate counts explicitly: a bare expression in the middle of
# a cell is evaluated but never shown by the notebook.
n_dup_rows = int(data.astype(str).duplicated().sum())
n_dup_ids = int(data.index.astype(str).duplicated().sum())
print("Filas duplicadas:", n_dup_rows)
print("objID duplicados:", n_dup_ids)

# Keep only the first occurrence of every objID.
data_cl = data.loc[~(data.index.astype(str).duplicated(keep="first"))].copy()

In [None]:
# Shape and first rows of the de-duplicated table.
display(data_cl.shape)
display(data_cl.head())

**Distribución de Clases**

In [None]:
# Derive the one-letter morphology class for every row and plot how many
# rows fall in each class. The column is passed through explicit ``x``/``data``
# keywords so the call does not depend on seaborn's positional-argument order.
data_cl["type"] = data_cl.apply(galaxy_morf, axis=1)
sns.countplot(x="type", data=data_cl, order=["I", "S", "E"])

**Eliminación de valores que carecen de sentido físico**

In [None]:
# Row filters: keep Color strictly between -5 and 0, petroR90_r below 40 and
# every model magnitude with absolute value below 30.
mask_color = (data_cl["Color"] < 0) & (data_cl["Color"] > -5)
mask_petro = data_cl["petroR90_r"] < 40

# The absolute value must be taken BEFORE the comparison. The previous
# ``abs(col < 30)`` compared the raw value, so any large negative magnitude
# passed the filter for the g, r, i and z bands.
mag_cols = ["modelMag_u", "modelMag_g", "modelMag_r", "modelMag_i", "modelMag_z"]
mask_mag = (data_cl[mag_cols].abs() < 30).all(axis=1)

# Combine the three filters and report how many rows (and what percentage)
# survive.
mask_no_outl = mask_color & mask_petro & mask_mag
data_cl_no_outl = data_cl[mask_no_outl].copy()
print(data_cl.shape)
print(data_cl_no_outl.shape)
print(data_cl_no_outl.shape[0]/data_cl.shape[0]*100)

**Variable Numérica**

In [None]:
# Integer encoding of the morphology class (see ``num_type``: "I" -> 1,
# "S" -> 2, anything else -> 3), added to both the filtered and the
# unfiltered table.
data_cl_no_outl["type_n"] = data_cl_no_outl.apply(num_type, axis=1)
data_cl["type_n"] = data_cl.apply(num_type, axis=1)

# Clustering

In [None]:
from sklearn.cluster import DBSCAN, KMeans, MeanShift
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

## Muestra Estratificada

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# A single stratified shuffle split with a 20% held-out part and a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=2411)

In [None]:
# ``sss`` is configured with one split, so take it directly. The held-out
# part, stratified on ``type_n``, is the sample used for clustering; the
# indices are positional, hence ``iloc``.
train_idx, test_idx = next(sss.split(data_cl_no_outl, data_cl_no_outl["type_n"]))
strat_test_set = data_cl_no_outl.iloc[test_idx]

## Análisis sin variables de ubicación y tipo

In [None]:
# Column groups that are excluded from the clustering features.
pos_cols  = ["ra", "dec", "z"]
type_cols = ['elliptical', 'spiral', 'uncertain', 'type', 'type_n']
# Features without the type columns (position columns kept), standardised.
# ``fit_transform`` returns an array, so column names are lost here.
data_clus_pos = strat_test_set.drop(type_cols, axis=1)
data_clus_pos = StandardScaler().fit_transform(data_clus_pos)
# Features without type and position columns, standardised and wrapped back
# into a DataFrame with the original column names.
# NOTE(review): no ``index`` is passed, so the new frame gets a default
# integer index rather than the index of ``strat_test_set``.
data_clus     = strat_test_set.drop(type_cols + pos_cols, axis=1)
data_clus_cols = data_clus.columns
data_clus     = pd.DataFrame(data=StandardScaler().fit_transform(data_clus), 
                             columns=data_clus_cols)

# Show the feature names used for clustering.
data_clus_cols

In [None]:
def plot_silouette(silhouette_values, cluster_labels, silhouette_avg,
                   title="Visualizacion de los datos"):
    """Draw a silhouette plot: one horizontal band per cluster label.

    Parameters
    ----------
    silhouette_values : array
        Per-sample silhouette coefficients, indexable with a boolean mask.
    cluster_labels : array
        Cluster label of every sample, aligned with ``silhouette_values``.
    silhouette_avg : float
        Value at which the dashed red vertical line is drawn.
    title : str
        Title of the axes.
    """
    fig, ax1 = plt.subplots(1, 1)
    labels = np.unique(cluster_labels)
    n_clusters = len(labels)
    gap = 10  # blank rows left between consecutive cluster bands
    y_lower = gap
    for label in labels:
        # Sorted silhouette values of the samples assigned to this label.
        band = np.sort(silhouette_values[cluster_labels == label])
        band_size = band.shape[0]
        y_upper = y_lower + band_size

        color = cm.nipy_spectral(float(label) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, band,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Write the label next to the middle of its band.
        ax1.text(-0.05, y_lower + 0.5 * band_size, str(label))

        # The next band starts after the gap.
        y_lower = y_upper + gap

    ax1.set_title(title)

    # Vertical reference line at the average silhouette value.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # no y-axis ticks or labels
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.show()

### K-Means

In [None]:
# Candidate numbers of clusters evaluated for K-Means.
range_n_clusters = [2, 3, 4, 5, 6]
def serch_k_optimus(data_clus, range_n_clusters):
    """Fit K-Means for every candidate ``k`` and plot its silhouettes.

    For each ``k`` the average silhouette is printed and the per-cluster
    silhouette plot is drawn with ``plot_silouette``.

    Parameters
    ----------
    data_clus : array-like
        Feature matrix passed to ``KMeans.fit_predict``.
    range_n_clusters : iterable of int
        Numbers of clusters to try.

    Returns
    -------
    dict
        ``{k: inertia}`` with the ``inertia_`` of each fitted model, for the
        elbow plot.
    """
    sse = {}
    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(data_clus)
        sse[n_clusters] = clusterer.inertia_

        # Compute the per-sample silhouettes once and average them. Calling
        # ``silhouette_score`` as well would repeat the same pairwise-distance
        # computation to obtain this same mean.
        sample_silhouette_values = silhouette_samples(data_clus, cluster_labels)
        silhouette_avg = np.mean(sample_silhouette_values)
        print("Para n_clusters =", n_clusters,
              "El silhouette_score promedio es :", silhouette_avg)

        plot_silouette(sample_silhouette_values, cluster_labels,
                       silhouette_avg, title="k={}".format(n_clusters))
    return sse


In [None]:
# Run the K-Means search; ``sse`` holds the value returned for each k.
sse = serch_k_optimus(data_clus, range_n_clusters)

In [None]:
# Elbow plot: value stored in ``sse`` for each number of clusters.
fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(18, 7)
plt.suptitle(("Metodo del codo para kmedias "),
                 fontsize=14, fontweight='bold')
# Keys of ``sse`` on the x axis, their values on the y axis.
ax1.plot(list(sse.keys()), list(sse.values()))
ax1.set_xlabel("Numero of cluster")
ax1.set_ylabel("Inercia")
plt.grid()

<span style="color:red">**Responda:**</span>

<span style="color:red">**¿Qué pueden decir en general de las siluetas para cada k?**</span>  

<span style="color:red">**¿Qué pueden decir acerca del score cuando se incrementa el número de clusters?**</span> 



<span style="color:red">**Según el método del codo más el análisis de las siluetas, ¿qué número de clusters es el mejor?**</span> 

### DBScan

DBSCAN nos devuelve una etiqueta -1 para las muestras ruidosas. Por lo tanto, si tenemos un clustering con muchas de esas muestras, lo descartamos.

In [None]:
from itertools import product

In [None]:
# DBSCAN hyper-parameter grid and the noise-ratio threshold passed to
# ``search_dbscan_optimus``.
n_min_samples = [2, 3, 4]
n_eps         = [0.2, 0.25, 0.3, 0.35]
noise_ratio_limit = 0.47
def search_dbscan_optimus(data_clus, n_min_samples, n_eps, noise_ratio_limit):
    """Fit DBSCAN over a ``min_samples`` x ``eps`` grid and plot silhouettes.

    A silhouette plot is drawn only for combinations that yield more than one
    label and whose fraction of noise samples (label -1) is below
    ``noise_ratio_limit``; otherwise the reason for skipping is printed.

    Parameters
    ----------
    data_clus : array-like
        Feature matrix passed to ``DBSCAN.fit_predict``.
    n_min_samples : iterable of int
        Candidate ``min_samples`` values.
    n_eps : iterable of float
        Candidate ``eps`` values.
    noise_ratio_limit : float
        Noise fraction at or above which a clustering is discarded.
    """
    for min_samples, eps in product(n_min_samples, n_eps):
        print("*"*80)
        print("min_samples={} y eps={}".format(min_samples, eps))
        print("*"*80)
        dbscan_clusters = DBSCAN(eps=eps, min_samples=min_samples)
        print("Start fitting")
        cluster_labels = dbscan_clusters.fit_predict(data_clus)
        print("Stop fitting")
        # Fraction of samples labelled as noise.
        noise_samples_ratio = np.mean(cluster_labels == -1)
        n_labels = len(np.unique(cluster_labels))

        if n_labels > 1 and noise_samples_ratio < noise_ratio_limit:
            # Compute the per-sample silhouettes once and average them,
            # rather than also calling ``silhouette_score`` for the same mean.
            sample_silhouette_values = silhouette_samples(data_clus,
                                                          cluster_labels)
            silhouette_avg = np.mean(sample_silhouette_values)
            print("El silhouette_score promedio es :", silhouette_avg)

            plot_silouette(sample_silhouette_values, cluster_labels,
                           silhouette_avg, title="{}-{}".format(eps, min_samples))
        elif n_labels == 1:
            print("Solo 1 cluster identificado")
        elif noise_samples_ratio >= noise_ratio_limit:
            print("El cluster ruido es muy grande: {}".format(noise_samples_ratio))

In [None]:
# Run the DBSCAN grid search on the standardised features.
search_dbscan_optimus(data_clus, n_min_samples, n_eps, noise_ratio_limit)

### Gaussian Mixtures

In [None]:
# Candidate numbers of mixture components; rebinds the list used for K-Means above.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]

def serch_gauss_optimus(data_clus, range_n_clusters):
    """Fit a Gaussian mixture per candidate size and plot its silhouettes.

    For each number of components the average silhouette is printed and the
    silhouette plot is drawn with ``plot_silouette``.

    Parameters
    ----------
    data_clus : array-like
        Feature matrix passed to ``GaussianMixture.fit_predict``.
    range_n_clusters : iterable of int
        Numbers of components to try.

    Returns
    -------
    dict
        Always empty: nothing is stored in it. It is returned only to mirror
        the signature of ``serch_k_optimus``.
    """
    sse = {}
    for n_clusters in range_n_clusters:
        clusterer = GaussianMixture(n_components=n_clusters, random_state=10)
        print("Start fitting")
        cluster_labels = clusterer.fit_predict(data_clus)
        print("Stop fitting")

        # Compute the per-sample silhouettes once and average them, rather
        # than also calling ``silhouette_score`` for the same mean.
        sample_silhouette_values = silhouette_samples(data_clus, cluster_labels)
        silhouette_avg = np.mean(sample_silhouette_values)
        print("Para n_clusters =", n_clusters,
              "El silhouette_score promedio es :", silhouette_avg)

        plot_silouette(sample_silhouette_values, cluster_labels,
                       silhouette_avg, title="k={}".format(n_clusters))
    return sse


### Mean Shift

In [None]:
def serch_ms_optimus(data_clus, bandwidth):
    """Fit MeanShift for every bandwidth and plot its silhouettes.

    For each bandwidth the average silhouette is printed and the silhouette
    plot is drawn with ``plot_silouette``. Bandwidths for which the
    silhouette is undefined (fewer than two clusters, or one cluster per
    sample) are reported and skipped.

    Parameters
    ----------
    data_clus : array-like
        Feature matrix passed to ``MeanShift.fit_predict``.
    bandwidth : iterable of float
        Bandwidth values to try.

    Returns
    -------
    dict
        Always empty: nothing is stored in it. It is returned only to mirror
        the signature of ``serch_k_optimus``.
    """
    sse = {}
    for bw in bandwidth:
        clusterer = MeanShift(bandwidth=bw, bin_seeding=True)
        print("Start fitting")
        cluster_labels = clusterer.fit_predict(data_clus)
        print("Stop fitting")

        # The silhouette needs between 2 and n_samples - 1 distinct labels.
        # A large bandwidth can merge everything into one cluster, which
        # would otherwise raise inside scikit-learn and stop the whole search.
        n_labels = len(np.unique(cluster_labels))
        if n_labels < 2 or n_labels >= len(cluster_labels):
            print("Para bandwidth =", bw, "se identificaron", n_labels,
                  "clusters; no se calcula la silueta")
            continue

        # Compute the per-sample silhouettes once and average them, rather
        # than also calling ``silhouette_score`` for the same mean.
        sample_silhouette_values = silhouette_samples(data_clus, cluster_labels)
        silhouette_avg = np.mean(sample_silhouette_values)
        print("Para bamdwith =", bw,
              "El silhouette_score promedio es :", silhouette_avg)

        plot_silouette(sample_silhouette_values, cluster_labels,
                       silhouette_avg, title="bw={}".format(bw))
    return sse


<span style="color:red">**Responda:**</span>

<span style="color:red">**¿Qué pueden decir en general de las siluetas en los métodos DBScan, Gaussian Mixtures y Mean Shift?**</span>  

## Visualizacion segun cluster label

In [None]:
def distribution_per_label(df, col_name="", bins=20):
    """Overlay the distribution of ``col_name`` for every cluster label.

    One ``sns.distplot`` is drawn on the current axes per distinct value of
    ``df["cluster_label"]``; the axes are titled with the capitalised column
    name and a legend identifies each label.

    Parameters
    ----------
    df : DataFrame
        Must contain ``col_name`` and a ``cluster_label`` column.
    col_name : str
        Column whose distribution is plotted.
    bins : int
        Number of histogram bins passed to ``sns.distplot``.
    """
    # These must be f-strings: as plain literals the title and the legend
    # entries showed the placeholder text instead of the actual values.
    plt.title(f"{col_name.capitalize()} Distribution")
    for kk in df["cluster_label"].unique():
        sns.distplot(df[df["cluster_label"] == kk][col_name],
                     label=f"cluster_label={kk}", bins=bins)
    plt.legend()
    
def exploratory_plots_label(df, col_name="", boxplot_kwargs=None):
    """Draw a two-panel figure of ``col_name`` split by cluster label.

    Top panel: per-label distributions (via ``distribution_per_label``).
    Bottom panel: boxplot of ``col_name`` per ``cluster_label`` with outliers
    hidden. Both are drawn on the current figure.

    Parameters
    ----------
    df : DataFrame
        Must contain ``col_name`` and a ``cluster_label`` column.
    col_name : str
        Column to plot.
    boxplot_kwargs : dict, optional
        Extra keyword arguments forwarded to ``sns.boxplot``. When omitted,
        a module-level ``box_params`` is used if one exists; otherwise no
        extra arguments are passed.
    """
    if boxplot_kwargs is None:
        try:
            boxplot_kwargs = box_params
        except NameError:
            # No global ``box_params`` available: use seaborn's defaults
            # instead of failing.
            boxplot_kwargs = {}

    plt.subplot(2, 1, 1)
    distribution_per_label(df, col_name)
    plt.subplot(2, 1, 2)
    sns.boxplot(x="cluster_label", y=col_name, data=df, showfliers=False,
                **boxplot_kwargs)

    plt.tight_layout()


<span style="color:red">**Elijan los dos modelos que para ustedes tienen los mejores resultados y visualicen sus propiedades según el cluster label.**</span>  

**ayuda**

In [None]:
# Template cell: ``MODELO1`` and ``MODELO2`` are placeholders that are not
# defined in this notebook. Replace each with the constructor call of one of
# the two chosen clustering models before running.
cluster_1 = MODELO1()
cluster_labels_1 = cluster_1.fit_predict(data_clus)

cluster_2 = MODELO2()
cluster_labels_2 = cluster_2.fit_predict(data_clus)

In [None]:
# One copy of the feature table per model, each with that model's labels
# attached as a "cluster_label" column.
data_clus_1 = data_clus.copy()
data_clus_2 = data_clus.copy()
data_clus_1["cluster_label"] = cluster_labels_1
data_clus_2["cluster_label"] = cluster_labels_2

### Color

In [None]:
# The column is spelled "Color" (capitalised) where the catalogue is filtered
# earlier in this notebook, so the same spelling is used here. Each model gets
# its own figure, as in the magnitude cells, so the second set of subplots
# does not draw over the first.
plt.figure()
exploratory_plots_label(data_clus_1, "Color")
plt.figure()
exploratory_plots_label(data_clus_2, "Color")

### Petro R90

In [None]:
# Each model gets its own figure, as in the magnitude cells, so the second
# set of subplots does not draw over the first.
plt.figure()
exploratory_plots_label(data_clus_1, "petroR90_r")
plt.figure()
exploratory_plots_label(data_clus_2, "petroR90_r")

### Mag Distributions

In [None]:
# One new figure per magnitude column, for the first model's labels.
for mag in ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i','modelMag_z']:
    plt.figure()
    exploratory_plots_label(data_clus_1, mag)

In [None]:
# One new figure per magnitude column, for the second model's labels.
for mag in ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i','modelMag_z']:
    plt.figure()
    exploratory_plots_label(data_clus_2, mag)

<span style="color:red">**A qué conclusiones llegan?**</span>  

<span style="color:red">**Escojan un modelo y apliquen los siguientes embeddings**</span> 

In [None]:
# Template placeholder: assign the labelled DataFrame of the chosen model here.
# The cells below read its ``columns`` and its "cluster_label" column.
# As written the statement is incomplete and will not run until a value is supplied.
data_clus_N = 

# Embeddings

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
def print_title(title="", delimiter="*"):
    """Print ``title`` between two 80-character rules made of ``delimiter``."""
    rule = delimiter * 80
    print(rule, title, rule, sep="\n")

In [None]:
def plot_2d(pca, df, colname=""):
    """Scatter the first two columns of an embedding, coloured by a column.

    Parameters
    ----------
    pca : 2-D array
        Embedded coordinates; columns 0 and 1 are plotted.
    df : DataFrame
        Table providing the colour values, read as ``df[colname]``.
    colname : str
        Column of ``df`` used to colour the points ("jet" colormap).
    """
    # The 3-D toolkit import that used to be here was unused in this 2-D plot.
    fig = plt.figure(figsize=(15,10))
    ax = fig.add_subplot(1,1,1)
    im = ax.scatter(pca[:,0],pca[:,1],
                    c=df[colname],cmap=plt.get_cmap("jet"),
                    alpha=0.5)
    ax.set_xlabel("Axis 1")
    ax.set_ylabel("Axis 2")
    fig.colorbar(im, ax=ax)

In [None]:
def plot_3d(pca, df, colname=""):
    """Scatter the first three columns of an embedding from four view angles.

    A 2x2 grid of 3-D axes is drawn; every panel shows the same points
    coloured by ``df[colname]`` ("jet" colormap) at elevation 30 and a
    different azimuth, each with its own colorbar.
    """
    # The name is unused. NOTE(review): the import is presumably kept for its
    # side effect of registering the '3d' projection on older Matplotlib
    # releases - confirm before removing.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    fig = plt.figure(figsize=(15,10))
    jet = plt.get_cmap("jet")
    # Azimuths are 45 + 90 * panel for panels 1..4.
    for panel, azimuth in enumerate((135, 225, 315, 405), start=1):
        ax = fig.add_subplot(2, 2, panel, projection='3d')
        im = ax.scatter(pca[:, 0], pca[:, 1], pca[:, 2],
                        c=df[colname], cmap=jet, alpha=0.5)
        ax.view_init(30, azimuth)
        ax.set_xlabel("Axis 1")
        ax.set_ylabel("Axis 2")
        ax.set_zlabel("Axis 3")
        fig.colorbar(im, ax=ax)

## PCA

In [None]:
# PCA projections of the chosen model's table onto 3 and 2 components.
pca_3dim = PCA(n_components=3)
pca_2dim = PCA(n_components=2)


# NOTE(review): the plotting cells below read data_clus_N["cluster_label"].
# If that column is present here it is also fed into the PCA as a feature -
# confirm whether it should be dropped before fitting.
pca_std_3d = pca_3dim.fit_transform(data_clus_N)
pca_std_2d = pca_2dim.fit_transform(data_clus_N)


In [None]:
# Explained-variance ratio per component and its total, for both projections.
# "nombre_modelo" is a literal placeholder label printed as-is.
print_title("3D")
print("nombre_modelo", pca_3dim.explained_variance_ratio_, sum(pca_3dim.explained_variance_ratio_))
print_title("2D")
print("nombre_modelo", pca_2dim.explained_variance_ratio_, sum(pca_2dim.explained_variance_ratio_))


In [None]:
# Loadings of the 3-component PCA: one row per input column, one column per axis.
# The table is built but not displayed by this cell.
axis_components = pd.DataFrame(index=data_clus_N.columns, columns=["Ax1", "Ax2", "Ax3"], 
                                  data=pca_3dim.components_.T)

In [None]:
# 3-D PCA projection coloured by cluster label.
plot_3d(pca_std_3d, data_clus_N, "cluster_label")

In [None]:
# 2-D PCA projection coloured by cluster label. The two-component projection
# has no third column, so it must go through ``plot_2d``; ``plot_3d`` indexes
# column 2 and would fail.
plot_2d(pca_std_2d, data_clus_N, "cluster_label")

## TSNE

In [None]:
# 2-D t-SNE embedding of the chosen model's table.
# NOTE(review): no random_state is passed, so the embedding may differ
# between runs - confirm whether reproducibility is wanted.
tsne_2dim = TSNE(n_components=2, )
tsne_std = tsne_2dim.fit_transform(data_clus_N)

In [None]:
# t-SNE embedding coloured by cluster label.
plot_2d(tsne_std, data_clus_N, "cluster_label")

<span style="color:red">**A qué conclusiones llegan?**</span>  

## Images

<span style="color:red">**Elijan un modelo y aplíquenlo a las imágenes junto con un embedding; analicen los resultados.**</span>  

**OPCIONAL**

<span style="color:red">**Realizar selección de features en imágenes usando Información Mutua/Chi-cuadrado**</span>  