# loading libs

In [2]:
from copy import copy

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import nimfa
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from tqdm.notebook import tqdm


### Plot clustering stats for nmf
- Euclidean Distance
- Kullback-Leibler Distance
- Number of iterations
- Sparseness
- Cophenetic

In [None]:
def plot_clustering_stats_nmf(data, rank=(2,5), model='Bmf'):
    """Estimate factorization quality over a range of ranks and plot it.

    Runs nimfa's ``estimate_rank`` on ``data`` for every rank in
    ``range(rank[0], rank[1])`` and renders two figures:

    * five stacked line plots sharing the x axis (one per measure: ``rss``,
      ``kl``, ``n_iter``, ``sparseness``, ``cophenetic``);
    * two box plots, one box per rank, built from the second element of each
      ``predict_features`` / ``predict_samples`` entry (the original author
      labels these as probabilities).

    Parameters
    ----------
    data : array-like
        Matrix handed unchanged to the nimfa model constructor.
    rank : tuple of int, default (2, 5)
        ``(rank_min, rank_max)``. The pair is fed to ``range``, so
        ``rank_max`` itself is not evaluated.
    model : str, default 'Bmf'
        ``'Bmf'`` selects ``nimfa.Bmf``; any other value selects ``nimfa.Nmf``.

    Returns
    -------
    None
        Both figures are shown with ``plt.show()``.
    """
    # Size (inches) of the line-plot figure.
    height = 20
    width = 18
    rank_min, rank_max = rank

    # Bind the class to its own name instead of overwriting the `model` argument.
    model_cls = nimfa.Bmf if model == 'Bmf' else nimfa.Nmf

    # Training
    # NOTE(review): `rank=100` is presumably replaced by each candidate rank inside
    # estimate_rank, and `random_state` may not be consumed by nimfa - confirm.
    results = model_cls(data,
                        seed="random",
                        rank=100,
                        random_state=420,
                        max_iter=100).estimate_rank(rank_range=range(rank_min, rank_max),
                                                    n_run=10,
                                                    what='all')

    # Rows are looked up by measure name below; columns are the candidate ranks.
    results = pd.DataFrame(results)
    ranks = results.columns

    # The figure is created directly at its final size. Previously it was created
    # at (40, 18) and then resized by each pandas `.plot(figsize=...)` call.
    fig, axes = plt.subplots(nrows=5,
                             ncols=1,
                             figsize=(width, height),
                             sharex=True)

    fig_box, axes_box = plt.subplots(nrows=2,
                                     ncols=1,
                                     figsize=(25, 18))

    # Scalar measures: axes position -> (row in `results`, panel title)
    scalar_panels = {
        0: ('rss', 'Distance - Euclidean'),
        1: ('kl', 'Distance - Kullback-leibler'),
        2: ('n_iter', 'Number of iterations'),  # iteration at which optimization stopped
        4: ('cophenetic', 'Cophenetic'),
    }
    for position, (measure, title) in scalar_panels.items():
        results.loc[measure].plot(title=title,
                                  grid=True,
                                  ax=axes[position],
                                  lw=4)

    # Sparseness is stored as one pair per rank; expand it into two columns.
    # nimfa reports the pair as (basis W, mixture coefficients H), so the columns
    # are labelled in that order.
    spar = results.loc['sparseness'].values
    pd.DataFrame(spar.tolist(),
                 columns=['W', 'H'],
                 index=ranks).plot(title='Sparseness',
                                   grid=True,
                                   ax=axes[3],
                                   lw=4)

    # Box plots: one box per rank, using element [1] of every prediction entry.
    box_panels = [('predict_features', 'predict features probabilities'),
                  ('predict_samples', 'predict samples probabilities')]
    for box_ax, (measure, title) in zip(axes_box, box_panels):
        entries = results.loc[measure]
        per_rank = pd.DataFrame(dict(zip(entries.index,
                                         entries.map(lambda x: x[1]).values)))
        per_rank.boxplot(grid=True, ax=box_ax)
        # DataFrame.boxplot has no `title` argument, so set it on the axes.
        box_ax.set_title(title)

    # One tick per evaluated rank on the line plots.
    plt.setp(axes, xticks=ranks)
    # The box-plot axes are intentionally left alone: boxes are drawn at positions
    # 1..n and already labelled with the rank, so forcing the rank values as tick
    # positions would shift every label under the wrong box.

    plt.show()

In [None]:
def silhouette_plot(X, y, n_clusters, ax=None):
    """Draw a silhouette diagram for a clustering result.

    Parameters
    ----------
    X : array-like
        Samples, passed to scikit-learn's ``silhouette_score`` and
        ``silhouette_samples``.
    y : array-like
        Cluster label of each sample. Cluster ``i`` is selected with
        ``y == i`` for ``i`` in ``range(n_clusters)``, so labels are expected
        to be the integers ``0 .. n_clusters - 1``.
    n_clusters : int
        Number of clusters to draw.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted the current axes (``plt.gca()``) is used.

    Returns
    -------
    matplotlib Axes
        The axes that was drawn on.
    """
    from sklearn.metrics import silhouette_samples, silhouette_score
    # References: https://gist.github.com/clintval/e9afc246e77f6488cda79f86e4d37148

    # Record whether the caller supplied an axes *before* falling back to the
    # current one; the y-label choice below depends on it.
    standalone = ax is None
    if standalone:
        ax = plt.gca()

    # Guarantee element-wise `labels == i` masks even when `y` is a plain list.
    labels = np.asarray(y)

    # Compute the silhouette scores for each sample
    silhouette_avg = silhouette_score(X, labels)
    sample_silhouette_values = silhouette_samples(X, labels)

    # `plt.get_cmap` replaces `matplotlib.cm.get_cmap`, which was deprecated and
    # later removed from matplotlib.
    cmap = plt.get_cmap("Spectral")

    y_lower = padding = 2
    for i in range(n_clusters):
        # Sorted silhouette scores of the samples assigned to cluster i
        ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cmap(float(i) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0,
                         ith_cluster_silhouette_values,
                         facecolor=color,
                         edgecolor=color,
                         alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i + 1))

        # Leave a gap before the next cluster's band
        y_lower = y_upper + padding

    ax.set_xlabel("The silhouette coefficient values")
    # Generic label for a standalone plot; cluster count when drawn into a
    # caller-provided axes (e.g. one panel of a grid).
    title_label = 'Cluster label' if standalone else f"{n_clusters}-Clusters"
    ax.set_ylabel(title_label)

    # The vertical line for the average silhouette score of all the values
    ax.axvline(x=silhouette_avg, c='r', alpha=0.8, lw=0.8, ls='-')
    ax.annotate('Average',
                xytext=(silhouette_avg, y_lower * 1.025),
                xy=(0, 0),
                ha='center',
                alpha=0.8,
                c='r')

    ax.set_yticks([])  # Clear the yaxis labels / ticks
    ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.set_ylim(0, y_upper + 1)
    ax.set_xlim(-0.075, 1.0)
    return ax

### Silhouette plot for KPrototypes algorithm

In [None]:
def KPrototypes_silhouette_plot(n_cluster, modelled, ax=None):
    """Fit K-Prototypes on ``modelled`` and draw the resulting silhouette diagram.

    Parameters
    ----------
    n_cluster : int
        Number of clusters requested from ``KPrototypes``.
    modelled : DataFrame
        Data to cluster. Every column except the last three is declared
        categorical (by position).
    ax : matplotlib Axes, optional
        Forwarded to ``silhouette_plot``.
    """
    # Positional indices 0 .. n-4, i.e. all columns but the last three.
    categorical_positions = range(len(modelled.columns[:-3]))

    clusterer = KPrototypes(n_clusters=n_cluster,
                            init='Huang',
                            n_init=10,
                            n_jobs=-1,
                            random_state=420)
    clusterer.fit(modelled, categorical=list(categorical_positions))
    labels = clusterer.predict(modelled, categorical=list(categorical_positions))

    # The number of bands drawn is the number of distinct predicted labels.
    distinct_labels = len(pd.unique(labels))
    silhouette_plot(modelled, labels, distinct_labels, ax=ax)

### Davies-Bouldin Index

In [None]:
def davies_bouldin_plot(model, data, cluster_tuple):
    """Compute the Davies-Bouldin score for a range of cluster counts.

    Parameters
    ----------
    model : estimator
        Clustering estimator exposing ``set_params(n_clusters=...)`` and
        ``fit_predict``. A shallow copy is used for every cluster count.
    data : array-like
        Samples to cluster and to score.
    cluster_tuple : tuple of int
        ``(initial, final)``; counts in ``range(initial, final)`` are evaluated.

    Returns
    -------
    DataFrame
        One row per cluster count, with columns ``n_clusters`` and
        ``davies_bouldin_score``.
    """
    first_k, stop_k = cluster_tuple

    # cluster count -> score, filled in evaluation order
    score_by_k = {}
    for k in tqdm(range(first_k, stop_k)):
        candidate = copy(model)
        candidate.set_params(n_clusters=k)
        labels = candidate.fit_predict(data)
        score_by_k[k] = davies_bouldin_score(data, labels)

    return pd.DataFrame({'n_clusters': list(score_by_k.keys()),
                         'davies_bouldin_score': list(score_by_k.values())})

### Visualização da distância entre os grupos
- Nos gráficos abaixo, são representadas as distâncias entre os grupos de um método de agrupamento.
- O tamanho do círculo representa a proporção dos elementos na base que pertencem a cada grupo.
- A distância entre os centróides dos círculos representa a distância entre os centróides dos grupos.
- A sobreposição dos círculos não significa necessariamente que os grupos se sobrepõem no espaço original; é apenas uma representação.

In [None]:
from yellowbrick.cluster import InterclusterDistance

# Instantiate the clustering model and visualizer.
# `n_jobs` is not passed: scikit-learn deprecated it for KMeans in 0.23 and
# removed it in 1.0, where passing it raises a TypeError.
model = KMeans(9,
               random_state=420)
visualizer_intercluster = InterclusterDistance(model, random_state=420)

visualizer_intercluster.fit(modelled)        # Fit the data to the visualizer
visualizer_intercluster.show()        # Finalize and render the figure

### Método da silhueta
- Essa métrica busca definir quão compactos e pouco dispersos os elementos de cada grupo são. Os valores variam entre -1 e 1, sendo 1 o mais bem definido e -1 o menos definido.

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# Instantiate the clustering model and visualizer.
# `n_jobs` is not passed: scikit-learn deprecated it for KMeans in 0.23 and
# removed it in 1.0, where passing it raises a TypeError.
model = KMeans(9, random_state=420)
visualizer_four = SilhouetteVisualizer(model, colors='yellowbrick')

visualizer_four.fit(modelled)        # Fit the data to the visualizer
visualizer_four.show()        # Finalize and render the figure

### Método do cotovelo

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Elbow search scored with the silhouette metric.
# `k=(2,20)` is the range of cluster counts handed to the visualizer
# (NOTE(review): confirm in the yellowbrick docs whether the upper bound is inclusive).
# Instantiate the clustering model and visualizer
model = KMeans(random_state=420)
visualizer_silhouette = KElbowVisualizer(model, k=(2,20), metric='silhouette')

visualizer_silhouette.fit(modelled)        # Fit the data to the visualizer
# visualizer_silhouette.fit(sample)        # Fit the data to the visualizer
visualizer_silhouette.show()        # Finalize and render the figure

### Método de Calinski Harabasz
- Essa métrica define a razão entre a dispersão entre os grupos e a dispersão dentro de cada grupo, onde a dispersão é definida como a soma das distâncias ao quadrado.
- Para essa métrica, é definido que quanto maior o valor, mais bem definidos os clusters estão.

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Calinski-Harabasz index, also known as the Variance Ratio Criterion:
# the ratio of between-cluster dispersion to within-cluster dispersion,
# where dispersion is defined as the sum of squared distances.
# A higher Calinski-Harabasz score relates to a model with better defined clusters.
# `k=(2,20)` is the range of cluster counts handed to the visualizer
# (NOTE(review): confirm in the yellowbrick docs whether the upper bound is inclusive).


# Instantiate the clustering model and visualizer
model = KMeans(random_state=420)
visualizer_calinski_harabasz = KElbowVisualizer(model, k=(2,20), metric='calinski_harabasz')

visualizer_calinski_harabasz.fit(modelled)        # Fit the data to the visualizer
visualizer_calinski_harabasz.show()               # Finalize and render the figure