# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### clustering
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

### graphical matplotlib basics
import matplotlib.pyplot as plt
# for jupyter notebook management
%matplotlib inline

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the 'ruspini_data' dataset through the project loader.
# NOTE(review): the guard below suggests the loader can return a
# non-DataFrame value (e.g. None) on failure -- confirm against dfc.
df_ri_raw = dfc.load_dataset_from_config("ruspini_data", sep=",")

# isinstance() is already False for None, so one test covers both conditions.
if isinstance(df_ri_raw, pd.DataFrame):
    dfc.log_general_info(df_ri_raw)
    # Presumably two duplicate-related counts; the index map is printed only
    # when they differ -- verify the exact meaning in dfc.
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_ri_raw)
    if nb_total != nb_first:
        print(dfc.duplicates_index_map(df_ri_raw))
    # Work on a copy so the raw frame is left untouched.
    df_ri = df_ri_raw.copy()
    display(df_ri.head())

In [None]:
# Select the numeric columns once and derive both summaries from them.
df_ri_num = df_ri.select_dtypes(include=np.number)

# Descriptive statistics of the numeric columns.
df_ri_desc = df_ri_num.describe()
display(df_ri_desc)

# Pairwise correlation matrix of the numeric columns.
df_ri_cr = df_ri_num.corr()
display(df_ri_cr)

## 2.2 Data quality refinement

In [None]:
# Original backup and duplicates management
df_ri_orig = df_ri.copy()
# df_ri = df_ri.drop_duplicates()

# 2. Data Clustering

## 2.1 General Analysis

In [None]:
# Visualisation brute des données
plt.scatter(df_ri.x,df_ri.y)
plt.ylabel('y')
plt.xlabel('x')
plt.title('Relation entre X et Y')
plt.tight_layout()
plt.show()

In [None]:
# Visualisation de distribution des données
liste_data = [df_ri['x'], df_ri['y']]
plt.figure()
plt.title('Diagramme en boîte des deux variables explicatives')
plt.boxplot(liste_data, tick_labels = ['x', 'y'])
plt.tight_layout()
plt.show()

## 2.2 K-Means

In [None]:
# Definition et Entrainement du modèle initial (paramètre naif)
clfKM = KMeans(n_clusters=2)
clfKM.fit(df_ri)

In [None]:
# Récupération des information des clusters et visualisation
centroids = clfKM.cluster_centers_
labels = clfKM.labels_
for i, cluster in enumerate(centroids):
    print(f"Cluster [{i}] avec centre en position [{centroids[i][0]}, {centroids[i][1]}]")
colors = ["g.","r."]
# Graphique des données
for i in range(len(df_ri)):
    plt.plot(np.array(df_ri.iloc[i,0]), np.array(df_ri.iloc[i,1]), colors[labels[i]], markersize = 10)
# Graphique des centroïdes
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "o", color = "blue",s=30, linewidths = 1, zorder = 10)
plt.show()

In [None]:
# Optimisation du modèle sur ses hyperparamètres et représentation graphique
range_n_clusters = [i for i in range(2,8)]
d_scores = []
for k in range_n_clusters:
    clfKM = KMeans(n_clusters = k)
    clfKM.fit(df_ri)
    centroids = clfKM.cluster_centers_
    labels = clfKM.labels_
    d_score = np.sum(np.min(cdist(df_ri, centroids, 'euclidean'), axis=1))/np.size(df_ri, axis=0)
    d_scores.append(d_score) 

plt.plot(range_n_clusters, d_scores, 'gx-')
plt.xticks(range_n_clusters)
plt.xlabel('Nombre de Clusters K')
plt.ylabel('Distorsion SSW/(SSW+SSB)')
plt.title('Méthode du coude affichant le nombre de clusters optimal')
plt.show()

In [None]:
# Définition et entrainement du modèle final
clfKM = KMeans(n_clusters=4)
clfKM.fit(df_ri)

In [None]:
# Récupération des information des clusters et visualisation
centroids = clfKM.cluster_centers_
labels = clfKM.labels_
for i, cluster in enumerate(centroids):
    print(f"Cluster [{i}] avec centre en position[{centroids[i][0]}, {centroids[i][1]}]")
colors = ["g.","r.","c.","y."]
# Graphique des données
for i in range(len(df_ri)):
    plt.plot(np.array(df_ri.iloc[i,0]), np.array(df_ri.iloc[i,1]), colors[labels[i]], markersize = 10)
# Graphique des centroïdes
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "o", color = "blue",s=30, linewidths = 1, zorder = 10)
plt.show()