# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### clustering
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics import silhouette_score

### graphical matplotlib basics
import matplotlib.pyplot as plt
# for jupyter notebook management
%matplotlib inline

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the 'fromage_data' dataset through the project helper. `sep` and
# `index_col` are handed to the helper as-is (presumably forwarded to the
# underlying CSV reader — verify in smartcheck.dataframe_common).
df_fro_raw = dfc.load_dataset_from_config('fromage_data', sep='\t', index_col=0)

# isinstance() is already False for None, so one check guards the whole cell.
# When the load did not return a DataFrame nothing below runs and `df_fro`
# stays undefined.
if isinstance(df_fro_raw, pd.DataFrame):
    dfc.log_general_info(df_fro_raw)
    # The helper returns two counts; the duplicate index map is printed only
    # when they differ (presumably "first occurrences" vs "all rows" — confirm).
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_fro_raw)
    if nb_first != nb_total:
        print(dfc.duplicates_index_map(df_fro_raw))
    # Work on a copy so the raw frame stays available unchanged.
    df_fro = df_fro_raw.copy()
    display(df_fro.head())

In [None]:
# Restrict to numeric columns once, then derive both summaries from that view.
df_fro_num = df_fro.select_dtypes(include=np.number)

# Descriptive statistics of the numeric columns.
df_fro_desc = df_fro_num.describe()
display(df_fro_desc)

# Pairwise correlation matrix of the numeric columns.
df_fro_cr = df_fro_num.corr()
display(df_fro_cr)

## 2.2 Data quality refinement

In [None]:
# Keep an untouched snapshot of the working frame before any refinement.
# No duplicate removal is applied in this cell.
df_fro_orig = df_fro.copy(deep=True)

# 2. Data Clustering

## 2.1 General Analysis

In [None]:
# Visualisation brute des données
# Not applicable

In [None]:
# Distribution overview: one box per column of the working frame, which makes
# differences in scale between the variables visible at a glance.
column_names = list(df_fro.columns)
liste_data = [df_fro[name] for name in column_names]

plt.figure()
plt.boxplot(liste_data, tick_labels=column_names)
plt.title('Diagramme en boîte des variables explicatives')
plt.tight_layout()
plt.show()

## 2.2 Agglomerative Clustering (CAH : Classification Ascendante Hiérarchique)

In [None]:
# Baseline hierarchical model. The cluster count is a naive first guess; it is
# revisited in the silhouette sweep further down.
n_clusters_init = 4
clfAC = AgglomerativeClustering(n_clusters=n_clusters_init)
# Left as the last expression so the notebook shows the fitted estimator.
clfAC.fit(df_fro)

In [None]:
# Cluster assignments and input column names exposed by the fitted model.
labels = clfAC.labels_
features = clfAC.feature_names_in_
print(features)

# Linkage matrix (Ward criterion, Euclidean distances) computed separately
# with SciPy, because the dendrogram is drawn from it rather than from clfAC.
lk = linkage(df_fro, method='ward', metric='euclidean')

# Height at which the tree is cut for colouring; the title states that this
# height materialises 4 classes (taken from the title text — confirm visually).
cut_height = 300

plt.figure(figsize=(15, 6))
plt.title("Dendrogramme CAH avec matérialisation des 4 classes (hauteur t = 300)")
# Leaves are labelled with the frame index. The trailing semicolon suppresses
# the notebook echo of the value returned by dendrogram().
dendrogram(lk, labels=df_fro.index, leaf_rotation=90., color_threshold=cut_height);

In [None]:
# Silhouette coefficient of the current clustering (value lies in [-1, 1]).
# NOTE(review): 'seuclidean' is the standardized Euclidean distance, while no
# scaling step is visible before the model fit — confirm this metric is
# intended (it may be a typo for 'sqeuclidean').
silhouette_coef = silhouette_score(df_fro, labels=labels, metric='seuclidean')
print("Coefficient de silhouette:", silhouette_coef)

In [None]:
# Hyperparameter sweep: silhouette coefficient as a function of the number of
# clusters, followed by a line plot of the result.
range_n_clusters = list(range(2, 15))  # k = 2 .. 14
s_scores = []
for k in range_n_clusters:
    # Loop-local name on purpose: the notebook-level `clfAC` and `labels`
    # (the 4-cluster model) are no longer overwritten, so the earlier cells
    # give the same result when re-run after this one.
    model_k = AgglomerativeClustering(n_clusters=k).fit(df_fro)
    # NOTE(review): 'seuclidean' is the standardized Euclidean distance, while
    # no scaling step is visible before the model fit — confirm this metric
    # is intended (it may be a typo for 'sqeuclidean').
    s_scores.append(
        silhouette_score(df_fro, labels=model_k.labels_, metric='seuclidean')
    )

# NB: the silhouette coefficient evaluates intra-cluster homogeneity and
# inter-cluster separation at the same time.
plt.plot(range_n_clusters, s_scores, 'gx-')
plt.xticks(range_n_clusters)
plt.xlabel('Nombre de Clusters K')
# Axis label fixed: the formula needs parentheses and a comma to be read as
# (b - a) / max(a, b).
plt.ylabel('Coefficient de silhouette (b - a) / max(a, b)')
plt.title('Graphique du coefficient de silhouette en fonction du nombre de clusters')
plt.show()