# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### classification
from sklearn import svm
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, label_binarize
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

### graphical plotly basics
import plotly.graph_objects as go
import plotly.express as px
# Jupyter display management: make "notebook" the default Plotly renderer,
# i.e. the renderer used when fig.show() is called without an explicit one.
import plotly.io as pio
pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the wines dataset through the project helper (comma-separated source).
df_wines_raw = dfc.load_dataset_from_config('wines_data', sep=',')

# isinstance() is False for None, so this single check also rejects a None result.
# NOTE(review): df_wines is only defined when this branch runs; every later cell relies on it.
if isinstance(df_wines_raw, pd.DataFrame):
    # Quick look at the raw data, then the helper's general information log.
    display(df_wines_raw.head())
    dfc.log_general_info(df_wines_raw)

    # The helper returns two counters; the duplicates index map is only printed when they differ.
    # NOTE(review): the exact meaning of both counters is defined in smartcheck.dataframe_common.
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_wines_raw)
    if nb_total != nb_first:
        print(dfc.duplicates_index_map(df_wines_raw))

    # Working frame with normalized column names.
    df_wines = dfc.normalize_column_names(df_wines_raw)
    display(df_wines.head())

In [None]:
# Descriptive statistics and pairwise correlations, restricted to the numeric columns.
numeric_wines = df_wines.select_dtypes(include=np.number)

df_wines_desc = numeric_wines.describe()
df_wines_cr = numeric_wines.corr()

display(df_wines_desc)
display(df_wines_cr)

## 2.2 Data quality refinement

In [None]:
# Keep a backup of the frame as loaded, then manage duplicates:
# rows identical on every column are dropped, keeping the first occurrence.
df_wines_orig = df_wines.copy(deep=True)
df_wines = df_wines.drop_duplicates(keep='first')

# 2. Data Classification

## 2.1 General analysis and feature/target separation

In [None]:

# Categorisation for a first-level "human eye" estimation of how well a single
# feature separates the classes, using the quartiles of that feature.
QUARTILE_LABELS = ['low', 'medium-', 'medium+', 'high']


def quartile_categories(column, labels=QUARTILE_LABELS):
    """Bin one numeric column of df_wines into its four quartile ranges.

    Bin edges (min, 25%, 50%, 75%, max) are read from df_wines_desc, the
    describe() table computed in an earlier cell.

    Parameters
    ----------
    column : str
        Name of a numeric column present in both df_wines and df_wines_desc.
    labels : list of str
        The four category labels, from lowest to highest range.

    Returns
    -------
    pandas.Series
        Categorical series aligned with df_wines.
    """
    stats = df_wines_desc[column]
    edges = [stats[key] for key in ('min', '25%', '50%', '75%', 'max')]
    # include_lowest=True closes the first interval on the left; without it the
    # row(s) holding the minimum value fall outside every bin and become NaN,
    # which silently removes them from the crosstab.
    return pd.cut(x=df_wines[column], bins=edges, labels=labels, include_lowest=True)


# Share of each class inside each quartile range (every column sums to 1).
malic_acid = quartile_categories('malic_acid')
display(pd.crosstab(df_wines['class'], malic_acid, normalize='columns'))
flavanoids = quartile_categories('flavanoids')
display(pd.crosstab(df_wines['class'], flavanoids, normalize='columns'))


In [None]:
# Separate the variable to predict (target) from the explanatory variables (features).
target = df_wines['class']
data = df_wines.drop(columns='class')

In [None]:
# Split into training data (80% of the rows) and test data (the remainder).
# The fixed random_state makes the split reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=66,
)

## 2.2 Logistic Regression and One Hot Encoder preprocessing

In [None]:
# Preprocess the explanatory variables of the training and test sets (one-hot encoding).
# The encoder is fitted on the training data only: fitting (re)learns the categories
# from X_train, so the test set is just transformed with those same categories.
# NOTE(review): every column of X_train is treated as categorical here, and with
# handle_unknown='ignore' a test value never seen during fit is encoded as all zeros
# for that feature — confirm this is the intended preprocessing for numeric features.
enc = OneHotEncoder(handle_unknown='ignore').fit(X_train)
X_train_enc, X_test_enc = enc.transform(X_train), enc.transform(X_test)

In [None]:
# Define the logistic regression model, then train it on the encoded training
# features; the labels are passed as a flat 1-D numpy array.
clfLR = linear_model.LogisticRegression(C=1.0)
clfLR.fit(X_train_enc, np.ravel(y_train.to_numpy()))

In [None]:
# Predict the classes of the (encoded) test data with the trained logistic regression model.
# NOTE: y_pred is reassigned by the SVM cells further down; re-run this cell before
# re-running the logistic regression evaluation cell.
y_pred = clfLR.predict(X_test_enc)

In [None]:
# Confusion matrix and evaluation of the logistic regression model.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same counts as a labelled cross-tabulation (rows: real class, columns: predicted class).
df_cm = pd.crosstab(
    y_test.to_numpy().ravel(),
    y_pred,
    rownames=['real'],
    colnames=['predicted'],
)
display(df_cm)

# Accuracy rebuilt by hand: the correct predictions sit on the diagonal of the
# confusion matrix, so accuracy = diagonal sum / total count.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfLR.score(X_test_enc, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))

In [None]:
# [Optional] Retrieve the class probabilities and re-apply the default decision rule
# by hand: the predicted class is the one with the highest probability.
y_probs = clfLR.predict_proba(X_test_enc)

# Column index of the most probable class for each sample. np.argmax returns the
# first maximum, so a tie always goes to the lowest column and every sample is
# flagged for exactly one class. (The previous mix of >= and > comparisons could
# flag two classes for the same sample when probabilities were tied.)
winning_column = np.argmax(y_probs, axis=1)

# One 0/1 indicator vector per class; column k of y_probs is the (k+1)-th class.
y_pred_prob_class1 = np.where(winning_column == 0, 1, 0)
y_pred_prob_class2 = np.where(winning_column == 1, 1, 0)
y_pred_prob_class3 = np.where(winning_column == 2, 1, 0)
print("Application manuelle des règles\n",
      y_pred_prob_class1,"\n",
      y_pred_prob_class2,"\n",
      y_pred_prob_class3)

In [None]:
# Check the AUC and visualise it with ROC (Receiver Operating Characteristic) curves,
# one curve per class in a one-vs-rest fashion.

# y_pred_bin : TRUE test labels (despite the name), one-hot encoded, e.g. for labels [1, 2, 3, 1, 3]
# y_probs    : predicted probabilities from clfLR.predict_proba, shape (n_samples, n_classes)
# The class list comes from the fitted model, so column i of y_probs and column i
# of y_pred_bin always refer to the same class, whatever the label values are.
classes = list(clfLR.classes_)
n_classes = len(classes)
y_pred_bin = np.asarray(label_binarize(y_test, classes=classes))  # one-hot
if y_pred_bin.shape[1] == 1:
    # label_binarize yields a single column for a two-class problem;
    # expand it so that there is one column per class.
    y_pred_bin = np.hstack([1 - y_pred_bin, y_pred_bin])

# Per-class results, keyed by the column index of the class.
fpr = {}
tpr = {}
roc_auc = {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_pred_bin[:, i], y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# # Optional: macro-average
# all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# mean_tpr = np.zeros_like(all_fpr)
# for i in range(n_classes):
#     mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# mean_tpr /= n_classes
# macro_auc = auc(all_fpr, mean_tpr)

# Draw the ROC curves (with their AUC) using Plotly
fig = go.Figure()
# Diagonal = chance level
fig.add_trace(go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Aléatoire',
    line=dict(dash='dash')
))
# One ROC curve per class; the legend shows the real class label (not the
# 0-based column index) together with the AUC.
for i in range(n_classes):
    fig.add_trace(go.Scatter(
        x=fpr[i],
        y=tpr[i],
        mode='lines',
        name=f"Classe {classes[i]} (AUC = {roc_auc[i]:.2f})"
    ))
# Layout
fig.update_layout(
    title="Courbes ROC multi-classes (One-vs-Rest)",
    xaxis_title="Taux de faux positifs",
    yaxis_title="Taux de vrais positifs",
    legend_title="Classes",
    width=800,
    height=600
)
fig.show()

## 2.3 Support Vector Machine (SVM) with scaler preprocessing

In [None]:
# Preprocess the explanatory variables with a standard scaler. The scaler is
# fitted on the training set only; the test set reuses the fitted scaler.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 2.3.1 Sans recherche d'hyperparamètres

In [None]:
# Define the SVM classifier (polynomial kernel, hand-picked gamma) and train it
# on the scaled training data.
clfSVM = svm.SVC(kernel='poly', gamma=0.01)
clfSVM.fit(X_train_scaled, y_train)

In [None]:
# Apply the trained SVM to the scaled test data.
# NOTE: this overwrites the y_pred produced earlier by the logistic regression.
y_pred = clfSVM.predict(X_test_scaled)

In [None]:
# Confusion matrix and evaluation of the SVM trained with hand-picked parameters.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same counts as a labelled cross-tabulation (rows: real class, columns: predicted class).
df_cm = pd.crosstab(
    y_test.to_numpy().ravel(),
    y_pred,
    rownames=['real'],
    colnames=['predicted'],
)
display(df_cm)

# Accuracy rebuilt by hand from the diagonal (correct predictions) of the confusion matrix.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfSVM.score(X_test_scaled, y_test))
# print("Rapport de classification complet:\n", classification_report(y_test, y_pred))

### 2.3.2 Avec recherche d'hyperparamètres (GridSearchCV)

In [None]:
# Hyperparameter grid: the search evaluates every (C, kernel, gamma) combination.
parameters = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'linear', 'poly'],
    gamma=[0.001, 0.1, 0.5],
)
# The search is built around the SVC defined earlier, with the grid above.
grid_clfSVM = GridSearchCV(clfSVM, parameters)

In [None]:
# Fit the model on the training data for every combination of the hyperparameter grid.
result_grid = grid_clfSVM.fit(X_train_scaled, y_train)
# After training, the best estimator can be retrieved from the search result.
best_clfSVM = result_grid.best_estimator_
# Print best_params_ rather than the estimator itself: an estimator's printed form
# leaves out parameters equal to their defaults, so the winning combination
# could be displayed incompletely.
print("Meilleure combinaison de paramètres trouvée pour les données d'entrainement:",result_grid.best_params_)

In [None]:
# Visualise how the grid performed, with Plotly Express: one bar per parameter
# combination, placed by rank and coloured by kernel.
df_result_grid = pd.DataFrame(result_grid.cv_results_)
# Text version of each parameter dict (shown on hover) and a rounded mean score.
df_result_grid["params_str"] = df_result_grid["params"].map(str)
df_result_grid["mts_str"] = df_result_grid["mean_test_score"].round(3)

# Hover content: show the parameters and the mean score, hide the other columns.
hover_columns = {
    "params_str": True,
    "mean_test_score": True,
    "rank_test_score": False,
    "param_kernel": False,
    "mts_str": False,
}
fig3 = px.bar(
    df_result_grid,
    x="rank_test_score",
    y="mean_test_score",
    color="param_kernel",
    text="rank_test_score",
    hover_data=hover_columns,
)
fig3.update_layout(
    title="Distribution des scores par rang avec coloration selon le kernel",
    xaxis_title="Rang",
    yaxis_title="Mean Test Score",
    legend_title='Kernel utilisé',
    width=1000,
    height=600,
)

fig3.show()

In [None]:
# Apply the grid-searched model to the scaled test data (prediction goes through
# the fitted GridSearchCV object).
# NOTE: this overwrites the y_pred produced by the previous SVM cell.
y_pred = grid_clfSVM.predict(X_test_scaled)

In [None]:
# Confusion matrix and evaluation of the grid-searched SVM.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same counts as a labelled cross-tabulation (rows: real class, columns: predicted class).
df_cm = pd.crosstab(
    y_test.to_numpy().ravel(),
    y_pred,
    rownames=['real'],
    colnames=['predicted'],
)
display(df_cm)

# Accuracy rebuilt by hand from the diagonal (correct predictions) of the confusion matrix.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", grid_clfSVM.score(X_test_scaled, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))