# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### classification
from sklearn import linear_model
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

### graphical plotly basics
import plotly.graph_objects as go
# for jupyter notebook display management
import plotly.io as pio
pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

### 2.1.1 WINES

In [None]:
# Load the raw wines dataset through the project's config-driven loader.
df_wines_raw = dfc.load_dataset_from_config('wines_data', sep=',')

# Every later cell reads `df_wines`. Stop here with an explicit message instead
# of skipping silently: otherwise a failed load surfaces later as a NameError,
# or worse, as a stale `df_wines` left over from a previous kernel run.
# (isinstance already rejects None, so no separate None check is needed.)
if not isinstance(df_wines_raw, pd.DataFrame):
    raise RuntimeError(
        "Could not load 'wines_data' as a DataFrame "
        f"(got {type(df_wines_raw).__name__}); check the dataset configuration."
    )

display(df_wines_raw.head())
dfc.log_general_info(df_wines_raw)
# The helper returns two counts; when they differ, the duplicates index map is
# printed. NOTE(review): exact meaning of the two counts is defined in
# smartcheck.dataframe_common — confirm there.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_wines_raw)
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_wines_raw))
# Working frame used by the rest of the notebook (column names normalized by
# the project helper).
df_wines = dfc.normalize_column_names(df_wines_raw)
display(df_wines.head())

In [None]:
# Summary statistics and pairwise correlations, restricted to numeric columns.
df_wines_numeric = df_wines.select_dtypes(include=np.number)

df_wines_desc = df_wines_numeric.describe()
display(df_wines_desc)

df_wines_cr = df_wines_numeric.corr()
display(df_wines_cr)

## 2.2 Data quality refinement

### 2.2.1 WINES

In [None]:
# Keep a backup of the frame as loaded, then continue with a copy from which
# exact duplicate rows have been removed (first occurrence is kept).
df_wines_orig = df_wines.copy()
df_wines = df_wines_orig.drop_duplicates(keep='first')

# 2. Data Classification

## 2.1 General Analysis

In [None]:

# Quartile-based categorisation, used for a first "human eye" estimate of how
# well a single feature separates the wine classes.
QUARTILE_LABELS = ['low', 'medium-', 'medium+', 'high']


def quartile_categories(values: pd.Series) -> pd.Series:
    """Bin a numeric series into its four quartiles.

    The previous implementation passed [min, 25%, 50%, 75%, max] to `pd.cut`
    without `include_lowest=True`; bins are left-open, so the rows holding the
    minimum value were turned into NaN and vanished from the crosstab. It also
    took the edges from `df_wines_desc`, which is computed before duplicates
    are dropped. `pd.qcut` derives the edges from the series being binned and
    keeps the minimum in the first bin.

    Parameters
    ----------
    values : pd.Series
        Numeric feature to categorise.

    Returns
    -------
    pd.Series
        Categorical series labelled 'low', 'medium-', 'medium+', 'high',
        aligned on the index of `values`.

    Raises
    ------
    ValueError
        If two quartile edges coincide (bin edges must be unique).
    """
    return pd.qcut(values, q=4, labels=QUARTILE_LABELS)


# Share of each class inside every quartile bin (each column sums to 1).
malic_acid = quartile_categories(df_wines.malic_acid)
display(pd.crosstab(df_wines['class'], malic_acid, normalize='columns'))

flavanoids = quartile_categories(df_wines.flavanoids)
display(pd.crosstab(df_wines['class'], flavanoids, normalize='columns'))


## 2.2 Logistic Regression

In [None]:
# Separate the variable to predict (target) from the explanatory variables (features).
target = df_wines['class']
data = df_wines.drop(columns=['class'])

In [None]:
# Split into training data (80%) and test data (the remainder).
# The fixed random_state keeps the split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=66,
)

In [None]:
# Preprocessing of the training and test features.
#
# The previous version one-hot encoded every column. On numeric measurements
# (e.g. malic_acid, flavanoids) each distinct value becomes its own category,
# and with handle_unknown='ignore' any test value not seen during training is
# encoded as all zeros, so the classifier gets almost no usable signal.
# Numeric columns are now standardized; one-hot encoding is kept only for
# non-numeric columns, if any (an empty column selection is simply skipped).
encoder = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), make_column_selector(dtype_include=np.number)),
        ('categorical', OneHotEncoder(handle_unknown='ignore'),
         make_column_selector(dtype_exclude=np.number)),
    ]
)
# NB: fit learns the statistics/categories from the training data only; the
# test data is transformed with those same fitted parameters (never re-fitted)
# to avoid leaking test information.
X_train_enc = encoder.fit_transform(X_train)
X_test_enc = encoder.transform(X_test)

In [None]:
# Fit a logistic-regression classifier on the preprocessed training features.
# The target is flattened to a 1-D array, the shape expected for labels.
logReg = linear_model.LogisticRegression(C=1.0)
logReg.fit(X_train_enc, np.ravel(y_train))

In [None]:
# Apply the fitted model to the test data: hard labels and per-class probabilities.
y_pred = logReg.predict(X_test_enc)
y_probs = logReg.predict_proba(X_test_enc)
print(y_probs)

# Flag (1/0) the rows where the third probability column is at least as large
# as (resp. strictly larger than) each of the first two columns.
best_of_first_two = y_probs[:, :2].max(axis=1)
y_pred_prob = (y_probs[:, 2] >= best_of_first_two).astype(int)
y_pred_prob_strict = (y_probs[:, 2] > best_of_first_two).astype(int)
print("Comparaison avec des seuils strict ou non\n",y_pred_prob,"\n",y_pred_prob_strict)

In [None]:
# Evaluate the model with a confusion matrix.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same information as a labelled table (rows: true class, columns: predicted class).
df_cm = pd.crosstab(y_test.to_numpy().ravel(), y_pred, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Manual accuracy = correctly classified samples (the diagonal) / all samples.
# np.trace sums the whole diagonal, so this no longer assumes exactly three
# classes (the previous cm[0,0]+cm[1,1]+cm[2,2] raised on a smaller matrix and
# was silently wrong on a larger one).
score = np.trace(cm) / cm.sum()
print("Score manuel:",score)
print("Score du classifier:", logReg.score(X_test_enc, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))

In [None]:
# AUC check and visualisation with the ROC (Receiver Operating Characteristic)
# curve: one curve per class, the given class being the positive label.
# assumes column k of y_probs holds the probability of class k+1 — TODO confirm
# against logReg.classes_
fpr1, tpr1, seuils1 = roc_curve(y_test, y_probs[:, 0], pos_label=1)
fpr2, tpr2, seuils2 = roc_curve(y_test, y_probs[:, 1], pos_label=2)
fpr3, tpr3, seuils3 = roc_curve(y_test, y_probs[:, 2], pos_label=3)
roc_auc1, roc_auc2, roc_auc3 = auc(fpr1, tpr1), auc(fpr2, tpr2), auc(fpr3, tpr3)

# Plot the ROC curves, with each AUC shown in the legend.
fig = go.Figure()
# Diagonal = reference line of a random classifier.
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name="AUC aléatoire=0.5"))
for class_label, class_fpr, class_tpr, class_auc in (
    (1, fpr1, tpr1, roc_auc1),
    (2, fpr2, tpr2, roc_auc2),
    (3, fpr3, tpr3, roc_auc3),
):
    fig.add_trace(
        go.Scatter(
            x=class_fpr,
            y=class_tpr,
            mode='lines',
            name=f"AUC classe {class_label}={round(class_auc,2)}",
        )
    )
layout_options = dict(
    title='Courbe ROC',
    xaxis_title='Taux faux positifs',
    yaxis_title='Taux vrais positifs',
    legend_title="Classes",
    width=800,
    height=600,
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# One-vs-rest ROC curves for the multi-class model.
# y_test  : true classes, e.g. [1, 2, 3, 1, 3]
# y_probs : predicted probabilities, indexed as (sample, class column)
# assumes column i of y_probs corresponds to class_labels[i] — TODO confirm
# against logReg.classes_
class_labels = [1, 2, 3]
n_classes = len(class_labels)

# One indicator column per class, built from the TRUE labels.
y_test_bin = label_binarize(y_test, classes=class_labels)
# Former (misleading) name kept as an alias in case other cells still use it.
y_pred_bin = y_test_bin

fpr = {}
tpr = {}
roc_auc = {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Macro-average: interpolate every per-class curve on the union of the
# false-positive-rate grid points, then average the true-positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
macro_auc = auc(all_fpr, mean_tpr)

# Plot
fig = go.Figure()
# Diagonal = reference line of a random classifier.
fig.add_trace(go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Aléatoire',
    line=dict(dash='dash')
))
# Per-class ROC curves. The legend shows the real class label (1..3) rather
# than the 0-based column index, consistent with the previous figure.
for i, class_label in enumerate(class_labels):
    fig.add_trace(go.Scatter(
        x=fpr[i],
        y=tpr[i],
        mode='lines',
        name=f"Classe {class_label} (AUC = {roc_auc[i]:.2f})"
    ))
# Macro-average curve (it was computed before but never displayed).
fig.add_trace(go.Scatter(
    x=all_fpr,
    y=mean_tpr,
    mode='lines',
    name=f"Macro-moyenne (AUC = {macro_auc:.2f})",
    line=dict(dash='dot')
))
# Layout
fig.update_layout(
    title="Courbes ROC multi-classes (One-vs-Rest)",
    xaxis_title="Taux de faux positifs",
    yaxis_title="Taux de vrais positifs",
    legend_title="Classes",
    width=800,
    height=600
)
fig.show()