# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### classification
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# ### graphical plotly basics
# import plotly.graph_objects as go
import plotly.express as px
# for jupyter notebook display management
import plotly.io as pio
pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the raw dataset through the project helper, using the 'diabete_data'
# configuration key and a comma separator.
df_diab_raw = dfc.load_dataset_from_config('diabete_data', sep=',')

# Fail fast: every later cell needs `df_diab`. Without this guard a failed load
# would only surface further down as an unrelated NameError.
if not isinstance(df_diab_raw, pd.DataFrame):
    raise RuntimeError(
        "Le chargement du jeu de données 'diabete_data' a échoué : "
        f"DataFrame attendu, obtenu {type(df_diab_raw).__name__}"
    )

display(df_diab_raw.head())
dfc.log_general_info(df_diab_raw)
# NOTE(review): presumably the two counters differ only when duplicated rows
# exist -- confirm against smartcheck.dataframe_common.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_diab_raw)
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_diab_raw))
# Working frame used by the rest of the notebook (column names normalized by the helper).
df_diab = dfc.normalize_column_names(df_diab_raw)
display(df_diab.head())

In [None]:
# Descriptive statistics and pairwise correlations, restricted to numeric columns.
numeric_cols = df_diab.select_dtypes(include=np.number)
df_diab_desc = numeric_cols.describe()
df_diab_cr = numeric_cols.corr()
display(df_diab_desc)
display(df_diab_cr)

## 2.2 Data quality refinement

In [None]:
# Keep an untouched copy of the normalized frame, then continue with a
# de-duplicated version derived from that copy.
df_diab_orig = df_diab.copy()
df_diab = df_diab_orig.drop_duplicates()

# 2. Data Classification

## 2.1 General Analysis variable/target Separation

In [None]:
# Split the frame into the variable to predict (target) and the explanatory
# variables (features): every column except 'outcome'.
target = df_diab['outcome']
data = df_diab.drop(columns='outcome')

In [None]:
# Hold out 30% of the rows as a test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=4,
)

In [None]:
# Individual base models used by the meta-models (voting / stacking) below.
clf_KNN = KNeighborsClassifier(n_neighbors=3)
clf_LR = LogisticRegression(max_iter=1000)
clf_RFC = RandomForestClassifier(random_state=123)

## 2.2 Voting Classifier
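- Aggregates heterogeneous classifiers (models) into a single predictor
- Hard voting (majority of the predicted labels) or soft voting (average of the predicted probabilities), optionally with a weight per estimator, decides the final class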
- /!\ High CPU consumption

### 2.2.1 Sans hyperparamètres

In [None]:
# Define the hard-voting ensemble over the three base models and train it.
voting_members = [('KNN', clf_KNN), ('RFC', clf_RFC), ('LR', clf_LR)]
clf_Voting = VotingClassifier(estimators=voting_members, voting='hard')
clf_Voting.fit(X_train, y_train)

In [None]:
# 3-fold shuffled cross-validation on the training set; for each model, print
# the mean and standard deviation of accuracy and F1 across folds.
cv3 = KFold(n_splits=3, shuffle=True, random_state=111)
candidates = {
    'KNN': clf_KNN,
    'Random Forest': clf_RFC,
    'Logistic Regression': clf_LR,
    'Voting Classifier': clf_Voting,
}
for label, clf in candidates.items():
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=['accuracy', 'f1'])
    acc, f1 = scores['test_accuracy'], scores['test_f1']
    print(
        f"[{label}]: \n Accuracy: {acc.mean().round(2)} (+/- {acc.std().round(2)}) "
        f"F1 score: {f1.mean().round(2)} (+/- {f1.std().round(2)})"
    )

In [None]:
# Predict labels for the held-out test set with the fitted voting classifier.
y_pred = clf_Voting.predict(X_test)

In [None]:
# Confusion matrix and evaluation of the voting model on the test data.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Give the predictions y_test's index so crosstab pairs each prediction with its true label.
y_pred_indexed = pd.Series(y_pred, index=y_test.index)
df_cm = pd.crosstab(y_test, y_pred_indexed, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Accuracy rebuilt by hand: diagonal of the confusion matrix over the total
# count, compared with the model's own score().
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clf_Voting.score(X_test, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))

### 2.2.2 Avec hyperparamètres

In [None]:
# Base models and voting ensemble to be tuned.
# The estimator names are lowercase so that they match the '<name>__<param>'
# prefixes used in the grid below. The ensemble was previously built with
# 'KNN'/'RFC'/'LR' and the mismatch was worked around by re-declaring
# `estimators` inside the grid.
clf_KNN = KNeighborsClassifier(n_neighbors=3)
clf_RFC = RandomForestClassifier(random_state=123)
clf_LR = LogisticRegression(max_iter=1000)
clf_Voting = VotingClassifier(
    estimators=[('knn', clf_KNN), ('rfc', clf_RFC), ('lr', clf_LR)],
    voting='hard',
)
# No standalone fit here: GridSearchCV clones the estimator and fits the clones
# itself, so fitting `clf_Voting` first would only burn CPU.

# Grid syntax: '<estimator name>__<parameter>' targets one member of the ensemble.
parameters = {
    'knn__n_neighbors': [5, 9],           # applies to the KNN member only
    'rfc__n_estimators': [20, 100, 200],  # applies to the random forest member only
    'lr__C': [0.01, 0.1, 1],              # applies to the logistic regression member only
}

grid_clf_Voting = GridSearchCV(estimator=clf_Voting, param_grid=parameters, cv=5)

In [None]:
# Fit the model on the training data for every hyperparameter combination of the grid.
result_grid = grid_clf_Voting.fit(X_train, y_train)
# Once the search is done, the best estimator can be retrieved; printing it shows its parameters.
best_clf_Voting = result_grid.best_estimator_
print("Meilleure combinaison de paramètres trouvée pour les données d'entrainement:",best_clf_Voting)

In [None]:
# Visualize the grid-search results with Plotly Express: one bar per candidate,
# positioned by rank, coloured by the random forest's n_estimators.
hover_cols = ['param_knn__n_neighbors', 'param_rfc__n_estimators', 'param_lr__C']

df_result_grid = pd.DataFrame.from_dict(result_grid.cv_results_)
# The parameter columns of cv_results_ can come back with object dtype
# (depends on the scikit-learn version -- TODO confirm for the pinned version).
# Plotly Express would then treat the colour as categorical and ignore the
# continuous colour scale, so the column is cast to a numeric dtype.
df_result_grid['param_rfc__n_estimators'] = pd.to_numeric(
    df_result_grid['param_rfc__n_estimators']
)
display(df_result_grid.head())

fig = px.bar(
    df_result_grid,
    x="rank_test_score",
    y="mean_test_score",
    color="param_rfc__n_estimators",
    color_continuous_scale="plasma_r",
    text="rank_test_score",
    # Let px attach the hover columns: it slices them per trace, so each bar
    # keeps its own row even if the figure ends up with several traces.
    custom_data=hover_cols,
)
fig.update_layout(
    # The colour encodes n_estimators; the former "kernel" wording was a
    # leftover from another notebook.
    title="Distribution des scores par rang avec coloration selon n_estimators (Random Forest)",
    xaxis_title="Rang",
    yaxis_title="Mean Test Score",
    width=1000,
    height=600,
    coloraxis_colorbar=dict(
        title="n_estimators",
        tickvals=[20, 100, 200],
        ticktext=["20", "100", "200"],
    )
)
# customdata indices follow the order of `hover_cols`.
fig.update_traces(
    hovertemplate=
        "Score moyen: %{y:.4f}<br>"
        "KNN/Nb Neighbors: %{customdata[0]}<br>"
        "RFC/Nb Estimators: %{customdata[1]}<br>"
        "LR/C: %{customdata[2]}<br>"
        "<extra></extra>"
)

fig.show()

## 2.3 Stacking
- A meta-model trained on the predictions of L base models over N samples: the input of the stacking model is those predictions only (unless it is explicitly enriched, e.g. with `passthrough=True` to also feed the original features)
- /!\ High CPU consumption

In [None]:
# Define the stacking ensemble (three base models, logistic regression as the
# final estimator, 5-fold internal CV) and train it.
stacking_members = [('KNN', clf_KNN), ('RFC', clf_RFC), ('LR', clf_LR)]
clf_Stacking = StackingClassifier(estimators=stacking_members, final_estimator=clf_LR, cv=5)
clf_Stacking.fit(X_train, y_train)

In [None]:
# 3-fold shuffled cross-validation on the training set; for each model, print
# the mean and standard deviation of accuracy and F1 across folds.
cv3 = KFold(n_splits=3, shuffle=True, random_state=111)
labelled_models = [
    ('KNN', clf_KNN),
    ('Random Forest', clf_RFC),
    ('Logistic Regression', clf_LR),
    ('Stacking Classifier', clf_Stacking),
]
for label, clf in labelled_models:
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=['accuracy', 'f1'])
    acc_mean = scores['test_accuracy'].mean().round(2)
    acc_std = scores['test_accuracy'].std().round(2)
    f1_mean = scores['test_f1'].mean().round(2)
    f1_std = scores['test_f1'].std().round(2)
    print(
        f"[{label}]: \n Accuracy: {acc_mean} (+/- {acc_std}) "
        f"F1 score: {f1_mean} (+/- {f1_std})"
    )

In [None]:
# Predict labels for the held-out test set with the fitted stacking classifier.
y_pred = clf_Stacking.predict(X_test)

In [None]:
# Confusion matrix and evaluation of the stacking model on the test data.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Both inputs are passed as plain arrays, so crosstab pairs them by position.
y_true_values = y_test.to_numpy().ravel()
df_cm = pd.crosstab(y_true_values, y_pred, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Accuracy rebuilt by hand: diagonal of the confusion matrix over the total
# count, compared with the model's own score().
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clf_Stacking.score(X_test, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))