# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np
from typing import cast

### classification
from sklearn import neighbors
from sklearn import datasets
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

### graphical plotly basics
import plotly.graph_objects as go
# import plotly.express as px
# Select the "notebook" renderer as Plotly's default so figures are displayed in the Jupyter notebook
import plotly.io as pio
pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the digits data set; cast() only tells static type checkers the result is a Bunch
digits = cast(Bunch, datasets.load_digits())

## 2.2 Data quality refinement

In [None]:
# None

# 2. Data Classification

## 2.1 General Analysis variable/target Separation

In [None]:
# Split the data set into explanatory variables (features) and the variable to predict (target)
X_digits = pd.DataFrame(digits.data)
y_digits = digits.target

# Preview the first feature rows, then list the distinct target values
display(X_digits.head())
display(np.unique(y_digits))

In [None]:
# Split the samples into training data (80 %) and test data, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_digits,
    y_digits,
    train_size=0.8,
    random_state=126,
)

## 2.2 KNN Classification (Minkowski)

In [None]:
# Define the KNN model (7 neighbours, Minkowski distance) and train it
clfKNN = neighbors.KNeighborsClassifier(
    metric='minkowski',
    n_neighbors=7,
)
clfKNN.fit(X_train, y_train)

In [None]:
# Evaluate the model on the training data
print(
    "Score calculé par le modèle:",
    clfKNN.score(X_train, y_train),
)

In [None]:
# Predict the classes of the test data with the trained model
y_test_pred = clfKNN.predict(X_test)

In [None]:
# Confusion matrix of the predictions on the test data,
# first as a raw array, then as a labelled cross-tabulation
cm = confusion_matrix(y_test, y_test_pred)
print(cm)
df_cm = pd.crosstab(y_test, y_test_pred, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Evaluate the model on the test data.
# Manual accuracy: correctly classified samples (the diagonal of the
# confusion matrix) divided by the total number of samples.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfKNN.score(X_test, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_test_pred))

## 2.3 KNN Classification (Manhattan)

In [None]:
# Define the KNN model (5 neighbours, Manhattan distance) and train it
clfKNN_man = neighbors.KNeighborsClassifier(
    metric='manhattan',
    n_neighbors=5,
)
clfKNN_man.fit(X_train, y_train)

In [None]:
# Evaluate the model on the training data
print(
    "Score calculé par le modèle:",
    clfKNN_man.score(X_train, y_train),
)

In [None]:
# Predict the classes of the test data with the trained model
y_test_pred = clfKNN_man.predict(X_test)

In [None]:
# Confusion matrix of the predictions on the test data,
# first as a raw array, then as a labelled cross-tabulation
cm = confusion_matrix(y_test, y_test_pred)
print(cm)
df_cm = pd.crosstab(y_test, y_test_pred, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Evaluate the model on the test data.
# Manual accuracy: correctly classified samples (the diagonal of the
# confusion matrix) divided by the total number of samples.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfKNN_man.score(X_test, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_test_pred))

In [None]:
# Compare the test-set score of KNN across distance metrics for k = 1..40.
# scores[metric][k - 1] holds the score obtained with k neighbours.
scores = {
    'minkowski': [],
    'manhattan': [],
    'chebyshev': [],
}
for k in range(1, 41):
    for metric, metric_scores in scores.items():
        # Use a throwaway estimator so the models and predictions built in
        # earlier cells (clfKNN_man, y_test_pred) are not overwritten.
        knn = neighbors.KNeighborsClassifier(n_neighbors=k, metric=metric)
        knn.fit(X_train, y_train)
        # score() runs the prediction on X_test itself, so no separate
        # predict() call is needed here.
        metric_scores.append(knn.score(X_test, y_test))
    # One progress line per k: the latest score of each metric, in dict order
    print(f"k={k}:", *(metric_scores[-1] for metric_scores in scores.values()))

In [None]:
# Plot the score as a function of K for each distance metric with Plotly
fig = go.Figure()
for metric, metric_scores in scores.items():
    fig.add_trace(go.Scatter(
        # K starts at 1; derive the axis from the number of recorded scores
        # so the plot stays in sync with the range used to fill `scores`.
        x=np.arange(1, len(metric_scores) + 1),
        y=metric_scores,
        mode='lines',
        name=f'Score {metric}',
        line=dict(dash='dash', width=3)
    ))
fig.update_layout(
    title="Courbes Score en fonction de K pour les modèles KNN",
    xaxis_title="valeur de K",
    yaxis_title="Score",
    legend_title="Métrique",
    width=800,
    height=600
)
fig.show()