# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# ### graphical plotly basics
# import plotly.graph_objects as go
# import plotly.express as px
# # for jupyter notebook display management
# import plotly.io as pio
# pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the raw letters data set through the project's config-driven loader.
df_letters_raw = dfc.load_dataset_from_config('letters_data', sep=',')

# isinstance() is already False for None, so a single check covers both the
# "nothing loaded" and the "unexpected type" cases.
if isinstance(df_letters_raw, pd.DataFrame):
    dfc.log_general_info(df_letters_raw)
    # Only print the duplicates index map when the two returned counts differ.
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_letters_raw)
    if nb_first != nb_total:
        print(dfc.duplicates_index_map(df_letters_raw))
    # Work from here on with the frame returned by the column-name normalizer.
    df_letters = dfc.normalize_column_names(df_letters_raw)
    display(df_letters.head())

In [None]:
# Summary statistics and pairwise correlations, restricted to numeric columns.
# The numeric sub-frame is selected once and shared by both computations.
df_letters_num = df_letters.select_dtypes(include=np.number)

df_letters_desc = df_letters_num.describe()
display(df_letters_desc)

df_letters_cr = df_letters_num.corr()
display(df_letters_cr)

## 2.2 Data quality refinement

In [None]:
# Keep an untouched backup of the data, then continue with a de-duplicated
# frame derived from that backup (the backup itself is never modified).
df_letters_orig = df_letters.copy()
df_letters = df_letters_orig.drop_duplicates()

# 3. Data Classification

## 3.1 General Analysis variable/target Separation

In [None]:
# Split the frame into the variable to predict (target: the 'lettr' column)
# and the explanatory variables (features: every other column).
target = df_letters['lettr']
data = df_letters.drop(columns='lettr')

In [None]:
# Split into training data (80% of the rows) and test data (the remainder).
# The fixed random_state keeps the split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=66,
)
print("Train Set:", X_train.shape)
print("Test Set:", X_test.shape)

## 3.2 Decision Tree Classifier

In [None]:
# Define and train the decision tree model (depth limited to 5 levels).
# random_state is pinned to the same seed as the train/test split: the tree
# permutes features randomly at each split, so an unseeded fit can give
# different scores from one run to the next.
clfDTC = DecisionTreeClassifier(max_depth=5, random_state=66)
clfDTC.fit(X_train, y_train)

In [None]:
# Evaluate the decision tree on the training data it was fitted on
# (an optimistic figure; compare it with the test-set score to spot overfitting).
print("Score calculé par le modèle:", clfDTC.score(X_train, y_train))

In [None]:
# Predict the labels of the test data with the decision tree
y_pred = clfDTC.predict(X_test)

In [None]:
# Confusion matrix of the test-set predictions: raw array first, then the same
# counts as a labelled table (rows = real classes, columns = predicted classes).
cm = confusion_matrix(y_test, y_pred)
print(cm)
df_cm = pd.crosstab(
    y_test.to_numpy().ravel(),
    y_pred,
    rownames=['real'],
    colnames=['predicted'],
)
display(df_cm)

# Evaluate the model on the test data. Correct predictions sit on the diagonal
# of the confusion matrix, so accuracy = trace / total number of samples.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfDTC.score(X_test, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))

## 3.3 Adaptive Boost Classifier over Tree 
- Concentrates on hard-to-classify samples, which also makes it sensitive to outliers and noisy labels
- Can suffer from overfitting when the number of estimators is high
- Mainly reduces the bias (and can also reduce the variance)

In [None]:
# Define and train the AdaBoost model: 400 boosting rounds, using the decision
# tree defined earlier (clfDTC) as the base estimator.
# random_state is pinned to the same seed as the train/test split so that the
# boosted ensemble, and therefore the reported scores, are reproducible.
clfABC = AdaBoostClassifier(estimator=clfDTC, n_estimators=400, random_state=66)
clfABC.fit(X_train, y_train)

In [None]:
# Evaluate the AdaBoost model on the training data it was fitted on
# (an optimistic figure; compare it with the test-set score to spot overfitting).
print("Score calculé par le modèle:", clfABC.score(X_train, y_train))

In [None]:
# Predict the labels of the test data with the AdaBoost model
y_pred = clfABC.predict(X_test)

In [None]:
# Confusion matrix of the test-set predictions: raw array first, then the same
# counts as a labelled table (rows = real classes, columns = predicted classes).
cm = confusion_matrix(y_test, y_pred)
print(cm)
df_cm = pd.crosstab(
    y_test.to_numpy().ravel(),
    y_pred,
    rownames=['real'],
    colnames=['predicted'],
)
display(df_cm)

# Evaluate the model on the test data. Correct predictions sit on the diagonal
# of the confusion matrix, so accuracy = trace / total number of samples.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfABC.score(X_test, y_test))
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))

## 3.4 Bootstrap AGGregatING (Bagging) Classifier
- Can be used for both regression and classification, as long as the base estimator has a high variance (i.e. it changes a lot when the training data changes slightly)
  - Well suited to tree estimators and neural networks
  - Poorly suited to stable estimators such as KNN and linear regression
- Does not tend to overfit when the number of estimators is high
- Reduces the variance

In [None]:
# Define and train the Bagging model: 1000 estimators (scikit-learn's default
# base estimator, since none is given), with the out-of-bag score enabled.
# random_state is pinned to the same seed as the train/test split: the
# bootstrap samples are drawn at random, so an unseeded fit gives different
# test and out-of-bag scores from one run to the next.
clfBC = BaggingClassifier(n_estimators=1000, oob_score=True, random_state=66)
clfBC.fit(X_train, y_train)

In [None]:
# Evaluate the Bagging model on the training data it was fitted on
# (an optimistic figure; compare it with the test-set and out-of-bag scores).
print("Score calculé par le modèle:", clfBC.score(X_train, y_train))

In [None]:
# Predict the labels of the test data with the Bagging model
y_pred = clfBC.predict(X_test)

In [None]:
# Confusion matrix of the test-set predictions: raw array first, then the same
# counts as a labelled table (rows = real classes, columns = predicted classes).
cm = confusion_matrix(y_test, y_pred)
print(cm)
df_cm = pd.crosstab(
    y_test.to_numpy().ravel(),
    y_pred,
    rownames=['real'],
    colnames=['predicted'],
)
display(df_cm)

# Evaluate the model on the test data. Correct predictions sit on the diagonal
# of the confusion matrix, so accuracy = trace / total number of samples.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfBC.score(X_test, y_test))
# Out-of-bag score, available because the model was built with oob_score=True.
print("Score Out of Bag :", clfBC.oob_score_)
print("Rapport de classification complet:\n", classification_report(y_test, y_pred))