# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd

### classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

# ### graphical plotly basics
# import plotly.graph_objects as go
# import plotly.express as px
# # for jupyter notebook display management
# import plotly.io as pio
# pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the raw dataset registered under the 'breast_cancer_data' config key.
# NOTE(review): sep/index_col are presumably forwarded to a CSV reader — confirm in smartcheck.
df_bc_raw = dfc.load_dataset_from_config('breast_cancer_data', sep=',', index_col=0)

# Fail fast: every later cell needs `df_bc`. Silently skipping its definition would
# surface as a NameError further down, or worse, reuse a stale `df_bc` left in the kernel.
if not isinstance(df_bc_raw, pd.DataFrame):
    raise TypeError(
        "load_dataset_from_config('breast_cancer_data') returned "
        f"{type(df_bc_raw).__name__}; expected a pandas DataFrame"
    )

display(df_bc_raw.head())
dfc.log_general_info(df_bc_raw)

# Print the duplicates index map only when the two reported counts differ.
# NOTE(review): exact meaning of the two counts is defined in smartcheck — confirm.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_bc_raw)
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_bc_raw))

# Work on a copy with normalized column names; the raw frame is kept untouched.
df_bc = dfc.normalize_column_names(df_bc_raw)
display(df_bc.head())

## 2.2 Data quality refinement

In [None]:
# Remove the "unnamed_32" column (presumably an empty artifact of the source file — confirm).
# errors="ignore" keeps this cell idempotent: re-running it after the column is already
# gone no longer raises KeyError.
df_bc = df_bc.drop(columns="unnamed_32", errors="ignore")

# 2. Data Classification

## 2.1 Separation of features and target variable

In [None]:
# Split the frame into the explanatory variables (features) and the variable to predict (target)
target = df_bc["diagnosis"]
data = df_bc.drop(columns="diagnosis")

# Preview both parts, features first
display(data.head())
display(target.head())

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=123,
)

## 2.2 Decision Tree Classification (entropy)

In [None]:
# Define the model: entropy split criterion, depth capped at 4, fixed seed
clfDT = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=123,
)

# Train it on the training split (kept as the last expression so the fitted estimator is displayed)
clfDT.fit(X_train, y_train)

In [None]:
# Evaluate the entropy model on the data it was trained on
train_score_entropy = clfDT.score(X_train, y_train)
print("Score calculé par le modèle:", train_score_entropy)

In [None]:
# Predict the test-set labels with the entropy-based tree.
# NOTE: `y_pred` is reassigned later by the gini section, so the evaluation cell below
# must be run before that section to report on this model.
y_pred = clfDT.predict(X_test)

In [None]:
# Confusion matrix of the test-set predictions: raw array first, then a labelled table
cm = confusion_matrix(y_test, y_pred)
print(cm)
df_cm = pd.crosstab(y_test, y_pred, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Evaluate the model on the test data.
# Accuracy rebuilt by hand: correctly classified samples (the diagonal) over all samples.
score = cm.trace() / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfDT.score(X_test, y_test))
# sep="" stops print() from inserting a space right after the "\n", which would
# shift the report's header line one column out of alignment with its rows.
print("Rapport de classification complet:\n", classification_report(y_test, y_pred), sep="")

In [None]:
# Rank the features by the importance the entropy tree assigned to them
df_feats = pd.DataFrame(
    {'Importance': clfDT.feature_importances_},
    index=list(data.columns),
).sort_values(by='Importance', ascending=False)

# Show the eight most important features
df_feats.head(8)

## 2.3 Decision Tree Classification (gini impurity)

In [None]:
# Define the model: gini split criterion, depth capped at 4, fixed seed
clfDT_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=226,
)

# Train it on the training split (kept as the last expression so the fitted estimator is displayed)
clfDT_gini.fit(X_train, y_train)

In [None]:
# Evaluate the gini model on the data it was trained on
train_score_gini = clfDT_gini.score(X_train, y_train)
print("Score calculé par le modèle:", train_score_gini)

In [None]:
# Predict the test-set labels with the gini-based tree.
# NOTE: this overwrites the `y_pred` produced earlier by the entropy model.
y_pred = clfDT_gini.predict(X_test)

In [None]:
# Confusion matrix of the test-set predictions: raw array first, then a labelled table
cm = confusion_matrix(y_test, y_pred)
print(cm)
df_cm = pd.crosstab(y_test, y_pred, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Evaluate the model on the test data.
# Accuracy rebuilt by hand: correctly classified samples (the diagonal) over all samples.
score = cm.trace() / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfDT_gini.score(X_test, y_test))
# sep="" stops print() from inserting a space right after the "\n", which would
# shift the report's header line one column out of alignment with its rows.
print("Rapport de classification complet:\n", classification_report(y_test, y_pred), sep="")

In [None]:
# Rank the features by the importance the gini tree assigned to them
df_feats = pd.DataFrame(
    {'Importance': clfDT_gini.feature_importances_},
    index=list(data.columns),
).sort_values(by='Importance', ascending=False)

# Show the eight most important features
df_feats.head(8)