# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# from lce import LCEClassifier # not compatible python 3.12
import xgboost as xgb

# ### graphical plotly basics
# import plotly.graph_objects as go
# import plotly.express as px
# Plotly display management for Jupyter: select the "notebook" renderer as the
# default for any Plotly figure shown later in this session.
import plotly.io as pio
pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the raw Adult data set through the project helper (comma-separated source).
df_adult_raw = dfc.load_dataset_from_config('adult_data', sep=',')

# Fail fast: every later cell reads `df_adult`, so carrying on after a failed
# load would only surface further down as an unrelated NameError.
# NOTE(review): the original guard suggests the loader may return None (or a
# non-DataFrame) on failure - confirm against smartcheck.dataframe_common.
if not isinstance(df_adult_raw, pd.DataFrame):
    raise RuntimeError(
        "Could not load dataset 'adult_data': expected a pandas DataFrame, "
        f"got {df_adult_raw!r}"
    )

# Log general information, then duplicate / missing-value diagnostics.
dfc.log_general_info(df_adult_raw)
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_adult_raw)
# Only print the duplicate index map when the two counts returned above differ.
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_adult_raw))

# Work on a frame with normalized column names; keep the raw frame untouched.
df_adult = dfc.normalize_column_names(df_adult_raw)
display(df_adult.head())

In [None]:
# Restrict to the numeric columns once, then show their summary statistics
# followed by their pairwise correlation matrix.
numeric_part = df_adult.select_dtypes(include=np.number)

df_adult_desc = numeric_part.describe()
display(df_adult_desc)

df_adult_cr = numeric_part.corr()
display(df_adult_cr)

## 2.2 Data quality refinement

In [None]:
# Keep a backup of the frame as loaded, then drop duplicated rows
# (the backup is an independent copy, so deduplicating from it is equivalent).
df_adult_orig = df_adult.copy()
df_adult = df_adult_orig.drop_duplicates()

In [None]:
# Treat the '?' placeholder as a missing value across the whole frame.
df_adult = df_adult.replace('?', np.nan)

# Collapse individual countries into broad regions. Spellings such as
# 'Columbia' or 'Holand-Netherlands' are kept exactly as listed - presumably
# they match the raw labels of the data set. Countries absent from the lists
# are left unchanged.
region_members = {
    'Asia': [
        'Cambodia', 'China', 'Hong', 'India', 'Iran', 'Japan', 'Laos',
        'Philippines', 'Taiwan', 'Thailand', 'Vietnam',
    ],
    'Center & South America': [
        'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador',
        'Guatemala', 'Haiti', 'Honduras', 'Jamaica', 'Mexico', 'Nicaragua',
        'Peru', 'Puerto-Rico', 'Trinadad&Tobago', 'South',
    ],
    'Europe': [
        'England', 'France', 'Germany', 'Greece', 'Holand-Netherlands',
        'Hungary', 'Ireland', 'Italy', 'Poland', 'Portugal', 'Scotland',
        'Yugoslavia',
    ],
    'Canada&USA': ['United-States', 'Canada'],
}
# Invert to a flat {country: region} lookup so a single replace does the job.
country_to_region = {
    country: region
    for region, members in region_members.items()
    for country in members
}
df_adult['native_country'] = df_adult['native_country'].replace(country_to_region)

# 3. Data Classification

## 3.1 General Analysis variable/target Separation

In [None]:
# Split the explanatory variables (features) from the variable to predict (target).
features = df_adult.drop(columns=['income'])
# Binary target as a plain list of ints: 1 when income is '>50K', 0 for anything else.
target = (df_adult['income'] == '>50K').astype(int).tolist()
# One-hot encode the categorical feature columns.
features_matrix = pd.get_dummies(features)

In [None]:
# Hold out 10% of the rows as a validation set, then split the remainder
# 80/20 into training and test sets (fixed seeds keep the splits reproducible).
X, X_valid, y, y_valid = train_test_split(
    features_matrix, target, test_size=0.1, random_state=1
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# Wrap each split in a DMatrix for the native XGBoost API.
train, test, valid = (
    xgb.DMatrix(split_X, split_y)
    for split_X, split_y in ((X_train, y_train), (X_test, y_test), (X_valid, y_valid))
)

# Report the shape of each feature matrix.
for caption, split_frame in (
    ("Train Set:", X_train),
    ("Test Set:", X_test),
    ("Valid Set:", X_valid),
):
    print(caption, split_frame.shape)

## 3.2 eXtreme Gradient Boosting (XGBoost)
- Minimizes the cost (loss) function by iteratively searching for its (local) minimum
- parallel optimized processing
- the tuning logic is to decrease `learning_rate` and increase the number of trees, while keeping the computation time reasonable

In [None]:
# Define and train the model through the native API: xgb.train returns the
# low-level Booster directly, without an XGBClassifier wrapper.
params = {
    'booster': 'gbtree',
    'learning_rate': 0.01,
    'objective': 'binary:logistic'
}
# Early stopping watches the LAST entry of `evals`. Monitor the held-out
# validation set rather than the test set: stopping on `test` and then
# reporting metrics on `test` would make those metrics optimistically biased,
# and would leave the `valid` DMatrix built earlier unused.
boost_xgb = xgb.train(
    params,
    train,
    num_boost_round=700,
    early_stopping_rounds=15,
    evals=[(train, 'train'), (valid, 'eval')],
    # Log the metrics every 50 rounds instead of flooding the output with
    # one line per boosting round.
    verbose_eval=50,
)

In [None]:
# Evaluate the model on the training data.
# NB: we hold the bare Booster (not an XGBClassifier), so there is no
# ready-made score method; Booster.eval returns the evaluation result as text.
train_eval_report = boost_xgb.eval(train)
print("Evaluation par le booster:", train_eval_report)

In [None]:
# Plot the 15 most important features under each available importance metric
# (the author notes that gain is a very reliable measure).
types = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
# The loop variable must not be called `type`: at notebook top level that
# would overwrite the builtin `type` for every cell executed afterwards.
for importance_type in types:
    xgb.plot_importance(
        boost_xgb,
        max_num_features=15,
        importance_type=importance_type,
        title='importance: ' + importance_type,
    )

In [None]:
# Cross-validation on the training data: 3 folds, up to 100 boosting rounds,
# stopping once 15 consecutive rounds bring no improvement.
cv_settings = dict(
    num_boost_round=100,
    nfold=3,
    early_stopping_rounds=15,
)
bst_cv = xgb.cv(params, train, **cv_settings)
display("Best CV:", bst_cv)

In [None]:
# Predict on the test data.
# The native Booster.predict uses the FULL model by default, even when early
# stopping identified an earlier best round, so restrict the prediction to the
# trees up to (and including) the best iteration when that information exists.
best_iteration = getattr(boost_xgb, 'best_iteration', None)
if best_iteration is None:
    y_pred = boost_xgb.predict(test)
else:
    y_pred = boost_xgb.predict(test, iteration_range=(0, best_iteration + 1))
# Threshold the predicted scores at 0.5 to obtain hard 0/1 labels.
y_pred_s = pd.Series(np.where(y_pred >= 0.5, 1, 0))

In [None]:
# Confusion matrix on the predicted test data: raw array first, then a
# labelled cross-tabulation for easier reading.
cm = confusion_matrix(y_test, y_pred_s)
print(cm)
df_cm = pd.crosstab(y_test, y_pred_s, rownames=['real'], colnames=['predicted'])
display(df_cm)

# Evaluate the model on the test data.
# NB: we hold the bare Booster (not an XGBClassifier), so no ready-made score
# is available; accuracy is rebuilt as the diagonal sum (correct predictions)
# divided by the total number of samples.
correct_predictions = np.trace(cm)
score = correct_predictions / cm.sum()
print("Score reconstruit manuellement:",score)
print("Evaluation par le booster:",boost_xgb.eval(test))
full_report = classification_report(y_test, y_pred_s)
print("Rapport de classification complet:\n", full_report)


## 3.3 Local Cascade Ensemble (LCE)

In [None]:
# # Definition et Entrainement du modèle
# clf_LCE = LCEClassifier(n_estimators=2, n_jobs=-1, random_state=0)
# clf_LCE.fit(X_train, y_train)

In [None]:
# # Evaluation du modèle sur les données d'entrainement
# print("Score calculé par le modèle:", clf_LCE.score(X_train, y_train))

In [None]:
# # Prédiction du modèle sur les données de test
# y_pred = clf_LCE.predict(X_test)

In [None]:
# # Matrice de confusion sur les données de test prédites
# cm = confusion_matrix(y_test,y_pred)
# print(cm)
# df_cm = pd.crosstab(y_test, y_pred, rownames=['real'], colnames=['predicted'])
# display(df_cm)

# # Evaluation du modèle sur les données de test
# score = sum(cm[i][i] for i in range(0, cm.shape[0]))/cm.sum()
# print("Score reconstruit manuellement:",score)
# print("Score calculé par le modèle:", clf_LCE.score(X_test, y_test))
# print("Rapport de classification complet:\n", classification_report(y_test, y_pred))