In [18]:
import os
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from imblearn.pipeline import make_pipeline

### Data 

In [19]:
DATA_DIRECTORY = "/home/fehrdelt/data_ssd/data/clinical_data/"

In [20]:
X = pd.read_csv(DATA_DIRECTORY+"cleaned_dataframe.csv")

X.head()

Unnamed: 0,age,hemocue_initial,fracas_du_bassin,catecholamines,pression_arterielle_systolique_PAS_arrivee_du_smur,pression_arterielle_diastolique_PAD_arrivee_du_smur,score_glasgow_initial,score_glasgow_moteur_initial,anomalie_pupillaire_prehospitalier,frequence_cardiaque_FC_arrivee_du_smur,arret_cardio_respiratoire_massage,penetrant_objet,ischemie_du_membre,hemorragie_externe,amputation
0,79.0,,0.0,0.0,190.0,103.0,15.0,6.0,0.0,137.0,0.0,0.0,0.0,0.0,0.0
1,52.0,,0.0,0.0,87.0,49.0,15.0,6.0,0.0,56.0,0.0,0.0,0.0,0.0,0.0
2,23.0,,0.0,0.0,100.0,60.0,15.0,6.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
3,42.0,13.1,0.0,0.0,101.0,64.0,14.0,6.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0
4,34.0,15.8,0.0,0.0,110.0,71.0,15.0,6.0,0.0,107.0,0.0,0.0,0.0,0.0,0.0


In [21]:
Y = pd.read_csv(DATA_DIRECTORY+"cleaned_dataframe_outcome.csv")
Y.head()


Unnamed: 0,neurochir+pic
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [22]:
# !!! A executer qu'une seule fois !! 
# remove rows from X and Y where Y has nan


nan_indexes = Y.loc[pd.isna(Y["neurochir+pic"]), :].index # indexes where there is a nan value.
print(nan_indexes)

Y = Y.dropna()
X = X.drop(nan_indexes)

Index([68, 99, 111, 114, 133, 135, 145, 149, 155, 156, 166], dtype='int64')


In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [24]:
print(np.sum(list(Y_test["neurochir+pic"])))

3.0


In [25]:
train_data = lgb.Dataset(X_train, label=Y_train, categorical_feature=["fracas_du_bassin", "catecholamines", "anomalie_pupillaire_prehospitalier", "arret_cardio_respiratoire_massage", "penetrant_objet", "ischemie_du_membre", "hemorragie_externe", "amputation"]) # categorical features

### LightGBM

In [26]:
# Define parameters for GBDT
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'num_leaves': 11,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}
 
# Train the GBDT model
gbm = lgb.train(params, train_data, num_boost_round=100)
 
# Make predictions on the test set
Y_pred = gbm.predict(X_test)
 
# Evaluate the model
report = classification_report(Y_test, (Y_pred > 0.5).astype(int))
print(report)

[LightGBM] [Info] Number of positive: 21, number of negative: 274
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.125102 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281
[LightGBM] [Info] Number of data points in the train set: 295, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.071186 -> initscore=-2.568606
[LightGBM] [Info] Start training from score -2.568606
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98        71
         1.0       0.50      0.33      0.40         3

    accuracy                           0.96        74
   macro avg       0.74      0.66      0.69        74
weighted avg       0.95      0.96      0.96        74



### Sklearn HistGradientBoostingClassifier + Imbalanced-learn RandomUnderSampler

In [27]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate


model = make_pipeline(

    RandomUnderSampler(random_state=0),
    HistGradientBoostingClassifier(random_state=0)

)

cv_results = cross_validate(

    model, X_train, Y_train, scoring="balanced_accuracy",
    return_train_score=True, return_estimator=True,
    n_jobs=-1

)

print(
    f"Balanced accuracy mean +/- std. dev.: "
    f"{cv_results['test_score'].mean():.3f} +/- "
    f"{cv_results['test_score'].std():.3f}"
)

  y = column_or_1d(y, warn=True)


Balanced accuracy mean +/- std. dev.: 0.000 +/- 0.000


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
