#### This notebook uses AutoML (AutoGluon) to predict the ground-truth pathology.
First, read the training data produced by the preprocessing notebook (preprocessed.ipynb).

In [9]:
import pandas as pd
# Load the preprocessed training split; the path is relative to the notebook's working directory.
train = pd.read_csv("../20043374/trainProcessed.csv")

In [3]:
# Inspect the training frame as loaded from the CSV.
train

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,INITIAL_EVIDENCE,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",0,IVRS ou virémie,fievre,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",0,VIH (Primo-infection),diaph,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",1,Pneumonie,expecto,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,34,"[['IVRS ou virémie', 0.23859396799565236], ['C...",1,IVRS ou virémie,douleurxx,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,36,"[['IVRS ou virémie', 0.23677812769175735], ['P...",0,IVRS ou virémie,toux,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025597,18,"[['Épiglottite', 0.28156957795466475], ['VIH (...",0,Épiglottite,fievre,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1025598,28,"[['Épiglottite', 0.3703962237298842], ['Laryng...",1,Épiglottite,fievre,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1025599,0,"[['Épiglottite', 0.13193905052537108], ['Laryn...",1,Épiglottite,stridor,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1025600,26,"[['Épiglottite', 0.3028258988138983], ['Laryng...",1,Épiglottite,stridor,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Drop the columns that won't be used

In [10]:
def preprocessing(df):
    """Drop the unused columns and cast every indicator column to bool.

    Args:
        df: DataFrame that contains the 'DIFFERENTIAL_DIAGNOSIS' and
            'INITIAL_EVIDENCE' columns.

    Returns:
        A new DataFrame without those two columns, in which every column
        other than 'AGE' and 'PATHOLOGY' has been cast to bool. The frame
        passed in is left unmodified.

    Raises:
        KeyError: If either of the two dropped columns is missing.
    """
    # PATHOLOGY is used as the ground truth here, and this part does not
    # involve the system's inquiry, so neither column is needed.
    df = df.drop(columns=['DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE'])
    # Cast the one-hot encoded columns to bool (simpler, and less memory
    # occupied when training). The column list is built once, in the frame's
    # own column order, so both sides of the assignment use the same ordering.
    flag_columns = [col for col in df.columns if col not in ('AGE', 'PATHOLOGY')]
    df[flag_columns] = df[flag_columns].astype(bool)
    return df

In [11]:
# Keep a copy of the differential-diagnosis column before preprocessing() drops it.
diff_diagnosis = train['DIFFERENTIAL_DIAGNOSIS'].copy()
# NOTE(review): `train` is overwritten with its preprocessed version, so re-running this
# cell without reloading the CSV raises KeyError (the dropped columns are already gone).
train = preprocessing(train)

In [26]:
# Inspect the frame after preprocessing.
train

Unnamed: 0,AGE,SEX,PATHOLOGY,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,18,False,IVRS ou virémie,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,21,False,VIH (Primo-infection),False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,19,True,Pneumonie,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
3,34,True,IVRS ou virémie,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,36,False,IVRS ou virémie,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025597,18,False,Épiglottite,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1025598,28,True,Épiglottite,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1025599,0,True,Épiglottite,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1025600,26,True,Épiglottite,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Store the data after processing

In [27]:
import pickle

# Persist the preprocessed frame and the saved differential-diagnosis column.
# Context managers ensure each file is flushed and closed once it is written.
with open('train.pickle', 'wb') as train_file:
    pickle.dump(train, train_file)
with open('diff_diagnosis.pickle', 'wb') as diff_file:
    pickle.dump(diff_diagnosis, diff_file)

In [1]:
import pickle
# Reload the preprocessed training frame from disk; the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook produced.
with open('train.pickle', 'rb') as train_file:
    train = pickle.load(train_file)

#### Train the model

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Training options: five model families, each given an empty hyperparameter dict,
# bagged over 5 folds, on one GPU, within a 3600 s time limit.
fit_options = dict(
    presets='medium_quality',
    hyperparameters={'XGB': {}, 'GBM': {}, 'RF': {}, 'XT': {}, 'FASTAI': {}},
    num_bag_folds=5,
    num_gpus=1,
    time_limit=3600,
)
# Multiclass predictor with PATHOLOGY as the label column, fitted on the training frame.
predictor = TabularPredictor(label='PATHOLOGY', problem_type='multiclass').fit(train, **fit_options)

No path specified. Models will be saved in: "AutogluonModels\ag-20240208_143450"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels\ag-20240208_143450"
AutoGluon Version:  1.0.0
Python Version:     3.10.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
CPU Count:          16
Memory Avail:       15.01 GB / 23.63 GB (63.5%)
Disk Space Avail:   108.21 GB / 453.84 GB (23.8%)
Train Data Rows:    1025602
Train Data Columns: 518
Label Column:       PATHOLOGY
Problem Type:       multiclass
Preprocessing data ...
Train Data Class Count: 49
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    15362.23 MB
	Train Data (Original)  Memory Usage: 513.50 MB (3.3% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes 

#### Analysis

In [4]:
# load the model
from autogluon.tabular import TabularPredictor
# NOTE(review): the training log reports models saved under "AutogluonModels\ag-20240208_143450"
# (relative to the training working directory), while this loads from "../AutogluonModels/..." --
# confirm the directory was moved or that the notebook now runs from a subfolder.
predictor = TabularPredictor.load("../AutogluonModels/ag-20240208_143450")

In [2]:
# summary of the model
# fit_summary() prints the per-model report shown in the output; its return value is kept in `results`.
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                    model  score_val eval_metric  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L2   0.997518    accuracy      46.615749  2698.870767                0.265819         361.470306            2       True          4
1  NeuralNetFastAI_BAG_L1   0.997515    accuracy      16.003241  1891.999093               16.003241        1891.999093            1       True          1
2         LightGBM_BAG_L1   0.997239    accuracy      30.346689   445.401368               30.346689         445.401368            1       True          2
3          XGBoost_BAG_L1   0.995911    accuracy       5.930332  1084.309741                5.930332        1084.309741            1       True          3
Number of models trained: 4
Types of models trained:
{'StackerEnsembleModel_NNFastAiTabular', 'StackerEnsembleModel_LGB', 'WeightedEnsembleModel', 'StackerEnsemble



Read the validation and test sets and apply the same preprocessing

In [13]:
import pandas as pd
# Load the validation split, then the test split, passing each through the
# same preprocessing() that was applied to the training data.
valid = preprocessing(pd.read_csv("../20043374/validateProcessed.csv"))
test = preprocessing(pd.read_csv("../20043374/testProcessed.csv"))

Predict and calculate the metrics

In [14]:
# Predict a pathology label for every row of the validation and test sets.
# NOTE(review): both frames still contain the PATHOLOGY label column -- presumably
# AutoGluon ignores it at prediction time; confirm.
prediction_valid = predictor.predict(valid)
prediction_test = predictor.predict(test)

In [15]:
import pandas as pd
import sklearn.metrics

def calculate_metric(pathology, prediction, index=["data"]):
    """Return a one-row DataFrame of classification metrics.

    Args:
        pathology: True labels.
        prediction: Predicted labels, in the same order as `pathology`.
        index: Row label(s) used as the index of the returned frame.

    Returns:
        DataFrame with the columns "accuracy", "precision", "recall",
        "f1 score" and "balanced accuracy". Precision, recall and F1 are
        macro-averaged over the classes.
    """
    metrics = sklearn.metrics
    macro = {"average": "macro"}

    scores = {}
    scores["accuracy"] = metrics.accuracy_score(pathology, prediction)
    scores["precision"] = metrics.precision_score(pathology, prediction, **macro)
    scores["recall"] = metrics.recall_score(pathology, prediction, **macro)
    scores["f1 score"] = metrics.f1_score(pathology, prediction, **macro)
    # scikit-learn defines balanced accuracy as the average of per-class recall.
    scores["balanced accuracy"] = metrics.balanced_accuracy_score(pathology, prediction)

    return pd.DataFrame(scores, index=index)
    
# Score both splits and stack the two one-row frames into a single comparison table.
valid_metric = calculate_metric(valid['PATHOLOGY'], prediction_valid, index=["validation"])
test_metric = calculate_metric(test['PATHOLOGY'], prediction_test, index=["test"])
pd.concat([valid_metric, test_metric])

Unnamed: 0,accuracy,precision,recall,f1 score,balanced accuracy
validation,0.997184,0.997265,0.996283,0.996631,0.996283
test,0.997599,0.997609,0.996812,0.997111,0.996812


In [16]:
# Per-class precision, recall, F1 and support on the test set.
print(sklearn.metrics.classification_report(test['PATHOLOGY'], prediction_test))

                                                                 precision    recall  f1-score   support

                                                    Anaphylaxie       1.00      1.00      1.00      3799
                                                Angine instable       1.00      0.98      0.99      2880
                                                  Angine stable       0.98      1.00      0.99      2386
                                                         Anémie       1.00      1.00      1.00      6842
                               Asthme exacerbé ou bronchospasme       1.00      1.00      1.00      2222
                                             Attaque de panique       1.00      1.00      1.00      3387
                                                Bronchiectasies       1.00      1.00      1.00      2454
                                                   Bronchiolite       1.00      1.00      1.00        36
                                                      