In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from MultilabelPredictor import MultilabelPredictor

train = pd.read_csv('../../data/trainProcessed.csv')
validate = pd.read_csv('../../data/validateProcessed.csv')
test = pd.read_csv('../../data/testProcessed.csv')

In [4]:
import ast
def convert_string_to_list(string):
    try:
        return ast.literal_eval(string)
    except ValueError:
        return [] 

train['DIFFERENTIAL_DIAGNOSIS'] = train['DIFFERENTIAL_DIAGNOSIS'].apply(convert_string_to_list)
test['DIFFERENTIAL_DIAGNOSIS'] = test['DIFFERENTIAL_DIAGNOSIS'].apply(convert_string_to_list)
validate['DIFFERENTIAL_DIAGNOSIS'] = validate['DIFFERENTIAL_DIAGNOSIS'].apply(convert_string_to_list)

def data_pre(df):
    y=df['DIFFERENTIAL_DIAGNOSIS']
    X=df.drop(['DIFFERENTIAL_DIAGNOSIS','PATHOLOGY'], axis=1)
    unique_diseases = set()
    for diagnosis_list in y:
      for diagnosis in diagnosis_list:
        if isinstance(diagnosis, list) and len(diagnosis) == 2:
            unique_diseases.add(diagnosis[0])
    disease_to_index = {disease: i for i, disease in enumerate(unique_diseases)}
    disease_array = np.zeros((len(y), len(unique_diseases)))
    for i, diagnosis_list in enumerate(y):
      for diagnosis in diagnosis_list:
        if isinstance(diagnosis, list) and len(diagnosis) == 2:
          disease_name, probability = diagnosis
          if disease_name in disease_to_index:
            disease_index = disease_to_index[disease_name]
            disease_array[i, disease_index] = probability
    diseases_list = list(unique_diseases)
    y = pd.DataFrame(disease_array, columns=diseases_list)
    return X,y

In [4]:
features, diff = data_pre(train)

In [5]:
features[list(set(features.columns) - set(['AGE']))] = features[list(set(features.columns) - set(['AGE']))].astype(bool)
diff = diff.rename(columns={'Possible NSTEMI / STEMI': 'Possible NSTEMI or STEMI', 
                     'Exacerbation aigue de MPOC et/ou surinfection associée': 'Exacerbation aigue de MPOC et.ou surinfection associée', 
                     'Fibrillation auriculaire/Flutter auriculaire': 'Fibrillation auriculaire or Flutter auriculaire'
                     })
train = pd.concat([features, diff], axis=1, ignore_index=False).drop("INITIAL_EVIDENCE", axis=1)

In [8]:
train

Unnamed: 0,AGE,SEX,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,douleurxx_endroitducorps_@_hypochondre_G_,...,Épiglottite,Scombroïde,IVRS ou virémie,Myocardite,Lupus érythémateux disséminé (LED),Possible NSTEMI or STEMI,Réaction dystonique aïgue,Sarcoïdose,Chagas,Otite moyenne aigue (OMA)
0,18,False,False,False,False,False,False,False,False,False,...,0.000000,0.000000,0.160781,0.000000,0.000000,0.000000,0.000000,0.000000,0.049842,0.000000
1,21,False,False,True,False,False,False,False,False,False,...,0.000000,0.134968,0.000000,0.000000,0.000000,0.000000,0.000000,0.024300,0.321782,0.000000
2,19,True,False,False,False,False,False,False,False,False,...,0.000000,0.058790,0.067060,0.000000,0.000000,0.083432,0.000000,0.016407,0.029234,0.000000
3,34,True,False,False,False,False,False,False,False,False,...,0.000000,0.000000,0.238594,0.000000,0.000000,0.000000,0.000000,0.000000,0.070650,0.000000
4,36,False,False,False,False,False,False,False,False,False,...,0.000000,0.000000,0.236778,0.000000,0.000000,0.000000,0.000000,0.000000,0.065779,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025597,18,False,False,False,False,False,False,False,False,False,...,0.281570,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.110448,0.000000
1025598,28,True,False,False,False,False,False,False,False,False,...,0.370396,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.170900,0.000000
1025599,0,True,False,False,False,False,False,False,False,False,...,0.131939,0.028965,0.000000,0.045791,0.021199,0.000000,0.045791,0.000000,0.057025,0.065024
1025600,26,True,False,False,False,False,False,False,False,False,...,0.302826,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
multi_predictor = MultilabelPredictor(labels=diff.columns, problem_types=['regression']*49)
multi_predictor.fit(train, presets='medium_quality', time_limit=210, hyperparameters={'FASTAI': {}}, fit_weighted_ensemble=False)

In [5]:
test_features, test_diff = data_pre(test)
test_features[list(set(test_features.columns) - set(['AGE']))] = test_features[list(set(test_features.columns) - set(['AGE']))].astype(bool)
test_features = test_features.drop("INITIAL_EVIDENCE", axis=1)
test_diff = test_diff.rename(columns={'Possible NSTEMI / STEMI': 'Possible NSTEMI or STEMI', 
                     'Exacerbation aigue de MPOC et/ou surinfection associée': 'Exacerbation aigue de MPOC et.ou surinfection associée', 
                     'Fibrillation auriculaire/Flutter auriculaire': 'Fibrillation auriculaire or Flutter auriculaire'
                     })

In [6]:
multi_predictor = MultilabelPredictor.load("./AutogluonModels/")  # unnecessary, just demonstrates how to load previously-trained multilabel predictor from file

predictions = multi_predictor.predict(test_features)

Predicting with TabularPredictor for label: néoplasie pulmonaire ...
Predicting with TabularPredictor for label: Anémie ...
Predicting with TabularPredictor for label: Bronchiolite ...
Predicting with TabularPredictor for label: Pneumonie ...
Predicting with TabularPredictor for label: OAP/Surcharge pulmonaire ...
Predicting with TabularPredictor for label: Ebola ...
Predicting with TabularPredictor for label: Bronchiectasies ...
Predicting with TabularPredictor for label: Pharyngite virale ...
Predicting with TabularPredictor for label: Laryngo-trachéo-bronchite (Croup) ...
Predicting with TabularPredictor for label: Asthme exacerbé ou bronchospasme ...
Predicting with TabularPredictor for label: Possible influenza ou syndrome virémique typique ...
Predicting with TabularPredictor for label: Syndrome de Boerhaave ...
Predicting with TabularPredictor for label: Exacerbation aigue de MPOC et.ou surinfection associée ...
Predicting with TabularPredictor for label: Coqueluche ...
Predicti

In [7]:
predictions

Unnamed: 0,néoplasie pulmonaire,Anémie,Bronchiolite,Pneumonie,OAP/Surcharge pulmonaire,Ebola,Bronchiectasies,Pharyngite virale,Laryngo-trachéo-bronchite (Croup),Asthme exacerbé ou bronchospasme,...,Épiglottite,Scombroïde,IVRS ou virémie,Myocardite,Lupus érythémateux disséminé (LED),Possible NSTEMI or STEMI,Réaction dystonique aïgue,Sarcoïdose,Chagas,Otite moyenne aigue (OMA)
0,0.000348,0.072350,0.000082,0.000000,0.000377,0.000000,0.002226,0.000369,0.000000,0.000000,...,0.000000,0.001505,0.000578,0.000582,0.000745,0.110626,0.001953,0.000000,0.001110,0.001295
1,0.000468,0.042037,0.000174,0.048472,0.000000,0.000159,0.001558,0.114614,0.053290,0.047101,...,0.000000,0.029204,0.047587,0.049720,0.034274,0.001018,0.036677,0.026939,0.003265,0.001558
2,0.001005,0.000123,0.000000,0.001848,0.000168,0.000211,0.003658,0.000000,0.000000,0.000000,...,0.000000,0.000799,0.000455,0.001124,0.000000,0.000108,0.631364,0.001077,0.001715,0.000000
3,0.000387,0.000321,0.000000,0.002810,0.000000,0.000074,0.001792,0.173988,0.000000,0.000000,...,0.003085,0.001178,0.001179,0.000480,0.000000,0.000899,0.001888,0.000000,0.003942,0.000000
4,0.000310,0.002692,0.000000,0.088330,0.000046,0.000000,0.001742,0.000000,0.000000,0.000000,...,0.000000,0.001936,0.190898,0.000124,0.000000,0.000660,0.001956,0.000000,0.043233,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134524,0.000557,0.000000,0.000257,0.002568,0.000026,0.000203,0.001781,0.000000,0.000000,0.000000,...,0.000000,0.018966,0.000000,0.000563,0.235495,0.191824,0.000660,0.000000,0.076687,0.000000
134525,0.087334,0.048656,0.000115,0.071456,0.067119,0.000154,0.056352,0.000000,0.000000,0.035715,...,0.000000,0.001223,0.000528,0.046845,0.000380,0.057944,0.031581,0.000000,0.000501,0.000370
134526,0.000555,0.000000,0.000091,0.000466,0.000867,0.000000,0.001983,0.000000,0.000000,0.000000,...,0.000000,0.003058,0.000787,0.159813,0.000000,0.000000,0.157287,0.119445,0.001257,0.000081
134527,0.000543,0.089548,0.000000,0.000760,0.000000,0.000000,0.001883,0.000000,0.003415,0.000000,...,0.000000,0.152507,0.000510,0.085898,0.013135,0.000423,0.060812,0.000000,0.013445,0.000000


In [8]:
test_diff = test_diff[predictions.columns]

In [9]:
test_diff

Unnamed: 0,néoplasie pulmonaire,Anémie,Bronchiolite,Pneumonie,OAP/Surcharge pulmonaire,Ebola,Bronchiectasies,Pharyngite virale,Laryngo-trachéo-bronchite (Croup),Asthme exacerbé ou bronchospasme,...,Épiglottite,Scombroïde,IVRS ou virémie,Myocardite,Lupus érythémateux disséminé (LED),Possible NSTEMI or STEMI,Réaction dystonique aïgue,Sarcoïdose,Chagas,Otite moyenne aigue (OMA)
0,0.000000,0.097062,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.143091,0.000000,0.000000,0.000000,0.0
1,0.000000,0.029430,0.0,0.065450,0.000000,0.0,0.000000,0.065594,0.041791,0.080220,...,0.0,0.027191,0.046760,0.046519,0.013625,0.000000,0.029430,0.024101,0.008858,0.0
2,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.626705,0.000000,0.000000,0.0
3,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.169478,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.000000,0.000000,0.0,0.101290,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.212576,0.000000,0.000000,0.000000,0.000000,0.000000,0.032849,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134524,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.245774,0.268768,0.000000,0.000000,0.000000,0.0
134525,0.090948,0.048338,0.0,0.065528,0.069015,0.0,0.049657,0.000000,0.000000,0.042877,...,0.0,0.000000,0.000000,0.049657,0.000000,0.058675,0.031115,0.000000,0.000000,0.0
134526,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.172835,0.000000,0.000000,0.156732,0.128351,0.000000,0.0
134527,0.000000,0.080561,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.138959,0.000000,0.082651,0.026264,0.000000,0.056732,0.000000,0.017075,0.0


In [11]:
from metric_utils import compute_metric

result = compute_metric(np.array(test_diff), np.array(predictions))

In [12]:
result

{'ACC': 0.997,
 'DDR': 0.9932240746581708,
 'DDP': 0.9088519592715754,
 'DDF1': 0.9409629899571459,
 'GM': 0.9654842228965763}