In [1]:
!pip install session_info
!pip install pycaret



In [2]:
pip install pycaret[tuners]

Note: you may need to restart the kernel to use updated packages.


#### Import librairies

In [3]:
import pandas as pd
import os
import re


from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset , random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

import random

from pycaret.classification import *
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix

import session_info

device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
session_info.show()

#### Set the random seed for reproducibility

In [4]:
seed = 64
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

#### Get the data

In [5]:
raw_data= pd.read_csv('data/train.csv', low_memory=False)
data_test= pd.read_csv('data/test.csv', low_memory=False)

In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 365 entries, Patient_ID to Type_of_Venom_Allergy_IGE_Venom
dtypes: float64(322), int64(32), object(11)
memory usage: 8.3+ MB


#### Looking which are the targets to predict

In [7]:
missing_cols = set(raw_data.columns) ^ set(data_test.columns)
print(missing_cols)
len(missing_cols)

{'Type_of_Food_Allergy_Mammalian_Milk', 'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Food_Allergy_Other_Legumes', 'Type_of_Food_Allergy_Tree_Nuts', 'Type_of_Food_Allergy_TPO', 'Type_of_Food_Allergy_Other', 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast', 'Type_of_Food_Allergy_Shellfish', 'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach', 'Type_of_Respiratory_Allergy_GINA', 'Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Severe_Allergy', 'Type_of_Food_Allergy_Oral_Syndrom', 'Type_of_Venom_Allergy_IGE_Venom', 'Allergy_Present', 'Type_of_Food_Allergy_Egg', 'Food_Allergy', 'trustii_id', 'Type_of_Respiratory_Allergy_CONJ', 'Respiratory_Allergy', 'Type_of_Food_Allergy_Fish', 'Venom_Allergy', 'Type_of_Respiratory_Allergy_IGE_Pollen_Tree', 'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram', 'Type_of_Food_Allergy_Peanut', 'Type_of_Fo

30

## Data Pre-processing

### Preprocessing for the train set

In [8]:
liste_of_Targets =['Allergy_Present', 'Severe_Allergy', 'Respiratory_Allergy', 'Food_Allergy', 'Venom_Allergy',
                     'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Respiratory_Allergy_CONJ', 
                     'Type_of_Respiratory_Allergy_GINA', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
                     'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Type_of_Respiratory_Allergy_IGE_Pollen_Tree',
                     'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach',
                     'Type_of_Respiratory_Allergy_IGE_Molds_Yeast', 'Type_of_Food_Allergy_Aromatics', 'Type_of_Food_Allergy_Other',
                     'Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Food_Allergy_Egg', 'Type_of_Food_Allergy_Fish',
                     'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Food_Allergy_Mammalian_Milk', 
                     'Type_of_Food_Allergy_Oral_Syndrom', 'Type_of_Food_Allergy_Other_Legumes', 'Type_of_Food_Allergy_Peanut',
                     'Type_of_Food_Allergy_Shellfish', 'Type_of_Food_Allergy_TPO', 'Type_of_Food_Allergy_Tree_Nuts',
                     'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Venom_Allergy_IGE_Venom']
def preprocessing_data(df):
    df = df.drop('Food_Type_0', axis =1)
    df.replace(-1, 0, inplace=True)
    data_noNAN = df.fillna(-1)
    # obtain Targets
    Targets = data_noNAN.loc[:,liste_of_Targets]
    # filter feautures
    X1=data_noNAN.loc[:, ['Chip_Type','Age','Gender','French_Residence_Department','Blood_Month_sample']]
    X= data_noNAN.iloc[:, 8:-29]
    data = pd.concat( [X1, X] , axis=1)
    # handle the 'Treatment_of_rhinitis' feature
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].astype(str)
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].str.replace('.0', '', regex=True)
    
    ##  Get_dummies of the 'object' type columns
    
    columns_to_encode = ['Chip_Type', 'French_Residence_Department', 'French_Region',
         'Treatment_of_athsma', 'Age_of_onsets',
       'General_cofactors', 'Treatment_of_atopic_dematitis','Treatment_of_rhinitis']
    
    ### Split the columns using multiple delimiters and create dummy columns
    dummy_dfs = []
    for col in columns_to_encode:
        # Split the data in the column that use  delimiters
        data[col] = data[col].astype(str)
        data[col] = data[col].apply(lambda x: [i.strip() for i in re.split('[,.]', x)])

        # Create dummy columns
        dummy_df = pd.get_dummies(data[col].apply(pd.Series).stack(), prefix=f"{col}", prefix_sep='_').groupby(level=0).sum()
        dummy_dfs.append(dummy_df)

    ### Concatenate the original DataFrame with the dummy columns
    df_final = pd.concat([data] + dummy_dfs, axis=1)

    ### Drop the original columns from the final dataset
    df_final.drop(columns=columns_to_encode, inplace=True)
    
    # Converting all values into 'float16' type
    encode_data = df_final.astype('float16')
    print(encode_data.info())
    
    return encode_data,Targets

In [9]:
encode_data,Targets = preprocessing_data(raw_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 467 entries, Age to Treatment_of_rhinitis_9
dtypes: float16(467)
memory usage: 2.7 MB
None


### Preprocessing for the test set

In [10]:
def preprocessing_data_test(df):
    df = df.drop('Food_Type_0', axis =1)
    df.replace(-1, 0, inplace=True)
    data_test_noNAN = df.fillna(-1)
    # filter feautures
    X1=data_test_noNAN.loc[:, ['Chip_Type']]
    X= data_test_noNAN.iloc[:, 5:]
    data = pd.concat( [X1, X] , axis=1)
    # handle the 'Treatment_of_rhinitis' feature
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].astype(str)
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].str.replace('.0', '', regex=True)
    # handle 'Age_of_onsets' which don't have the same format in data test and train
    data['Age_of_onsets'] = data['Age_of_onsets'].astype(str)

    
    ##  Get_dummies of the 'object' type columns
    
    columns_to_encode = ['Chip_Type', 'French_Residence_Department', 'French_Region',
         'Treatment_of_athsma', 'Age_of_onsets',
       'General_cofactors', 'Treatment_of_atopic_dematitis','Treatment_of_rhinitis']
    
    ### Split the columns using multiple delimiters and create dummy columns
    dummy_dfs = []
    for col in columns_to_encode:
        # Split the data in the column that use  delimiters
        data[col] = data[col].astype(str)
        data[col] = data[col].apply(lambda x: [i.strip() for i in re.split('[,.]', x)])

        # Create dummy columns
        dummy_df = pd.get_dummies(data[col].apply(pd.Series).stack(), prefix=f"{col}", prefix_sep='_').groupby(level=0).sum()
        dummy_dfs.append(dummy_df)

    ### Concatenate the original DataFrame with the dummy columns
    df_final = pd.concat([data] + dummy_dfs, axis=1)

    ### Drop the original columns from the final dataset
    df_final.drop(columns=columns_to_encode, inplace=True)
    
    # Converting all values into 'float16' type
    encode_data = df_final.astype('float16')
    print(encode_data.info())
    
    return encode_data


In [11]:
encode_data_test = preprocessing_data_test(data_test)
missing_cols = set(encode_data.columns) ^ set(encode_data_test.columns)
print(missing_cols)
len(missing_cols)
encode_data_test = encode_data_test.reindex(columns=encode_data.columns, fill_value=0).astype('float16')
encode_data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Columns: 444 entries, Age to Treatment_of_rhinitis_9
dtypes: float16(444)
memory usage: 508.3 KB
None
{'Treatment_of_athsma_8', 'French_Residence_Department_deptIII', 'French_Residence_Department_deptOOO', 'French_Residence_Department_deptU', 'French_Residence_Department_deptPPP', 'French_Residence_Department_deptDDD', 'French_Residence_Department_deptRRR', 'French_Residence_Department_deptQQQ', 'General_cofactors_11', 'French_Residence_Department_deptDD', 'French_Residence_Department_deptCCCC', 'French_Region_regionO', 'Treatment_of_atopic_dematitis_7', 'French_Residence_Department_deptUU', 'French_Residence_Department_deptHHH', 'French_Residence_Department_deptJJJ', 'French_Residence_Department_deptW', 'Treatment_of_athsma_10', 'French_Residence_Department_deptNNN', 'French_Residence_Department_deptMMM', 'French_Residence_Department_deptT', 'French_Residence_Department_deptZZZ', 'French_Residence_Department_deptT

#### Pycaret part

In [12]:
import os

def obtain_models_and_blend_imbl(encode_data, Y):
    
    liste_column_monovalue=['Type_of_Food_Allergy_Other','Type_of_Food_Allergy_Cereals_&_Seeds']
        
    folder_path = f'Final_blend_model_with_all_models'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    for column in Y.columns:
        if column not in liste_column_monovalue :
            rows_with_9 = Y[column].isin([9])
            Targets_without_9 = Y[column][~rows_with_9].reset_index(drop=True)
            X = encode_data[~rows_with_9].reset_index(drop=True)

            dataset= pd.concat([X,Targets_without_9], axis = 1)

            setup_model_1 = setup(data= dataset, target= column, train_size=0.68, fix_imbalance=True, fix_imbalance_method='SMOTE')
            lightgbm= create_model('lightgbm')
            tuned_lightgbm = tune_model(lightgbm, n_iter = 70, optimize = 'F1', fold = 7, search_library = 'tune-sklearn', search_algorithm = 'hyperopt')
            
            file_name = f"{column}_imblearn_final_SMOTE_lightgbm_train_68_F1_opt"
            file_path = os.path.join(folder_path, file_name)
            save_model(tuned_lightgbm, file_path)
            
            setup_model_2 = setup(data= dataset, target= column, train_size=0.65, fix_imbalance=True, fix_imbalance_method='BorderlineSMOTE')
            best_compare_2 = compare_models(sort = 'F1', include = ['rf', 'et', 'lightgbm','ada'])
            tuned_best_compare_2 = tune_model(best_compare_2, n_iter = 70, optimize = 'MCC', fold = 7, search_library = 'tune-sklearn', search_algorithm = 'hyperopt')
            
            file_name = f"{column}_imblearn_final_BorderlineSMOTE_train_65_MCC_opt"
            file_path = os.path.join(folder_path, file_name)
            save_model(tuned_best_compare_2, file_path)
            
            setup_model_3 = setup(data= dataset, target= column, train_size=0.75, fix_imbalance=True, fix_imbalance_method='SMOTETomek')
            best_compare_3 = compare_models(sort = 'F1', include = ['rf', 'et', 'lightgbm','ada'])
            tuned_best_compare_3 = tune_model(best_compare_3, n_iter = 70, optimize = 'F1', fold = 7, search_library = 'tune-sklearn', search_algorithm = 'hyperopt')
            
            file_name = f"{column}_imblearn_final_Smotetomek_train_75_F1_opt"
            file_path = os.path.join(folder_path, file_name)
            save_model(tuned_best_compare_3, file_path)
            
            setup_model_4 = setup(data= dataset, target= column, train_size=0.6, fix_imbalance=True, fix_imbalance_method='RandomOverSampler')
            best_compare_4 = compare_models(sort = 'F1', include = ['rf', 'et', 'lightgbm','ada'])
            tuned_best_compare_4 = tune_model(best_compare_4, n_iter = 70, optimize = 'Kappa', fold = 7, search_library = 'tune-sklearn', search_algorithm = 'hyperopt')
            
            file_name = f"{column}_imblearn_final_ROS_train_60_kappa_opt"
            file_path = os.path.join(folder_path, file_name)
            save_model(tuned_best_compare_4, file_path)
            
            setup_blend_model = setup(data= dataset, target= column, train_size=0.7, fix_imbalance=True, fix_imbalance_method='SMOTE')
            blended_model = blend_models(estimator_list=[tuned_lightgbm, tuned_best_compare_2,tuned_best_compare_3,tuned_best_compare_4])
            
            
            file_name = f"{column}_imblearn_final_blend_4_models_train_size_70"
            file_path = os.path.join(folder_path, file_name)
            save_model(blended_model, file_path)
        else:
            continue

In [None]:
obtain_models_and_blend_imbl(encode_data, Targets)

0,1
Current time:,2023-06-28 22:21:55
Running for:,00:00:30.48
Memory:,4.5/15.6 GiB

Trial name,status,loc,actual_estimator__ba gging_fraction,actual_estimator__ba gging_freq,actual_estimator__fe ature_fraction,actual_estimator__le arning_rate,actual_estimator__mi n_child_samples,actual_estimator__mi n_split_gain,actual_estimator__n_ estimators,actual_estimator__nu m_leaves,actual_estimator__re g_alpha,actual_estimator__re g_lambda,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_eb37d4d8,RUNNING,10.40.130.191:162245,0.828407,4,0.756094,0.000332466,42,0.190484,25,24,1.39538e-06,0.000312404,,,,,
_Trainable_bf4b4825,RUNNING,10.40.130.191:162282,0.810079,1,0.455302,0.0851867,41,0.523875,61,135,0.00311115,0.0204035,,,,,
_Trainable_064b7dac,RUNNING,10.40.130.191:162322,0.894835,4,0.686165,0.00824919,29,0.598493,242,204,1.76458e-06,0.0131672,,,,,
_Trainable_234e78a6,PENDING,10.40.130.191:162210,0.471633,4,0.715478,1.55853e-05,4,0.643141,11,119,0.000381544,9.0922e-07,,,,,
_Trainable_c13c20cf,TERMINATED,10.40.130.191:162210,0.773136,4,0.965758,0.101529,17,0.360062,55,198,4.29421e-05,6.98953e-09,1.0,22.4603,0.90625,0.923077,0.880597




### Generate predictions from classifiers and include them into the dataset + evaluation 

In [13]:
data= pd.read_csv('data/train.csv', low_memory=False)
data_test= pd.read_csv('data/test.csv', low_memory=False)

In [23]:
List_methods_imbl=['SMOTE']

def obtain_pred(data, folder_path):
    liste=[]
    for subdir, dirs, files in os.walk(folder_path):
        for file in files:
            print("Calling model:", file) 
            model = load_model(os.path.join(subdir, os.path.splitext(file)[0]))
            values=predict_model(model, data= data.reset_index())
            file_name = os.path.splitext(file)[0]
            values_name = '{}_{}'.format(file_name, 'values')
            globals()[values_name] = values
            globals()[values_name] = globals()[values_name].rename(columns={'prediction_label': 'pred_label ' + str(values_name), 'prediction_score': 'pred_score ' + str(values_name) })
            liste.append(globals()[values_name])
        return liste
    
def obtain_all_pred_methods(data, list_of_methods):
    all_lists = {} 
    for method in list_of_methods:
        folder_path = f'Test_Imbl_{method}_tuned_stacked_and_calibrated'
        all_lists[f"{method}_tuned"] = obtain_pred(data, folder_path)
    return all_lists
    

In [20]:
def obtain_pred(data, folder_path):
    select_string = f'blend_4_models'
    liste=[]
    for subdir, dirs, files in os.walk(folder_path):
        for file in files:
            if select_string in file:
                print("Calling model:", file)
                model = load_model(os.path.join(subdir, os.path.splitext(file)[0]))
                values=predict_model(model, data= data.reset_index())
                file_name = os.path.splitext(file)[0]
                values_name = '{}_{}'.format(file_name, 'values')
                globals()[values_name] = values
                globals()[values_name] = globals()[values_name].rename(columns={'prediction_label': 'pred_label ' + str(values_name), 'prediction_score': 'pred_score ' + str(values_name) })
                liste.append(globals()[values_name])
            else:
                continue 
        return liste

In [21]:
dico_df= obtain_pred(encode_data, folder_path='Final_blend_model_with_all_models')

Calling model: Type_of_Food_Allergy_Aromatics_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Food_Allergy_Oral_Syndrom_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Venom_Allergy_ATCD_Venom_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Food_Allergy_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Gram_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Herb_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Food_Allergy_Egg_imblearn_final_blend_4_models_train_size_70.pkl
Transformat

In [22]:
print(len(dico_df))
dico_df[0]

#len(dico_df.values())

27


Unnamed: 0,index,Age,Gender,Blood_Month_sample,Rural_or_urban_area,Sensitization,Skin_Symptoms,Act_d_1,Act_d_2,Act_d_5,...,Treatment_of_atopic_dematitis_7,Treatment_of_atopic_dematitis_9,Treatment_of_rhinitis_0,Treatment_of_rhinitis_1,Treatment_of_rhinitis_2,Treatment_of_rhinitis_3,Treatment_of_rhinitis_4,Treatment_of_rhinitis_9,pred_label Type_of_Food_Allergy_Aromatics_imblearn_final_blend_4_models_train_size_70_values,pred_score Type_of_Food_Allergy_Aromatics_imblearn_final_blend_4_models_train_size_70_values
0,0,15.0,0.0,7.0,1.0,1.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.5795
1,1,72.0,1.0,5.0,9.0,1.0,9.0,0.000000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.6254
2,2,67.0,1.0,6.0,9.0,0.0,9.0,0.000000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.6254
3,3,13.0,1.0,9.0,1.0,1.0,9.0,0.000000,2.150391,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.6055
4,4,28.0,1.0,12.0,9.0,1.0,1.0,0.000000,74.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.6196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2984,2984,21.0,1.0,3.0,9.0,1.0,9.0,0.000000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.6038
2985,2985,3.0,0.0,1.0,9.0,0.0,9.0,0.000000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.6159
2986,2986,15.0,0.0,12.0,1.0,1.0,1.0,0.000000,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.5386
2987,2987,11.0,1.0,4.0,0.0,1.0,0.0,0.770020,0.099976,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.6077


In [23]:

liste_column_monovalue=['Type_of_Food_Allergy_Other','Type_of_Food_Allergy_Cereals_&_Seeds']

merged_df = pd.DataFrame()
for i,df in enumerate(dico_df):
    if i==0:
        merged_df = df
    else:
        last_two_columns = df.iloc[:, -2:]
        merged_df = pd.concat([merged_df, last_two_columns], axis=1)

added_list = ['pred_label ' + s + '_imblearn_final_blend_4_models_train_size_70_values' for s in liste_column_monovalue]
zeros_data = pd.DataFrame(0, index=np.arange(len(merged_df)), columns= added_list)
merged_df = pd.concat([merged_df, zeros_data], axis=1)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 524 entries, index to pred_label Type_of_Food_Allergy_Cereals_&_Seeds_imblearn_final_blend_4_models_train_size_70_values
dtypes: float32(467), float64(27), int32(1), int64(29)
memory usage: 6.6 MB


In [24]:
data_true= pd.read_csv('data/train.csv', low_memory=False)

In [25]:
merged_df.head()

Unnamed: 0,index,Age,Gender,Blood_Month_sample,Rural_or_urban_area,Sensitization,Skin_Symptoms,Act_d_1,Act_d_2,Act_d_5,...,pred_label Respiratory_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_score Respiratory_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Respiratory_Allergy_IGE_Dander_Animals_imblearn_final_blend_4_models_train_size_70_values,pred_score Type_of_Respiratory_Allergy_IGE_Dander_Animals_imblearn_final_blend_4_models_train_size_70_values,pred_label Severe_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_score Severe_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Venom_Allergy_IGE_Venom_imblearn_final_blend_4_models_train_size_70_values,pred_score Type_of_Venom_Allergy_IGE_Venom_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Food_Allergy_Other_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Food_Allergy_Cereals_&_Seeds_imblearn_final_blend_4_models_train_size_70_values
0,0,15.0,0.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1,0.9817,0,0.7981,1,0.9949,0,0.6359,0,0
1,1,72.0,1.0,5.0,9.0,1.0,9.0,0.0,0.0,0.0,...,1,0.7781,0,0.8025,0,0.7175,0,0.6855,0,0
2,2,67.0,1.0,6.0,9.0,0.0,9.0,0.0,0.0,0.0,...,1,0.7219,0,0.8065,0,0.7597,0,0.6855,0,0
3,3,13.0,1.0,9.0,1.0,1.0,9.0,0.0,2.150391,0.0,...,1,0.9872,1,0.8036,1,0.9757,0,0.6974,0,0
4,4,28.0,1.0,12.0,9.0,1.0,1.0,0.0,74.0,0.0,...,1,0.9089,1,0.786,0,0.5932,0,0.703,0,0


#### Evaluation binary for each methods

In [26]:
import numpy as np
target_columns = ['Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Food_Allergy_Other_Legumes', 'Allergy_Present', 'Type_of_Food_Allergy_Mammalian_Milk', 'Type_of_Food_Allergy_Other',
                  'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach', 'Venom_Allergy', 'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Food_Allergy', 'Type_of_Food_Allergy_Oral_Syndrom','Type_of_Food_Allergy_Tree_Nuts', 'Severe_Allergy',
                  'Type_of_Food_Allergy_Aromatics', 'Type_of_Venom_Allergy_IGE_Venom', 'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Respiratory_Allergy_CONJ', 'Type_of_Food_Allergy_Peanut',
                  'Type_of_Food_Allergy_Egg', 'Type_of_Food_Allergy_Fish', 'Type_of_Respiratory_Allergy_GINA', 'Respiratory_Allergy', 'Type_of_Food_Allergy_TPO',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Tree', 'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast',
                  'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Food_Allergy_Shellfish']


f1_scores=[]
below_threshold_columns=[]
threshold =0.85
for column in target_columns:
    rows_with_9 = data_true[column].isin([9])
    Targets_without_9 = data_true[column][~rows_with_9]
    merged_df_removal = merged_df[~rows_with_9]

    merged_df_mod = pd.concat([merged_df_removal, Targets_without_9] , axis=1).reset_index(drop=True)
    y_true = merged_df_mod[column]
    y_pred = merged_df_mod['pred_label '+ column + '_imblearn_final_blend_4_models_train_size_70_values']

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # Calculate recall
    #recall = recall_score(y_true, y_pred)

    # Calculate F1 score
    if (data_true[column] == 1).any():
        f1 = f1_score(y_true, y_pred,average='macro')
    else:
        f1 = f1_score(y_true, y_pred, pos_label=0)
    cm = confusion_matrix(y_true, y_pred)

    # Display the metrics and confusion matrix
    print(f"Metrics for {column}:")
    print(f"Accuracy: {accuracy}")
    #print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("Confusion Matrix:")
    print(cm)
    print()
    if f1 != 0:
        f1_scores.append(f1) # Append the F1 score to the list
    if f1 < threshold:
        below_threshold_columns.append(column)

mean_f1 = sum(f1_scores) / len(f1_scores)  # Calculate the mean of F1 scores

print(f"Mean F1 Score: {mean_f1}")
print(below_threshold_columns)
print('')

Metrics for Type_of_Food_Allergy_Cereals_&_Seeds:
Accuracy: 1.0
F1 Score: 1.0
Confusion Matrix:
[[1607]]

Metrics for Type_of_Food_Allergy_Other_Legumes:
Accuracy: 0.9838207840696951
F1 Score: 0.8575354609929079
Confusion Matrix:
[[1547   10]
 [  16   34]]

Metrics for Allergy_Present:
Accuracy: 0.9255258669698693
F1 Score: 0.8964576713906403
Confusion Matrix:
[[ 348   43]
 [  88 1280]]

Metrics for Type_of_Food_Allergy_Mammalian_Milk:
Accuracy: 0.8792781580584941
F1 Score: 0.5266927794983908
Confusion Matrix:
[[1400  182]
 [  12   13]]

Metrics for Type_of_Food_Allergy_Other:
Accuracy: 1.0
F1 Score: 1.0
Confusion Matrix:
[[1607]]

Metrics for Type_of_Respiratory_Allergy_IGE_Mite_Cockroach:
Accuracy: 0.9761114797611148
F1 Score: 0.97586921015831
Confusion Matrix:
[[811  16]
 [ 20 660]]

Metrics for Venom_Allergy:
Accuracy: 0.9966543994647039
F1 Score: 0.9098699763593381
Confusion Matrix:
[[2956    0]
 [  10   23]]

Metrics for Type_of_Respiratory_Allergy_ARIA:
Accuracy: 0.9575315195753

## Generate prediction for the test set

In [27]:
original_validation_data = pd.read_csv('data/test.csv')

data_test = original_validation_data.set_index('trustii_id')

In [28]:
encode_data_test = preprocessing_data_test(data_test)
missing_cols = set(encode_data.columns) ^ set(encode_data_test.columns)
print(missing_cols)
len(missing_cols)
encode_data_test = encode_data_test.reindex(columns=encode_data.columns, fill_value=0).astype('float16')
encode_data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 586 entries, 1 to 1282
Columns: 443 entries, Gender to Treatment_of_rhinitis_9
dtypes: float16(443)
memory usage: 527.8 KB
None
{'Treatment_of_athsma_8', 'French_Residence_Department_deptIII', 'French_Residence_Department_deptOOO', 'French_Residence_Department_deptU', 'French_Residence_Department_deptPPP', 'French_Residence_Department_deptDDD', 'French_Residence_Department_deptRRR', 'French_Residence_Department_deptQQQ', 'General_cofactors_11', 'French_Residence_Department_deptDD', 'French_Residence_Department_deptCCCC', 'French_Region_regionO', 'Treatment_of_atopic_dematitis_7', 'French_Residence_Department_deptUU', 'French_Residence_Department_deptHHH', 'French_Residence_Department_deptJJJ', 'French_Residence_Department_deptW', 'Treatment_of_athsma_10', 'French_Residence_Department_deptNNN', 'French_Residence_Department_deptMMM', 'French_Residence_Department_deptT', 'French_Residence_Department_deptZZZ', 'French_Residence_Department_d

In [29]:
dico_df= obtain_pred(encode_data_test.reset_index(), folder_path='Final_blend_model_with_all_models')

Calling model: Type_of_Food_Allergy_Aromatics_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Food_Allergy_Oral_Syndrom_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Venom_Allergy_ATCD_Venom_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Food_Allergy_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Gram_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Herb_imblearn_final_blend_4_models_train_size_70.pkl
Transformation Pipeline and Model Successfully Loaded
Calling model: Type_of_Food_Allergy_Egg_imblearn_final_blend_4_models_train_size_70.pkl
Transformat

In [30]:
print(len(dico_df))


27


In [31]:
liste_column_monovalue=['Type_of_Food_Allergy_Other','Type_of_Food_Allergy_Cereals_&_Seeds']

merged_df = pd.DataFrame()
for i,df in enumerate(dico_df):
    if i==0:
        merged_df = df
    else:
        last_two_columns = df.iloc[:, -2:]
        merged_df = pd.concat([merged_df, last_two_columns], axis=1)

added_list = ['pred_label ' + s + '_imblearn_final_blend_4_models_train_size_70_values' for s in liste_column_monovalue]
zeros_data = pd.DataFrame(0, index=np.arange(len(merged_df)), columns= added_list)
merged_df = pd.concat([merged_df, zeros_data], axis=1)
merged_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Columns: 525 entries, index to pred_label Type_of_Food_Allergy_Cereals_&_Seeds_imblearn_final_blend_4_models_train_size_70_values
dtypes: float32(467), float64(27), int32(2), int64(29)
memory usage: 1.3 MB


In [32]:
merged_df.head()

Unnamed: 0,index,trustii_id,Age,Gender,Blood_Month_sample,Rural_or_urban_area,Sensitization,Skin_Symptoms,Act_d_1,Act_d_2,...,pred_label Respiratory_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_score Respiratory_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Respiratory_Allergy_IGE_Dander_Animals_imblearn_final_blend_4_models_train_size_70_values,pred_score Type_of_Respiratory_Allergy_IGE_Dander_Animals_imblearn_final_blend_4_models_train_size_70_values,pred_label Severe_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_score Severe_Allergy_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Venom_Allergy_IGE_Venom_imblearn_final_blend_4_models_train_size_70_values,pred_score Type_of_Venom_Allergy_IGE_Venom_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Food_Allergy_Other_imblearn_final_blend_4_models_train_size_70_values,pred_label Type_of_Food_Allergy_Cereals_&_Seeds_imblearn_final_blend_4_models_train_size_70_values
0,0,1,0.0,1.0,6.0,9.0,1.0,0.0,0.0,0.0,...,1,0.9798,1,0.8025,0,0.5844,0,0.7036,0,0
1,1,4,0.0,1.0,7.0,9.0,1.0,1.0,0.0,0.109985,...,1,0.8436,1,0.7949,1,0.6175,0,0.6625,0,0
2,2,5,0.0,0.0,10.0,1.0,1.0,1.0,0.0,0.0,...,1,0.956,1,0.7934,1,0.9284,0,0.6634,0,0
3,3,7,0.0,1.0,8.0,0.0,1.0,0.0,0.0,0.0,...,0,0.8155,0,0.8053,0,0.9769,0,0.6428,0,0
4,4,8,0.0,0.0,4.0,1.0,0.0,9.0,0.0,0.0,...,0,0.8421,0,0.8072,0,0.6824,0,0.6459,0,0


In [33]:
target_columns = ['Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Food_Allergy_Other_Legumes', 'Allergy_Present', 'Type_of_Food_Allergy_Mammalian_Milk', 'Type_of_Food_Allergy_Other',
                  'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach', 'Venom_Allergy', 'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Food_Allergy', 'Type_of_Food_Allergy_Oral_Syndrom','Type_of_Food_Allergy_Tree_Nuts', 'Severe_Allergy',
                  'Type_of_Food_Allergy_Aromatics', 'Type_of_Venom_Allergy_IGE_Venom', 'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Respiratory_Allergy_CONJ', 'Type_of_Food_Allergy_Peanut',
                  'Type_of_Food_Allergy_Egg', 'Type_of_Food_Allergy_Fish', 'Type_of_Respiratory_Allergy_GINA', 'Respiratory_Allergy', 'Type_of_Food_Allergy_TPO',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Tree', 'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast',
                  'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Food_Allergy_Shellfish']

original_validation_data = pd.read_csv('data/test.csv')
for elem in target_columns:
    original_validation_data[elem]=  merged_df['pred_label '+ elem + '_imblearn_final_blend_4_models_train_size_70_values']

In [34]:
len(original_validation_data)

586

In [35]:
original_validation_data.head(30)

Unnamed: 0,trustii_id,Patient_ID,Chip_Code,Chip_Type,Chip_Image_Name,Age,Gender,Blood_Month_sample,French_Residence_Department,French_Region,...,Type_of_Food_Allergy_Egg,Type_of_Food_Allergy_Fish,Type_of_Respiratory_Allergy_GINA,Respiratory_Allergy,Type_of_Food_Allergy_TPO,Type_of_Respiratory_Allergy_IGE_Pollen_Tree,Type_of_Food_Allergy_Fruits_and_Vegetables,Type_of_Respiratory_Allergy_IGE_Molds_Yeast,Type_of_Respiratory_Allergy_IGE_Dander_Animals,Type_of_Food_Allergy_Shellfish
0,1,PMP0156,22 262C 3858,ISAC_V2,,8.0,1.0,6.0,deptBBB,regionJ,...,0,0,1,1,0,1,0,1,1,0
1,4,PCR0234,02AHX0DC,ALEX,02AHX0DC.bmp,14.0,1.0,7.0,deptL,regionD,...,1,0,1,1,0,0,0,0,1,1
2,5,PCR0532,02AUN372,ALEX,02AUN372.png,32.0,0.0,10.0,deptUUU,regionF,...,0,0,1,1,0,1,0,1,1,0
3,7,GJH0147,EKF3830_4,ISAC_V2,EKF3830_4_2200444337_2023_2_17_11_58_24.bmp,65.0,1.0,8.0,deptQ,regionF,...,0,0,0,0,0,0,0,0,0,0
4,8,TXV0009,881204001164,ISAC_V1,1G20027_2_881204001164_2012_4_25_18_32_58.bmp,5.0,0.0,4.0,deptII,regionC,...,0,0,0,0,0,0,0,0,0,0
5,9,PCR0118,02AFA752,ALEX,,49.0,0.0,1.0,deptXXX,regionI,...,0,0,0,1,0,0,0,0,0,0
6,10,QVW0214,AB02627_3,ISAC_V1,,6.0,1.0,2.0,deptY,regionD,...,0,0,1,1,0,1,0,1,1,0
7,15,TXV0157,881602013302,ISAC_V1,BAF4027_4_881602013302_2016_2_23_16_38_11.bmp,13.0,1.0,2.0,deptRR,regionB,...,0,0,1,1,0,0,0,0,0,0
8,18,WQW0190,223112546,ISAC_V2,END0E30_1_223112546_2023_1_3_16_20_19.bmp,12.0,0.0,11.0,deptOO,regionL,...,0,0,0,0,0,0,0,0,0,0
9,23,TXV0282,881903001372,ISAC_V1,CXG1527_3_881903001372_2019_3_14_3_51_59.bmp,8.0,0.0,3.0,deptEE,regionC,...,0,0,1,1,0,0,0,0,0,0


In [40]:
original_validation_data.to_csv(f"Submission_ML_pycaret_multi_tuned_selection_binary.csv", index=False, encoding='UTF-8')

#### Logics is the same than the first submission but an fix_imbalanced_method is set to give models more samples from the minority class and less from the majority class.
#### one notebook for the the best sampling methods where different models obtain with different methods and setup are stacked.