# Using Pytorch with generation of sample after train/test split + KfoldStratified

In [2]:
!pip install session_info
!pip install imbalanced-learn

Collecting session_info
  Using cached session_info-1.0.0-py3-none-any.whl
Collecting stdlib-list
  Using cached stdlib_list-0.8.0-py3-none-any.whl (63 kB)
Installing collected packages: stdlib-list, session_info
Successfully installed session_info-1.0.0 stdlib-list-0.8.0
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, imbalanced-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 joblib-1.2.0


#### Import librairies

In [3]:
import pandas as pd
import os
import re

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset , random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

import imblearn.over_sampling as oversampling
import imblearn.under_sampling as undersampling
import imblearn.combine as combination
import random

from torch.utils.data import TensorDataset # to recreate the modified dataset at each epoch

import session_info

device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
session_info.show()

#### Set the random seed for reproducibility

In [4]:
seed = 64
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

#### Get the data

In [5]:
raw_data= pd.read_csv('data/train.csv', low_memory=False)
data_test= pd.read_csv('data/test.csv', low_memory=False)

In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 365 entries, Patient_ID to Type_of_Venom_Allergy_IGE_Venom
dtypes: float64(322), int64(32), object(11)
memory usage: 8.3+ MB


#### Looking which are the targets to predict

In [7]:
missing_cols = set(raw_data.columns) ^ set(data_test.columns)
print(missing_cols)
len(missing_cols)

{'Type_of_Food_Allergy_Tree_Nuts', 'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach', 'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Type_of_Venom_Allergy_IGE_Venom', 'Type_of_Food_Allergy_Fish', 'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Respiratory_Allergy_CONJ', 'Type_of_Food_Allergy_Peanut', 'Type_of_Food_Allergy_Egg', 'Type_of_Respiratory_Allergy_ARIA', 'Severe_Allergy', 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast', 'Type_of_Food_Allergy_Other_Legumes', 'Food_Allergy', 'Type_of_Food_Allergy_Other', 'Allergy_Present', 'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Food_Allergy_Aromatics', 'Type_of_Respiratory_Allergy_GINA', 'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Food_Allergy_Mammalian_Milk', 'Type_of_Food_Allergy_Oral_Syndrom', 'Type_of_Food_Allergy_Shellfish', 'Type_of_Respiratory_Allergy_IGE_Pollen_Tree', 'trustii_id', 'Respiratory_Allergy', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram', 'Venom_Allergy', 'Type

30

## Data Pre-processing

### Preprocessing for the train set

In [8]:
liste_of_Targets =['Allergy_Present', 'Severe_Allergy', 'Respiratory_Allergy', 'Food_Allergy', 'Venom_Allergy',
                     'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Respiratory_Allergy_CONJ', 
                     'Type_of_Respiratory_Allergy_GINA', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
                     'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Type_of_Respiratory_Allergy_IGE_Pollen_Tree',
                     'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach',
                     'Type_of_Respiratory_Allergy_IGE_Molds_Yeast', 'Type_of_Food_Allergy_Aromatics', 'Type_of_Food_Allergy_Other',
                     'Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Food_Allergy_Egg', 'Type_of_Food_Allergy_Fish',
                     'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Food_Allergy_Mammalian_Milk', 
                     'Type_of_Food_Allergy_Oral_Syndrom', 'Type_of_Food_Allergy_Other_Legumes', 'Type_of_Food_Allergy_Peanut',
                     'Type_of_Food_Allergy_Shellfish', 'Type_of_Food_Allergy_TPO', 'Type_of_Food_Allergy_Tree_Nuts',
                     'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Venom_Allergy_IGE_Venom']
def preprocessing_data(df):
    df = df.drop('Food_Type_0', axis =1)
    df.replace(-1, 0, inplace=True)
    data_noNAN = df.fillna(-1)
    # obtain Targets
    Targets = data_noNAN.loc[:,liste_of_Targets]
    # filter feautures
    X1=data_noNAN.loc[:, ['Chip_Type','Age','Gender','French_Residence_Department','Blood_Month_sample']]
    X= data_noNAN.iloc[:, 8:-29]
    data = pd.concat( [X1, X] , axis=1)
    # handle the 'Treatment_of_rhinitis' feature
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].astype(str)
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].str.replace('.0', '', regex=True)
    
    ##  Get_dummies of the 'object' type columns
    
    columns_to_encode = ['Chip_Type', 'French_Residence_Department', 'French_Region',
         'Treatment_of_athsma', 'Age_of_onsets',
       'General_cofactors', 'Treatment_of_atopic_dematitis','Treatment_of_rhinitis']
    
    ### Split the columns using multiple delimiters and create dummy columns
    dummy_dfs = []
    for col in columns_to_encode:
        # Split the data in the column that use  delimiters
        data[col] = data[col].astype(str)
        data[col] = data[col].apply(lambda x: [i.strip() for i in re.split('[,.]', x)])

        # Create dummy columns
        dummy_df = pd.get_dummies(data[col].apply(pd.Series).stack(), prefix=f"{col}", prefix_sep='_').groupby(level=0).sum()
        dummy_dfs.append(dummy_df)

    ### Concatenate the original DataFrame with the dummy columns
    df_final = pd.concat([data] + dummy_dfs, axis=1)

    ### Drop the original columns from the final dataset
    df_final.drop(columns=columns_to_encode, inplace=True)
    
    # Converting all values into 'float16' type
    encode_data = df_final.astype('float16')
    print(encode_data.info())
    
    return encode_data,Targets

In [9]:
encode_data,Targets = preprocessing_data(raw_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 467 entries, Age to Treatment_of_rhinitis_9
dtypes: float16(467)
memory usage: 2.7 MB
None


### Preprocessing for the test set

In [10]:
def preprocessing_data_test(df):
    df = df.drop('Food_Type_0', axis =1)
    df.replace(-1, 0, inplace=True)
    data_test_noNAN = df.fillna(-1)
    # filter feautures
    X1=data_test_noNAN.loc[:, ['Chip_Type']]
    X= data_test_noNAN.iloc[:, 5:]
    data = pd.concat( [X1, X] , axis=1)
    # handle the 'Treatment_of_rhinitis' feature
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].astype(str)
    data['Treatment_of_rhinitis'] = data['Treatment_of_rhinitis'].str.replace('.0', '', regex=True)
    # handle 'Age_of_onsets' which don't have the same format in data test and train
    data['Age_of_onsets'] = data['Age_of_onsets'].astype(str)

    
    ##  Get_dummies of the 'object' type columns
    
    columns_to_encode = ['Chip_Type', 'French_Residence_Department', 'French_Region',
         'Treatment_of_athsma', 'Age_of_onsets',
       'General_cofactors', 'Treatment_of_atopic_dematitis','Treatment_of_rhinitis']
    
    ### Split the columns using multiple delimiters and create dummy columns
    dummy_dfs = []
    for col in columns_to_encode:
        # Split the data in the column that use  delimiters
        data[col] = data[col].astype(str)
        data[col] = data[col].apply(lambda x: [i.strip() for i in re.split('[,.]', x)])

        # Create dummy columns
        dummy_df = pd.get_dummies(data[col].apply(pd.Series).stack(), prefix=f"{col}", prefix_sep='_').groupby(level=0).sum()
        dummy_dfs.append(dummy_df)

    ### Concatenate the original DataFrame with the dummy columns
    df_final = pd.concat([data] + dummy_dfs, axis=1)

    ### Drop the original columns from the final dataset
    df_final.drop(columns=columns_to_encode, inplace=True)
    
    # Converting all values into 'float16' type
    encode_data = df_final.astype('float16')
    print(encode_data.info())
    
    return encode_data


In [11]:
encode_data_test = preprocessing_data_test(data_test)
missing_cols = set(encode_data.columns) ^ set(encode_data_test.columns)
print(missing_cols)
len(missing_cols)
encode_data_test = encode_data_test.reindex(columns=encode_data.columns, fill_value=0).astype('float16')
encode_data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Columns: 444 entries, Age to Treatment_of_rhinitis_9
dtypes: float16(444)
memory usage: 508.3 KB
None
{'French_Region_regionN', 'French_Residence_Department_deptMMM', 'French_Residence_Department_deptOOO', 'French_Residence_Department_deptQQQ', 'French_Region_regionO', 'French_Residence_Department_deptW', 'French_Residence_Department_deptAAAA', 'French_Residence_Department_deptTTT', 'French_Residence_Department_deptNNN', 'French_Residence_Department_deptPPP', 'French_Residence_Department_deptCCCC', 'French_Residence_Department_deptU', 'French_Residence_Department_deptP', 'French_Residence_Department_deptUU', 'Treatment_of_athsma_8', 'French_Residence_Department_deptK', 'French_Residence_Department_deptRRR', 'General_cofactors_11', 'French_Residence_Department_deptIII', 'Treatment_of_atopic_dematitis_7', 'French_Residence_Department_deptDDD', 'French_Residence_Department_deptZZZ', 'French_Residence_Department_deptJJ

## For one class (each class can use different hyperparameters to achieve the best results so we gonna explore all targets one by one and keep the best model)

## For all targets

#### Filter for selecting only 1 and 0 in the desired target

In [12]:
# Identify rows with at least one 9 in the target DataFrame
rows_with_9 = Targets['Respiratory_Allergy'].isin([9])

# Separate rows with at least one 9 from the other rows

Targets_without_9 = Targets['Respiratory_Allergy'][~rows_with_9]
encode_data_without_9 = encode_data[~rows_with_9]

df_without_9= pd.concat([encode_data_without_9, Targets_without_9] , axis=1).reset_index(drop=True)

In [13]:
df_without_9

Unnamed: 0,Age,Gender,Blood_Month_sample,Rural_or_urban_area,Sensitization,Skin_Symptoms,Act_d_1,Act_d_2,Act_d_5,Act_d_8,...,Treatment_of_atopic_dematitis_5,Treatment_of_atopic_dematitis_7,Treatment_of_atopic_dematitis_9,Treatment_of_rhinitis_0,Treatment_of_rhinitis_1,Treatment_of_rhinitis_2,Treatment_of_rhinitis_3,Treatment_of_rhinitis_4,Treatment_of_rhinitis_9,Respiratory_Allergy
0,15.0,0.0,7.0,1.0,1.0,0.0,0.000000,0.000000,0.000000,-1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,13.0,1.0,9.0,1.0,1.0,9.0,0.000000,2.150391,0.000000,2.689453,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
2,9.0,1.0,10.0,1.0,1.0,1.0,0.489990,0.000000,0.310059,0.350098,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,39.0,0.0,12.0,9.0,1.0,9.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,15.0,1.0,12.0,9.0,1.0,1.0,0.469971,3.359375,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1502,14.0,1.0,2.0,9.0,1.0,9.0,35.625000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1503,16.0,0.0,7.0,9.0,1.0,9.0,0.000000,0.119995,0.000000,-1.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1504,15.0,0.0,2.0,9.0,1.0,1.0,0.000000,2.830078,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1505,15.0,0.0,12.0,1.0,1.0,1.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


#### General definitions

##### The custom dataset that will be used to store datas as tensors

In [14]:
class CustomDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform
        self.features = self.data.iloc[:, :-1].values
        self.labels = self.data.iloc[:, -1].values

        if self.transform is not None:
            self.features, self.labels = self.transform.fit_resample(self.features, self.labels)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(int(self.labels[idx]), dtype=torch.long)

        return features, label
    
    @property
    def num_features(self):
        return self.features.shape[1]
    
    @property
    def num_classes(self):
        return len(set(self.labels))

In [15]:
class CustomDataset_2(Dataset):
    def __init__(self, data):
        self.data = data
        self.features = self.data.iloc[:, :-1].values
        self.labels = self.data.iloc[:, -1].values
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return features, label
    
    
def split_dataset(dataset, split_ratio=0.8, shuffle=True, random_seed=None):
    num_samples = len(dataset)
    indices = list(range(num_samples))
    
    if shuffle:
        if random_seed is not None:
            random.seed(random_seed)
        random.shuffle(indices)
    
    split_size = int(num_samples * split_ratio)
    train_indices = indices[:split_size]
    test_indices = indices[split_size:]
    
    train_data = CustomDataset_2([dataset[i] for i in train_indices])
    test_data = CustomDataset_2([dataset[i] for i in test_indices])
    
    return train_data, test_data

##### The class_weight calculator that will be used after each imblearn transformation

In [16]:
def calculate_class_weights(dataset):
    # Convert labels to a PyTorch tensor
    labels_tensor = torch.from_numpy(dataset.labels)
    
    # Get the number of samples in each class
    class_counts = torch.bincount(labels_tensor)
    
    # Calculate the weight for each class as the inverse of its sample count
    total_samples = torch.sum(class_counts)
    class_weights = total_samples / (class_counts * len(class_counts))
    
    return class_weights

##### The dico of sampling technique that will be used to randomly chose the enhance data method at each epoch

In [17]:
sampling_techniques = {
    2: oversampling.RandomOverSampler(),
    #1: oversampling.SMOTE(),
    3: oversampling.BorderlineSMOTE(),
    #0: oversampling.SVMSMOTE(),
    #4: undersampling.TomekLinks(sampling_strategy='auto'),
    4: undersampling.TomekLinks(sampling_strategy='all'),
    5: combination.SMOTETomek(sampling_strategy='auto'),
    6: combination.SMOTETomek(sampling_strategy='all'),
}

##### The architecture of the models to train and used for generate prediction

In [18]:
class Allergy_Net(nn.Module):
    
    def __init__(self,input_size,hidden_size, num_class,dropout_rate):

        super(Allergy_Net,self).__init__()
        self.linear1= nn.Linear(input_size,hidden_size)
        self.linear2= nn.Linear(hidden_size,int(hidden_size/8))
        self.linear3= nn.Linear(int(hidden_size/8),int(hidden_size/64))
        self.linear4= nn.Linear(int(hidden_size/64),num_class)

        self.dropout1 = nn.Dropout(dropout_rate)
        self.batchnorm1 = nn.BatchNorm1d(int(hidden_size/64))

    def forward(self,inputs):
        x = torch.relu(self.linear1(inputs))
        x= self.dropout1(x)
        x = torch.relu(self.linear2(x))
        x= self.dropout1(x)
        x = torch.relu(self.linear3(x))
        x= self.dropout1(x)
        x= self.batchnorm1(x)
        outputs= self.linear4(x)

        # no softmax because Cross entropy Loss
        return outputs

##### The training process to check how the model evolve during training and changing hyperparameters

In [19]:
def training_by_target(enc_data,column,batch_size=512,learning_rate = 1e-3,dropout_rate= 0.6, weight_decay=1e-2, factor=0.75):
    
    
    rows_with_9 = Targets[column].isin([9])
    Targets_without_9 = Targets[column][~rows_with_9]
    encode_data_without_9 = enc_data[~rows_with_9]
    
    dataset_panda= pd.concat([encode_data_without_9,Targets_without_9], axis = 1).reset_index(drop=True)
    print(dataset_panda[column].value_counts())
    
   # Establish the splitting and the Dataset/Loaders for the evaluation part
    batch_size= batch_size
    train_data, test_data = train_test_split(dataset_panda, test_size=0.25, random_state=123)
    dataset_test=CustomDataset(test_data)
    dataset_all=CustomDataset(dataset_panda)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
    test_all = DataLoader(dataset_all, batch_size=batch_size, shuffle=False)
    
    # define hyperparameters
    input_size= dataset_all.num_features
    hidden_size= 4096
    num_class = dataset_all.num_classes
    num_epochs=30
    learning_rate = learning_rate
    print(input_size, num_class)
    
    # Call a model and define loss and optimizer
    model= Allergy_Net(input_size,hidden_size,num_class,dropout_rate).to(device)
    criterion= nn.CrossEntropyLoss()
    optimizer = torch.optim.Adamax(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, patience=5, verbose=True)
    
    
    # Initialization of some indicators that are used to save best model during training
    best_f1_score = 0.0
    best_model_state = None

    # Training loop
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        model.train()
        random_key = random.randint(2, 6)
        selected_technique = sampling_techniques[random_key]
        print(selected_technique)
        # Apply the sampling technique
        train_dataset = CustomDataset(train_data, transform=selected_technique)
        dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

         # Update class weights
        class_weights = calculate_class_weights(train_dataset)
        criterion.weight = class_weights

        true_labels = []
        predicted_labels = []
        model.train()
        for i,(data,labels) in enumerate (dataloader):
            data= data.to(device)
            labels= labels.to(device)

            #forward
            outputs=model(data)
            loss= criterion(outputs,labels)

            #backward
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Calculate some metrics
            _, predictions = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

        acc = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='macro')


        print (f'epoch {epoch+1}/{num_epochs}, loss = {loss:.5f}, train_acc = {acc:.4f}, F1 Score_Train = {f1:.4f}')

        # Test 
        model.eval()
        with torch.no_grad():
            true_labels_test = []
            predicted_labels_test = []
            for i,(data,labels) in enumerate (test_loader):
                data= data.to(device)
                labels= labels.to(device)

                outputs = model(data)

                # return value and index of the best class
                _, predictions = torch.max(outputs, 1)

                true_labels_test.extend(labels.cpu().numpy())
                predicted_labels_test.extend(predictions.cpu().numpy())


            test_accuracy = accuracy_score(true_labels_test, predicted_labels_test)
            f1_test = f1_score(true_labels_test, predicted_labels_test, average='macro')

            lr_scheduler.step(f1_test + (f1*0.1))

            # Check if the current model is the best one based on f1 score

            if f1_test + (f1*0.005) > best_f1_score:
                best_f1_score = f1_test+(f1*0.005) 
                best_model_state = model.state_dict()
                torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
                print('\033[91m'+'MODEL_SAVE')
                print(f'Accuracy_test = {test_accuracy:.4f}, F1 Score_test = {f1_test:.4f}'+'\033[0m')

            print(' ')
            
    # eval_final_before saving        
    model = Allergy_Net(input_size, hidden_size, num_class,dropout_rate).to(device)

    # Load the saved model state dictionary
    model.load_state_dict(torch.load('best_model.pth'))

    # Set the model to evaluation mode
    model.eval()

    # Test       
    with torch.no_grad():
        true_labels_test = []
        predicted_labels_test = []
        for i,(data,labels) in enumerate (test_loader):
            data= data.to(device)
            labels= labels.to(device)

            outputs = model(data)

            # return value and index of the best class
            _, predictions = torch.max(outputs, 1)
            true_labels_test.extend(labels.cpu().numpy())
            predicted_labels_test.extend(predictions.cpu().numpy())


        test_accuracy = accuracy_score(true_labels_test, predicted_labels_test)
        f1_test = f1_score(true_labels_test, predicted_labels_test, average='macro')


        print(f'Accuracy = {test_accuracy:.3f}')
        print(f'F1 Score = {f1_test:.4f}')

    # Test on the whole dataset
    with torch.no_grad():
        true_labels = []
        predicted_labels = []
        for data, labels in test_all:
            data= data.to(device)
            labels= labels.to(device)

            outputs = model(data)

            # return value and index of the best class
            _,predictions= torch.max(outputs,1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

            data_accuracy = accuracy_score(true_labels, predicted_labels)
            f1 = f1_score(true_labels, predicted_labels, average='macro')

        print(f'Accuracy = {data_accuracy:.3f}')
        print(f'F1 Score = {f1:.4f}')
    return model  

In [20]:
from sklearn.model_selection import StratifiedKFold

# Set the number of folds
k = 10
# Create the Stratified K-fold cross-validation splitter
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)


In [21]:
batch_size=64
column= 'Type_of_Venom_Allergy_IGE_Venom'
rows_with_9 = Targets[column].isin([9])
Targets_without_9 = Targets[column][~rows_with_9]
encode_data_without_9 = encode_data[~rows_with_9]

dataset_panda= pd.concat([encode_data_without_9,Targets_without_9], axis = 1).reset_index(drop=True)
dataset_all=CustomDataset(dataset_panda)   
test_all = DataLoader(dataset_all, batch_size=batch_size, shuffle=False)

# Convert the dataset to NumPy arrays
X = dataset_all.features
y = dataset_all.labels

# Create an empty list to store the models trained in each fold
fold_models = []

# Iterate over the folds
for fold, (train_indices, test_indices) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}")

    # Split the data into train and test sets for the current fold
    train_data = dataset_panda.iloc[train_indices].reset_index(drop=True)
    test_data = dataset_panda.iloc[test_indices].reset_index(drop=True)
    
    dataset_test=CustomDataset(test_data)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

                             
    # define hyperparameters
    input_size= dataset_all.num_features
    hidden_size= 2048
    num_class = dataset_all.num_classes
    num_epochs=30
    learning_rate = 1e-4
    dropout_rate = 0.35
    weight_decay= 1e-2
    factor=0.7
    print(input_size, num_class)

    # Create an instance of your model
    model = Allergy_Net(input_size,hidden_size,num_class,dropout_rate).to(device)

    # Define your loss function and optimizer
    criterion= nn.CrossEntropyLoss()
    optimizer = torch.optim.Adamax(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, patience=5, verbose=True)
    
    # Initialization of some indicators that are used to save best model during training
    best_f1_score = 0.0
    best_model_state = None
    
    # Train your model
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        model.train()
        random_key = random.randint(2, 6)
        selected_technique = sampling_techniques[random_key]
        print(selected_technique)
        # Apply the sampling technique
        train_dataset = CustomDataset(train_data, transform=selected_technique)
        dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

         # Update class weights
        class_weights = calculate_class_weights(train_dataset)
        criterion.weight = class_weights

        true_labels = []
        predicted_labels = []
        model.train()
        for i,(data,labels) in enumerate (dataloader):
            data= data.to(device)
            labels= labels.to(device)

            #forward
            outputs=model(data)
            loss= criterion(outputs,labels)

            #backward
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Calculate some metrics
            _, predictions = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

        acc = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='macro')


        print (f'epoch {epoch+1}/{num_epochs}, loss = {loss:.5f}, train_acc = {acc:.4f}, F1 Score_Train = {f1:.4f}')

        # Test 
        model.eval()
        with torch.no_grad():
            true_labels_test = []
            predicted_labels_test = []
            for i,(data,labels) in enumerate (test_loader):
                data= data.to(device)
                labels= labels.to(device)

                outputs = model(data)

                # return value and index of the best class
                _, predictions = torch.max(outputs, 1)

                true_labels_test.extend(labels.cpu().numpy())
                predicted_labels_test.extend(predictions.cpu().numpy())


            test_accuracy = accuracy_score(true_labels_test, predicted_labels_test)
            f1_test = f1_score(true_labels_test, predicted_labels_test, average='macro')

            lr_scheduler.step(f1_test + (f1*0.1))

            # Check if the current model is the best one based on f1 score

            if f1_test + (f1*0.005) > best_f1_score:
                best_f1_score = f1_test+(f1*0.005) 
                best_model_state = model.state_dict()
                best_model= model
                torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
                print('\033[91m'+'MODEL_SAVE')
                print(f'Accuracy_test = {test_accuracy:.4f}, F1 Score_test = {f1_test:.4f}'+'\033[0m')

            print(' ')

    # Store the trained model for the current fold
    fold_models.append(best_model)

Fold 1
467 2
SMOTETomek()
epoch 1/30, loss = 0.54335, train_acc = 0.7575, F1 Score_Train = 0.7575
[91mMODEL_SAVE
Accuracy_test = 0.8161, F1 Score_test = 0.4667[0m
 
RandomOverSampler()
epoch 2/30, loss = 0.36898, train_acc = 0.8258, F1 Score_Train = 0.8258
[91mMODEL_SAVE
Accuracy_test = 0.8395, F1 Score_test = 0.4762[0m
 
SMOTETomek(sampling_strategy='all')
epoch 3/30, loss = 0.29376, train_acc = 0.8876, F1 Score_Train = 0.8874
[91mMODEL_SAVE
Accuracy_test = 0.8696, F1 Score_test = 0.4894[0m
 
SMOTETomek()
epoch 4/30, loss = 0.31594, train_acc = 0.9054, F1 Score_Train = 0.9053
[91mMODEL_SAVE
Accuracy_test = 0.8763, F1 Score_test = 0.4925[0m
 
SMOTETomek(sampling_strategy='all')
epoch 5/30, loss = 0.30081, train_acc = 0.9213, F1 Score_Train = 0.9212
[91mMODEL_SAVE
Accuracy_test = 0.8997, F1 Score_test = 0.5047[0m
 
RandomOverSampler()
epoch 6/30, loss = 0.23208, train_acc = 0.9114, F1 Score_Train = 0.9113
[91mMODEL_SAVE
Accuracy_test = 0.9030, F1 Score_test = 0.5067[0m
 
Bor

In [22]:
len(fold_models)

10

In [116]:
dataset_all=CustomDataset(dataset_panda)   
test_all = DataLoader(dataset_all, batch_size=64, shuffle=False)

predictions_all = []
max_samples = 0
for model in fold_models:
    pred_model=[]
    model.eval()
    with torch.no_grad():
        for data, labels in test_all:
            model = model.to(device)
            data= data.to(device)
            labels= labels.to(device)

            outputs = model(data)
            pred_model.extend(outputs)
    predictions_all.append(pred_model)

In [36]:
len(predictions_all[0])

2989

In [78]:
class MetaModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(MetaModel, self).__init__()
        self.fc1 = nn.Linear(input_size , len(fold_models))
        self.fc2 = nn.Linear(len(fold_models), output_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Create an instance of the meta-model
meta_model = MetaModel(10, 2)

combined_pred=[]
for i in range(0,len(predictions_all[0])):
    combined_pred_mini=[]
    for j in range (0, len(predictions_all)):
        combined_pred_mini.extend(predictions_all[j][i])
    combined_pred.append(combined_pred_mini)

In [79]:
combined_pred[0]

[tensor(1.1501),
 tensor(-1.2536),
 tensor(1.0965),
 tensor(-1.3371),
 tensor(1.3654),
 tensor(-1.1583),
 tensor(-0.0392),
 tensor(0.1278),
 tensor(0.9168),
 tensor(-0.5533)]

In [80]:
liste_tensor=[]
for i in range (0, len(combined_pred)):
    combined_tensor = torch.stack(combined_pred[i], dim=0)
    liste_tensor.append(combined_tensor)
len(liste_tensor)


2989

In [81]:
combined_labels = dataset_all.labels # Use max_samples to handle variable-sized tensors

# Define the loss function and optimizer for the meta-model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(meta_model.parameters(), lr=learning_rate)

In [None]:
# Train the meta-model
meta_model.train()
for epoch in range(num_epochs):
    true_labels = []
    predicted_labels = []
    optimizer.zero_grad()
    for elem,label in zip (liste_tensor,combined_labels):
        outputs = meta_model(elem)
        
        loss = criterion(outputs.unsqueeze(0), torch.tensor([label]))
        loss.backward()
        optimizer.step()
        
        # Return the value and index of the best class
        _, predictions = torch.max(outputs, 0)  # Assuming outputs has shape (num_classes,)
        true_labels.append(label.item())
        predicted_labels.append(predictions.item())

        data_accuracy = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='macro')
    print(f'Accuracy = {data_accuracy:.3f}')
    print(f'F1 Score = {f1:.4f}')

    

In [23]:
def meta_train_method(df,list_of_fold_model):
    num_epochs = 20
    # Dataset creation
    dataset_all=CustomDataset(df)   
    test_all = DataLoader(dataset_all, batch_size=64, shuffle=False)
    
    # obtain prediction for each folds
    predictions_all = []
    for model in list_of_fold_model:
        pred_model=[]
        model.eval()
        with torch.no_grad():
            for data, labels in test_all:
                model = model.to(device)
                data= data.to(device)
                labels= labels.to(device)

                outputs = model(data)
                pred_model.extend(outputs)
        predictions_all.append(pred_model)
    
    class MetaModel(nn.Module):
        def __init__(self, input_size, output_size):
            super(MetaModel, self).__init__()
            self.fc1 = nn.Linear(input_size , len(list_of_fold_model)*len(list_of_fold_model))
            self.fc2 = nn.Linear(len(list_of_fold_model)*len(list_of_fold_model), output_size)
            self.dropout1 = nn.Dropout(0.15)

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = self.dropout1(x)
            x = self.fc2(x)
            return x

    # Create an instance of the meta-model
    meta_model = MetaModel(int(len(list_of_fold_model))*2, 2)

    combined_pred=[]
    for i in range(0,len(predictions_all[0])):
        combined_pred_mini=[]
        for j in range (0, len(predictions_all)):
            combined_pred_mini.extend(predictions_all[j][i])
        combined_pred.append(combined_pred_mini)
    liste_tensor=[]
    for i in range (0, len(combined_pred)):
        combined_tensor = torch.stack(combined_pred[i], dim=0)
        liste_tensor.append(combined_tensor)
    
    combined_labels = dataset_all.labels
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adamax(meta_model.parameters(), lr=1e-4, weight_decay=1e-1)
    
    # Train the meta-model
    meta_model.train()
    for epoch in range(num_epochs):
        true_labels = []
        predicted_labels = []
        optimizer.zero_grad()
        for elem,label in zip (liste_tensor,combined_labels):
            outputs = meta_model(elem)

            loss = criterion(outputs.unsqueeze(0), torch.tensor([label]))
            loss.backward()
            optimizer.step()

            # Return the value and index of the best class
            _, predictions = torch.max(outputs, 0)
            true_labels.append(label.item())
            predicted_labels.append(predictions.item())

            data_accuracy = accuracy_score(true_labels, predicted_labels)
            f1 = f1_score(true_labels, predicted_labels, average='macro')
        print(f'Accuracy = {data_accuracy:.3f}')
        print(f'F1 Score = {f1:.4f}')

In [24]:
meta_train_method(dataset_panda,fold_models)

Accuracy = 0.989
F1 Score = 0.6164
Accuracy = 0.992
F1 Score = 0.7319
Accuracy = 0.995
F1 Score = 0.8466
Accuracy = 0.994
F1 Score = 0.7775
Accuracy = 0.993
F1 Score = 0.7809
Accuracy = 0.994
F1 Score = 0.8251
Accuracy = 0.994
F1 Score = 0.8177
Accuracy = 0.995
F1 Score = 0.8397
Accuracy = 0.994
F1 Score = 0.8009
Accuracy = 0.996
F1 Score = 0.8561
Accuracy = 0.996
F1 Score = 0.8712
Accuracy = 0.994
F1 Score = 0.8097
Accuracy = 0.994
F1 Score = 0.8319
Accuracy = 0.996
F1 Score = 0.8477
Accuracy = 0.994
F1 Score = 0.8185
Accuracy = 0.995
F1 Score = 0.8530
Accuracy = 0.994
F1 Score = 0.7616
Accuracy = 0.993
F1 Score = 0.7748
Accuracy = 0.995
F1 Score = 0.8238
Accuracy = 0.996
F1 Score = 0.8477


In [61]:
def meta_train_method_2_resampled(df,list_of_fold_model):
    num_epochs = 10
    # Dataset creation
    dataset_all=CustomDataset(df)   
    test_all = DataLoader(dataset_all, batch_size=64, shuffle=False)
    
    # Generate oversampling resampling dataset to inject into the folds 
    adasyn = oversampling.ADASYN(random_state=123)
    data_resampled, labels_resampled = adasyn.fit_resample(df.drop('Type_of_Venom_Allergy_IGE_Venom', axis=1), dataset_panda['Type_of_Venom_Allergy_IGE_Venom'])

    # Convert train and test data into PyTorch datasets
    class CustomDataset_2(Dataset):
        def __init__(self, features, labels):
            self.features = features
            self.labels = labels

        def __len__(self):
            return len(self.features)

        def __getitem__(self, idx):
            feature = torch.tensor(self.features.iloc[idx].values, dtype=torch.float32)
            label = torch.tensor(int(self.labels[idx]), dtype=torch.long)
            return feature, label
    
    full_dataset_resampled = CustomDataset_2(data_resampled, labels_resampled) 
    full_dataloader_resampled= DataLoader(full_dataset_resampled, batch_size=64, shuffle=False)
    
    
    
    # obtain prediction for each folds
    predictions_all = []
    for model in list_of_fold_model:
        pred_model=[]
        model.eval()
        with torch.no_grad():
            for data, labels in full_dataloader_resampled:
                model = model.to(device)
                data= data.to(device)
                labels= labels.to(device)

                outputs = model(data)
                pred_model.extend(outputs)
        predictions_all.append(pred_model)
    
    class MetaModel(nn.Module):
        def __init__(self, input_size, output_size):
            super(MetaModel, self).__init__()
            self.fc1 = nn.Linear(input_size , len(list_of_fold_model)*len(list_of_fold_model))
            self.fc2 = nn.Linear(len(list_of_fold_model)*len(list_of_fold_model), output_size)
            self.dropout1 = nn.Dropout(0.5)

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = self.dropout1(x)
            x = self.fc2(x)
            return x

    # Create an instance of the meta-model
    meta_model = MetaModel(int(len(list_of_fold_model))*2, 2)

    combined_pred=[]
    for i in range(0,len(predictions_all[0])):
        combined_pred_mini=[]
        for j in range (0, len(predictions_all)):
            combined_pred_mini.extend(predictions_all[j][i])
        combined_pred.append(combined_pred_mini)
    liste_tensor=[]
    for i in range (0, len(combined_pred)):
        combined_tensor = torch.stack(combined_pred[i], dim=0)
        liste_tensor.append(combined_tensor)
        
    combined_labels = full_dataset_resampled.labels
    
    
    

    # Convert tensors to numpy arrays
    numpy_array_list = [tensor.numpy() for tensor in liste_tensor]

    # Create a Pandas Series from the numpy array list
    series = pd.Series(numpy_array_list)
    
    train_data,test_data,train_labels,test_labels= train_test_split(series, combined_labels, test_size=0.2, random_state=126)
     # Reindex train and test datasets
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    train_labels.reset_index(drop=True, inplace=True)
    test_labels.reset_index(drop=True, inplace=True)
    
    # Convert train and test data into PyTorch datasets
    class CustomDataset_3(Dataset):
        def __init__(self, features, labels):
            self.features = features
            self.labels = labels

        def __len__(self):
            return len(self.features)

        def __getitem__(self, idx):
            feature = torch.tensor(self.features.iloc[idx], dtype=torch.float32)
            label = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
            return feature, label
        
        
    train_dataset = CustomDataset_3(train_data, train_labels)
    test_dataset = CustomDataset_3(test_data, test_labels)
    
    # Create DataLoader for train and test datasets
    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adamax(meta_model.parameters(), lr=1e-4, weight_decay=10)
    
    # Initialization of some indicators that are used to save best model during training
    best_f1_score = 0.0
    best_model_state = None
    
    # Train the meta-model
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        meta_model.train()
        true_labels = []
        predicted_labels = []
        model.train()
        for i,(data,labels) in enumerate (train_loader):
            data= data.to(device)
            labels= labels.to(device)

            #forward
            outputs=meta_model(data)
            loss= criterion(outputs,labels)

            #backward
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Calculate some metrics
            _, predictions = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

        acc = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='macro')


        print (f'epoch {epoch+1}/{num_epochs}, loss = {loss:.5f}, train_acc = {acc:.4f}, F1 Score_Train = {f1:.4f}')

        # Test 
        meta_model.eval()
        with torch.no_grad():
            true_labels_test = []
            predicted_labels_test = []
            for i,(data,labels) in enumerate (test_loader):
                data= data.to(device)
                labels= labels.to(device)

                outputs = meta_model(data)

                # return value and index of the best class
                _, predictions = torch.max(outputs, 1)

                true_labels_test.extend(labels.cpu().numpy())
                predicted_labels_test.extend(predictions.cpu().numpy())


            test_accuracy = accuracy_score(true_labels_test, predicted_labels_test)
            f1_test = f1_score(true_labels_test, predicted_labels_test, average='macro')

            # Check if the current model is the best one based on f1 score

            if f1_test + (f1*0.005) > best_f1_score:
                best_f1_score = f1_test+(f1*0.005) 
                best_model_state = meta_model.state_dict()
                torch.save(meta_model.state_dict(), 'best_model.pth')  # Save the best model
                print('\033[91m'+'MODEL_SAVE')
                print(f'Accuracy_test = {test_accuracy:.4f}, F1 Score_test = {f1_test:.4f}'+'\033[0m')

            print(' ')

In [62]:
meta_train_method_2_resampled(dataset_panda,fold_models)

epoch 1/10, loss = 0.53098, train_acc = 0.7342, F1 Score_Train = 0.7216
[91mMODEL_SAVE
Accuracy_test = 0.9208, F1 Score_test = 0.9206[0m
 
epoch 2/10, loss = 0.51084, train_acc = 0.7723, F1 Score_Train = 0.7639
[91mMODEL_SAVE
Accuracy_test = 0.9377, F1 Score_test = 0.9376[0m
 
epoch 3/10, loss = 0.56638, train_acc = 0.8005, F1 Score_Train = 0.7949
[91mMODEL_SAVE
Accuracy_test = 0.9469, F1 Score_test = 0.9469[0m
 
epoch 4/10, loss = 0.57781, train_acc = 0.8275, F1 Score_Train = 0.8239
[91mMODEL_SAVE
Accuracy_test = 0.9537, F1 Score_test = 0.9537[0m
 
epoch 5/10, loss = 0.53952, train_acc = 0.8564, F1 Score_Train = 0.8541
[91mMODEL_SAVE
Accuracy_test = 0.9570, F1 Score_test = 0.9570[0m
 
epoch 6/10, loss = 0.50304, train_acc = 0.8825, F1 Score_Train = 0.8813
[91mMODEL_SAVE
Accuracy_test = 0.9587, F1 Score_test = 0.9587[0m
 
epoch 7/10, loss = 0.55599, train_acc = 0.9044, F1 Score_Train = 0.9037
[91mMODEL_SAVE
Accuracy_test = 0.9612, F1 Score_test = 0.9612[0m
 
epoch 8/10, l

## Target: Type_of_Venom_Allergy_IGE_Venom

In [32]:
model=training_by_target(encode_data,'Type_of_Venom_Allergy_IGE_Venom',batch_size=64,learning_rate = 1e-4,dropout_rate= 0.7, weight_decay=1e-1, factor=0.75)

0    2967
1      22
Name: Type_of_Venom_Allergy_IGE_Venom, dtype: int64
467 2
BorderlineSMOTE()
epoch 1/60, loss = 0.52947, train_acc = 0.5609, F1 Score_Train = 0.5596
[91mMODEL_SAVE
Accuracy_test = 0.8396, F1 Score_test = 0.4724[0m
 
BorderlineSMOTE()
epoch 2/60, loss = 0.49253, train_acc = 0.6609, F1 Score_Train = 0.6605
 
SMOTETomek()
epoch 3/60, loss = 0.78489, train_acc = 0.6337, F1 Score_Train = 0.6334
 
RandomOverSampler()
epoch 4/60, loss = 0.65549, train_acc = 0.6411, F1 Score_Train = 0.6407
 
SMOTETomek()
epoch 5/60, loss = 0.61198, train_acc = 0.6845, F1 Score_Train = 0.6845
 
SMOTETomek(sampling_strategy='all')
epoch 6/60, loss = 0.56566, train_acc = 0.6966, F1 Score_Train = 0.6966
 
BorderlineSMOTE()
epoch 7/60, loss = 0.42533, train_acc = 0.7921, F1 Score_Train = 0.7920
 
SMOTETomek()
epoch 8/60, loss = 0.52072, train_acc = 0.7431, F1 Score_Train = 0.7431
 
RandomOverSampler()
epoch 9/60, loss = 0.46614, train_acc = 0.7288, F1 Score_Train = 0.7288
 
RandomOverSampler()


In [43]:
model_to_save = Allergy_Net(input_size=467, hidden_size=2048, num_class=2,dropout_rate=0.6).to(device)
model_to_save.load_state_dict(torch.load('best_model.pth'))
torch.save(model_to_save.state_dict(), 'Type_of_Venom_Allergy_IGE_Venom_Pytorch_2.pth')

In [20]:
def get_all_without_optim(list_of_targets):
    liste_column_monovalue=[]
    for column in list_of_targets.columns:
        rows_with_9 = Targets[column].isin([9])
        Targets_without_9 = Targets[column][~rows_with_9]
        encode_data_without_9 = encode_data[~rows_with_9]
    
        dataset_panda= pd.concat([encode_data_without_9,Targets_without_9], axis = 1).reset_index(drop=True)
        
        num_class = dataset_panda[column].nunique()
        if num_class >1:
            model=training_by_target(column,batch_size=64,learning_rate = 1e-4,dropout_rate= 0.4, weight_decay=1e-2, factor=0.1)
            model_to_save = Allergy_Net(input_size=467, hidden_size=1024, num_class=num_class,dropout_rate=0.45).to(device)
            model_to_save.load_state_dict(torch.load('best_model.pth'))
            torch.save(model_to_save.state_dict(), f'{column}_Pytorch_no_optim_aug_bef_1')
        else:
            liste_column_monovalue.append(column)
    return liste_column_monovalue        

In [None]:
liste_column_monovalue_train = get_all_without_optim(Targets)

1    1368
0     391
Name: Allergy_Present, dtype: int64
467 2
epoch 1/100, loss = 0.66966, train_acc = 0.5161, F1 Score_Train = 0.5095
[91mMODEL_SAVE
Accuracy_test = 0.5790, F1 Score_test = 0.5556[0m
 
epoch 2/100, loss = 0.57107, train_acc = 0.6099, F1 Score_Train = 0.6093
[91mMODEL_SAVE
Accuracy_test = 0.6434, F1 Score_test = 0.6403[0m
 
epoch 3/100, loss = 0.61164, train_acc = 0.6375, F1 Score_Train = 0.6368
[91mMODEL_SAVE
Accuracy_test = 0.6746, F1 Score_test = 0.6715[0m
 
epoch 4/100, loss = 0.54282, train_acc = 0.6785, F1 Score_Train = 0.6784
[91mMODEL_SAVE
Accuracy_test = 0.6801, F1 Score_test = 0.6792[0m
 
epoch 5/100, loss = 0.71433, train_acc = 0.6799, F1 Score_Train = 0.6798
[91mMODEL_SAVE
Accuracy_test = 0.7040, F1 Score_test = 0.7031[0m
 
epoch 6/100, loss = 0.48520, train_acc = 0.6895, F1 Score_Train = 0.6895
[91mMODEL_SAVE
Accuracy_test = 0.7077, F1 Score_test = 0.7068[0m
 
epoch 7/100, loss = 0.55865, train_acc = 0.7042, F1 Score_Train = 0.7041
[91mMODEL_SA

In [21]:
liste_column_monovalue_train=['Type_of_Food_Allergy_Other', 'Type_of_Food_Allergy_Cereals_&_Seeds']

## Obtain the prediction for the train dataset

In [62]:
data= pd.read_csv('data/train.csv', low_memory=False)
data_test= pd.read_csv('data/test.csv', low_memory=False)

In [63]:
encode_data,Targets = preprocessing_data(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 467 entries, Age to Treatment_of_rhinitis_9
dtypes: float16(467)
memory usage: 2.7 MB
None


In [72]:
folder_path = 'liste_classifier_reboot2_first_iteration_binary'

def obtain_pred(data, folder_path):
    
    liste=[]
    for column in Targets.columns:
        # get the dataset ready
        print(column)
        rows_with_9 = Targets[column].isin([9])
        Targets_without_9 = Targets[column][~rows_with_9]
        encode_data_without_9 = data[~rows_with_9]
    
        dataset_panda= pd.concat([encode_data_without_9,Targets_without_9], axis = 1).reset_index(drop=True)
        dataset_all=CustomDataset(dataset_panda)
        num_class = dataset_all.num_classes
        
        hidden_size = 1024
        input_size = 467
        
        if num_class == 1:
            continue
       
        for subdir, dirs, files in os.walk(folder_path):
            for file in files:
                file_name = os.path.splitext(file)[0]
                if str(column) +'_Pytorch_no_optim_aug_bef_1' == file_name :
                    print("Calling model:", file)
                    file_name = os.path.splitext(file)[0]

                    model = Allergy_Net(input_size=input_size, hidden_size=hidden_size, num_class=num_class,dropout_rate=0.45).to(device)
                    model.load_state_dict(torch.load(os.path.join(subdir, file_name)))
                    model.eval()

                    # Convert the data to PyTorch tensors
                    data_tensor = torch.tensor(data.values, dtype=torch.float32)

                    # Make predictions
                    with torch.no_grad():
                        outputs = model(data_tensor)
                        _, predicted_labels = torch.max(outputs, dim=1)

                        # Create a DataFrame to store the predicted labels and return the right format(0,1,9)
                        values = pd.DataFrame({'pred_label_' + str(file): predicted_labels})
                        values_name = '{}_{}'.format(file_name, 'values')
                        globals()[values_name] = values.rename(columns={'pred_label_' + str(file): 'pred_label ' + str(values_name)})
                        liste.append(globals()[values_name])

    return liste

In [73]:
liste_df=obtain_pred(encode_data, folder_path)

Allergy_Present
Calling model: Allergy_Present_Pytorch_no_optim_aug_bef_1
Severe_Allergy
Calling model: Severe_Allergy_Pytorch_no_optim_aug_bef_1
Respiratory_Allergy
Calling model: Respiratory_Allergy_Pytorch_no_optim_aug_bef_1
Food_Allergy
Calling model: Food_Allergy_Pytorch_no_optim_aug_bef_1
Venom_Allergy
Calling model: Venom_Allergy_Pytorch_no_optim_aug_bef_1
Type_of_Respiratory_Allergy_ARIA
Calling model: Type_of_Respiratory_Allergy_ARIA_Pytorch_no_optim_aug_bef_1
Type_of_Respiratory_Allergy_CONJ
Calling model: Type_of_Respiratory_Allergy_CONJ_Pytorch_no_optim_aug_bef_1
Type_of_Respiratory_Allergy_GINA
Calling model: Type_of_Respiratory_Allergy_GINA_Pytorch_no_optim_aug_bef_1
Type_of_Respiratory_Allergy_IGE_Pollen_Gram
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Gram_Pytorch_no_optim_aug_bef_1
Type_of_Respiratory_Allergy_IGE_Pollen_Herb
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Herb_Pytorch_no_optim_aug_bef_1
Type_of_Respiratory_Allergy_IGE_Pollen_Tree
Callin

In [66]:
print(len(liste_df))
liste_df[0]

27


Unnamed: 0,pred_label Allergy_Present_Pytorch_no_optim_aug_bef_1_values
0,1
1,0
2,0
3,1
4,0
...,...
2984,1
2985,0
2986,1
2987,1


In [67]:
merged_df = pd.DataFrame()
for i,df in enumerate(liste_df):
    if i==0:
        merged_df = df
    else:
        last_two_columns = df.iloc[:, -2:]
        merged_df = pd.concat([merged_df, last_two_columns], axis=1)
        
added_list = ['pred_label ' + s + '_Pytorch_no_optim_aug_bef_1_values' for s in liste_column_monovalue_train]
zeros_data = pd.DataFrame(0, index=np.arange(len(merged_df)), columns= added_list)
merged_df = pd.concat([merged_df, zeros_data], axis=1)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Data columns (total 29 columns):
 #   Column                                                                                       Non-Null Count  Dtype
---  ------                                                                                       --------------  -----
 0   pred_label Allergy_Present_Pytorch_no_optim_aug_bef_1_values                                 2989 non-null   int64
 1   pred_label Severe_Allergy_Pytorch_no_optim_aug_bef_1_values                                  2989 non-null   int64
 2   pred_label Respiratory_Allergy_Pytorch_no_optim_aug_bef_1_values                             2989 non-null   int64
 3   pred_label Food_Allergy_Pytorch_no_optim_aug_bef_1_values                                    2989 non-null   int64
 4   pred_label Venom_Allergy_Pytorch_no_optim_aug_bef_1_values                                   2989 non-null   int64
 5   pred_label Type_of_Respiratory_Allergy_ARIA_Pyto

In [68]:
data_true= pd.read_csv('data/train.csv', low_memory=False)

In [69]:
import numpy as np
target_columns = ['Type_of_Food_Allergy_Other_Legumes','Type_of_Food_Allergy_Cereals_&_Seeds', 'Allergy_Present', 'Type_of_Food_Allergy_Mammalian_Milk', 'Type_of_Food_Allergy_Other',
                  'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach', 'Venom_Allergy', 'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Food_Allergy', 'Type_of_Food_Allergy_Oral_Syndrom','Type_of_Food_Allergy_Tree_Nuts', 'Severe_Allergy',
                  'Type_of_Food_Allergy_Aromatics', 'Type_of_Venom_Allergy_IGE_Venom', 'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Respiratory_Allergy_CONJ', 'Type_of_Food_Allergy_Peanut',
                  'Type_of_Food_Allergy_Egg', 'Type_of_Food_Allergy_Fish', 'Type_of_Respiratory_Allergy_GINA', 'Respiratory_Allergy', 'Type_of_Food_Allergy_TPO',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Tree', 'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast',
                  'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Food_Allergy_Shellfish']
f1_scores=[]
below_threshold_columns=[]
threshold =0.89
for column in target_columns:
    # Get the predicted and true labels for the column
    
    rows_with_9 = data_true[column].isin([9])
    Targets_without_9 = data_true[column][~rows_with_9]
    merged_df_removal = merged_df[~rows_with_9]
        
    merged_df_mod = pd.concat([merged_df_removal, Targets_without_9] , axis=1).reset_index(drop=True)
    y_true = merged_df_mod[column]
    y_pred = merged_df_mod['pred_label '+ column + '_Pytorch_no_optim_aug_bef_1_values']
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # Calculate recall
    recall = recall_score(y_true, y_pred)
    
    # Calculate F1 score
    if (data_true[column] == 1).any():
        f1 = f1_score(y_true, y_pred,average='macro')
    else:
        f1 = f1_score(y_true, y_pred, pos_label=0)
    cm = confusion_matrix(y_true, y_pred)

    # Display the metrics and confusion matrix
    print(f"Metrics for {column}:")
    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("Confusion Matrix:")
    print(cm)
    print()
    if f1 != 0:
        f1_scores.append(f1) # Append the F1 score to the list
    if f1 < threshold:
        below_threshold_columns.append(column)

mean_f1 = sum(f1_scores) / len(f1_scores)  # Calculate the mean of F1 scores

print(f"Mean F1 Score: {mean_f1}")
print(below_threshold_columns)

Metrics for Type_of_Food_Allergy_Other_Legumes:
Accuracy: 0.9975108898568762
Recall: 1.0
F1 Score: 0.9801261439525104
Confusion Matrix:
[[1553    4]
 [   0   50]]

Metrics for Type_of_Food_Allergy_Cereals_&_Seeds:
Accuracy: 1.0
Recall: 0.0
F1 Score: 1.0
Confusion Matrix:
[[1607]]

Metrics for Allergy_Present:
Accuracy: 0.9403069926094372
Recall: 0.9364035087719298
F1 Score: 0.918622833720403
Confusion Matrix:
[[ 373   18]
 [  87 1281]]

Metrics for Type_of_Food_Allergy_Mammalian_Milk:
Accuracy: 0.9975108898568762
Recall: 1.0
F1 Score: 0.962330051570558
Confusion Matrix:
[[1578    4]
 [   0   25]]

Metrics for Type_of_Food_Allergy_Other:
Accuracy: 1.0
Recall: 0.0
F1 Score: 1.0
Confusion Matrix:
[[1607]]

Metrics for Type_of_Respiratory_Allergy_IGE_Mite_Cockroach:
Accuracy: 0.8135368281353683
Recall: 0.7205882352941176
F1 Score: 0.8084321745027105
Confusion Matrix:
[[736  91]
 [190 490]]

Metrics for Venom_Allergy:
Accuracy: 0.9959852793576447
Recall: 1.0
F1 Score: 0.9220599739243807
Con

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Generate prediction for the test set

In [48]:
original_validation_data = pd.read_csv('data/test.csv')

data_test = original_validation_data.set_index('trustii_id')

In [49]:
encode_data_test = preprocessing_data_test(data_test)
missing_cols = set(encode_data.columns) ^ set(encode_data_test.columns)
print(missing_cols)
len(missing_cols)
encode_data_test = encode_data_test.reindex(columns=encode_data.columns, fill_value=0).astype('float16')
encode_data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 586 entries, 1 to 1282
Columns: 443 entries, Gender to Treatment_of_rhinitis_9
dtypes: float16(443)
memory usage: 527.8 KB
None
{'French_Residence_Department_deptHHH', 'Treatment_of_athsma_10', 'French_Residence_Department_deptCCCC', 'French_Residence_Department_deptU', 'French_Residence_Department_deptDDD', 'French_Residence_Department_deptK', 'French_Residence_Department_deptQQQ', 'French_Residence_Department_deptDD', 'French_Residence_Department_deptNNN', 'French_Residence_Department_deptT', 'French_Residence_Department_deptIII', 'French_Residence_Department_deptP', 'French_Residence_Department_deptTTT', 'French_Residence_Department_deptZZZ', 'French_Residence_Department_deptOOO', 'Age', 'French_Residence_Department_deptMMM', 'French_Residence_Department_deptRRR', 'French_Residence_Department_deptW', 'Treatment_of_atopic_dematitis_7', 'French_Region_regionN', 'Treatment_of_athsma_8', 'French_Region_regionO', 'French_Residence_Departm

In [50]:
def check_column_order(df1, df2):
    return list(df1.columns) == list(df2.columns)

same_order = check_column_order(encode_data, encode_data_test)
print(same_order)

True


In [53]:
folder_path = 'liste_classifier_reboot2_first_iteration_binary'

def obtain_pred_test(data, folder_path):
    
    liste=[]
    for column in Targets.columns:
        for subdir, dirs, files in os.walk(folder_path):
            for file in files:
                file_name = os.path.splitext(file)[0]
                if str(column) +'_Pytorch_no_optim_aug_bef_1' == file_name :
                    print("Calling model:", file)

                    model = Allergy_Net(input_size=467, hidden_size=1024, num_class=2,dropout_rate=0.45).to(device)
                    model.load_state_dict(torch.load(os.path.join(subdir, file_name)))
                    model.eval()

                    # Convert the data to PyTorch tensors
                    data_tensor = torch.tensor(data.values, dtype=torch.float32)

                    # Make predictions
                    with torch.no_grad():
                        outputs = model(data_tensor)
                        _, predicted_labels = torch.max(outputs, dim=1)

                        # Create a DataFrame to store the predicted labels and return the right format(0,1,9)
                        values = pd.DataFrame({'pred_label_' + str(file): predicted_labels})
                        values_name = '{}_{}'.format(file_name, 'values')
                        globals()[values_name] = values.rename(columns={'pred_label_' + str(file): 'pred_label ' + str(values_name)})
                        liste.append(globals()[values_name])

    return liste

In [54]:
liste_df=obtain_pred_test(encode_data_test, folder_path)

Calling model: Allergy_Present_Pytorch_no_optim_aug_bef_1
Calling model: Severe_Allergy_Pytorch_no_optim_aug_bef_1
Calling model: Respiratory_Allergy_Pytorch_no_optim_aug_bef_1
Calling model: Food_Allergy_Pytorch_no_optim_aug_bef_1
Calling model: Venom_Allergy_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_ARIA_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_CONJ_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_GINA_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Gram_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Herb_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_IGE_Pollen_Tree_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_IGE_Dander_Animals_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_IGE_Mite_Cockroach_Pytorch_no_optim_aug_bef_1
Calling model: Type_of_Respiratory_Allergy_IGE_Mold

In [55]:
print(len(liste_df))
liste_df[0]

27


Unnamed: 0,pred_label Allergy_Present_Pytorch_no_optim_aug_bef_1_values
0,1
1,1
2,1
3,1
4,1
...,...
581,1
582,1
583,1
584,1


In [56]:
merged_df = pd.DataFrame()
for i,df in enumerate(liste_df):
    if i==0:
        merged_df = df
    else:
        last_two_columns = df.iloc[:, -2:]
        merged_df = pd.concat([merged_df, last_two_columns], axis=1)
        
added_list = ['pred_label ' + s + '_Pytorch_no_optim_aug_bef_1_values' for s in liste_column_monovalue_train]
zeros_data = pd.DataFrame(0, index=np.arange(len(merged_df)), columns= added_list)
merged_df = pd.concat([merged_df, zeros_data], axis=1)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Data columns (total 29 columns):
 #   Column                                                                                       Non-Null Count  Dtype
---  ------                                                                                       --------------  -----
 0   pred_label Allergy_Present_Pytorch_no_optim_aug_bef_1_values                                 586 non-null    int64
 1   pred_label Severe_Allergy_Pytorch_no_optim_aug_bef_1_values                                  586 non-null    int64
 2   pred_label Respiratory_Allergy_Pytorch_no_optim_aug_bef_1_values                             586 non-null    int64
 3   pred_label Food_Allergy_Pytorch_no_optim_aug_bef_1_values                                    586 non-null    int64
 4   pred_label Venom_Allergy_Pytorch_no_optim_aug_bef_1_values                                   586 non-null    int64
 5   pred_label Type_of_Respiratory_Allergy_ARIA_Pytorc

In [57]:
merged_df.head()

Unnamed: 0,pred_label Allergy_Present_Pytorch_no_optim_aug_bef_1_values,pred_label Severe_Allergy_Pytorch_no_optim_aug_bef_1_values,pred_label Respiratory_Allergy_Pytorch_no_optim_aug_bef_1_values,pred_label Food_Allergy_Pytorch_no_optim_aug_bef_1_values,pred_label Venom_Allergy_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Respiratory_Allergy_ARIA_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Respiratory_Allergy_CONJ_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Respiratory_Allergy_GINA_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Respiratory_Allergy_IGE_Pollen_Gram_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Respiratory_Allergy_IGE_Pollen_Herb_Pytorch_no_optim_aug_bef_1_values,...,pred_label Type_of_Food_Allergy_Oral_Syndrom_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Food_Allergy_Other_Legumes_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Food_Allergy_Peanut_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Food_Allergy_Shellfish_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Food_Allergy_TPO_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Food_Allergy_Tree_Nuts_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Venom_Allergy_ATCD_Venom_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Venom_Allergy_IGE_Venom_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Food_Allergy_Other_Pytorch_no_optim_aug_bef_1_values,pred_label Type_of_Food_Allergy_Cereals_&_Seeds_Pytorch_no_optim_aug_bef_1_values
0,1,1,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
target_columns = ['Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Food_Allergy_Other_Legumes', 'Allergy_Present', 'Type_of_Food_Allergy_Mammalian_Milk', 'Type_of_Food_Allergy_Other',
                  'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach', 'Venom_Allergy', 'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Food_Allergy', 'Type_of_Food_Allergy_Oral_Syndrom','Type_of_Food_Allergy_Tree_Nuts', 'Severe_Allergy',
                  'Type_of_Food_Allergy_Aromatics', 'Type_of_Venom_Allergy_IGE_Venom', 'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Respiratory_Allergy_CONJ', 'Type_of_Food_Allergy_Peanut',
                  'Type_of_Food_Allergy_Egg', 'Type_of_Food_Allergy_Fish', 'Type_of_Respiratory_Allergy_GINA', 'Respiratory_Allergy', 'Type_of_Food_Allergy_TPO',
                  'Type_of_Respiratory_Allergy_IGE_Pollen_Tree', 'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast',
                  'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Food_Allergy_Shellfish']

for elem in target_columns:
    original_validation_data[elem]=  merged_df['pred_label '+ elem + '_Pytorch_no_optim_aug_bef_1_values']

In [59]:
original_validation_data.head(30)

Unnamed: 0,trustii_id,Patient_ID,Chip_Code,Chip_Type,Chip_Image_Name,Age,Gender,Blood_Month_sample,French_Residence_Department,French_Region,...,Type_of_Food_Allergy_Egg,Type_of_Food_Allergy_Fish,Type_of_Respiratory_Allergy_GINA,Respiratory_Allergy,Type_of_Food_Allergy_TPO,Type_of_Respiratory_Allergy_IGE_Pollen_Tree,Type_of_Food_Allergy_Fruits_and_Vegetables,Type_of_Respiratory_Allergy_IGE_Molds_Yeast,Type_of_Respiratory_Allergy_IGE_Dander_Animals,Type_of_Food_Allergy_Shellfish
0,1,PMP0156,22 262C 3858,ISAC_V2,,8.0,1.0,6.0,deptBBB,regionJ,...,0,0,1,1,0,1,0,0,1,0
1,4,PCR0234,02AHX0DC,ALEX,02AHX0DC.bmp,14.0,1.0,7.0,deptL,regionD,...,0,0,1,1,0,1,0,0,1,0
2,5,PCR0532,02AUN372,ALEX,02AUN372.png,32.0,0.0,10.0,deptUUU,regionF,...,0,0,1,1,0,1,1,1,1,0
3,7,GJH0147,EKF3830_4,ISAC_V2,EKF3830_4_2200444337_2023_2_17_11_58_24.bmp,65.0,1.0,8.0,deptQ,regionF,...,0,0,1,0,0,0,0,0,0,0
4,8,TXV0009,881204001164,ISAC_V1,1G20027_2_881204001164_2012_4_25_18_32_58.bmp,5.0,0.0,4.0,deptII,regionC,...,0,0,1,0,0,0,0,0,0,0
5,9,PCR0118,02AFA752,ALEX,,49.0,0.0,1.0,deptXXX,regionI,...,0,0,1,1,0,0,0,0,0,0
6,10,QVW0214,AB02627_3,ISAC_V1,,6.0,1.0,2.0,deptY,regionD,...,0,0,1,1,0,1,0,1,1,0
7,15,TXV0157,881602013302,ISAC_V1,BAF4027_4_881602013302_2016_2_23_16_38_11.bmp,13.0,1.0,2.0,deptRR,regionB,...,0,0,1,1,0,0,0,0,0,0
8,18,WQW0190,223112546,ISAC_V2,END0E30_1_223112546_2023_1_3_16_20_19.bmp,12.0,0.0,11.0,deptOO,regionL,...,1,0,1,0,0,0,1,0,0,0
9,23,TXV0282,881903001372,ISAC_V1,CXG1527_3_881903001372_2019_3_14_3_51_59.bmp,8.0,0.0,3.0,deptEE,regionC,...,0,0,1,1,0,0,0,0,0,0


In [60]:
original_validation_data.to_csv('Submission_Pytorch_imblearn_6_iteration_binary_no_optim.csv', index=False, encoding='UTF-8')