# Using Pytorch on CSV_preprocess

In [1]:
!pip install session_info
!pip install imbalanced-learn



#### Import libraries

In [2]:
import pandas as pd
import os
import re

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset , random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

import imblearn.over_sampling as oversampling
import imblearn.under_sampling as undersampling
import imblearn.combine as combination
import random

from torch.utils.data import TensorDataset # to recreate the modified dataset at each epoch

import session_info

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Record the versions of every imported package for reproducibility.
session_info.show()

#### Set the random seed for reproducibility

In [3]:
seed = 64
# Python, NumPy and PyTorch (CPU and CUDA) each own a separate generator.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

#### Get the data

In [4]:
# Labelled training file and unlabelled test file, read relative to the notebook directory.
# low_memory=False makes pandas read each file in one pass instead of in chunks.
raw_data= pd.read_csv('data/train.csv', low_memory=False)
data_test= pd.read_csv('data/test.csv', low_memory=False)

In [5]:
# Summary of the training frame: row count, column count, dtypes and memory footprint.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 365 entries, Patient_ID to Type_of_Venom_Allergy_IGE_Venom
dtypes: float64(322), int64(32), object(11)
memory usage: 8.3+ MB


#### Identifying which columns are the targets to predict

In [6]:
# Columns found in exactly one of the two files (symmetric difference):
# the targets that only the train file carries, plus any test-only column.
missing_cols = set(raw_data.columns).symmetric_difference(data_test.columns)
print(missing_cols)
len(missing_cols)

{'Type_of_Food_Allergy_Other_Legumes', 'Type_of_Food_Allergy_Tree_Nuts', 'Type_of_Food_Allergy_Other', 'Type_of_Respiratory_Allergy_CONJ', 'Venom_Allergy', 'Food_Allergy', 'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Respiratory_Allergy', 'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach', 'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Type_of_Food_Allergy_Peanut', 'trustii_id', 'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Respiratory_Allergy_ARIA', 'Severe_Allergy', 'Allergy_Present', 'Type_of_Venom_Allergy_IGE_Venom', 'Type_of_Food_Allergy_Aromatics', 'Type_of_Food_Allergy_Mammalian_Milk', 'Type_of_Food_Allergy_TPO', 'Type_of_Respiratory_Allergy_IGE_Pollen_Tree', 'Type_of_Food_Allergy_Oral_Syndrom', 'Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast', 'Type_of_Food_Allergy_Shellfish', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram', 'Type_of_Respiratory_Allergy_GINA', 'Type_of_Food_Allergy_Fish', 'Type

30

## Data Pre-processing

### Preprocessing for the train set

In [7]:
# The 29 target columns to predict; preprocessing_data extracts them from the training frame by name.
liste_of_Targets =['Allergy_Present', 'Severe_Allergy', 'Respiratory_Allergy', 'Food_Allergy', 'Venom_Allergy',
                     'Type_of_Respiratory_Allergy_ARIA', 'Type_of_Respiratory_Allergy_CONJ', 
                     'Type_of_Respiratory_Allergy_GINA', 'Type_of_Respiratory_Allergy_IGE_Pollen_Gram',
                     'Type_of_Respiratory_Allergy_IGE_Pollen_Herb', 'Type_of_Respiratory_Allergy_IGE_Pollen_Tree',
                     'Type_of_Respiratory_Allergy_IGE_Dander_Animals', 'Type_of_Respiratory_Allergy_IGE_Mite_Cockroach',
                     'Type_of_Respiratory_Allergy_IGE_Molds_Yeast', 'Type_of_Food_Allergy_Aromatics', 'Type_of_Food_Allergy_Other',
                     'Type_of_Food_Allergy_Cereals_&_Seeds', 'Type_of_Food_Allergy_Egg', 'Type_of_Food_Allergy_Fish',
                     'Type_of_Food_Allergy_Fruits_and_Vegetables', 'Type_of_Food_Allergy_Mammalian_Milk', 
                     'Type_of_Food_Allergy_Oral_Syndrom', 'Type_of_Food_Allergy_Other_Legumes', 'Type_of_Food_Allergy_Peanut',
                     'Type_of_Food_Allergy_Shellfish', 'Type_of_Food_Allergy_TPO', 'Type_of_Food_Allergy_Tree_Nuts',
                     'Type_of_Venom_Allergy_ATCD_Venom', 'Type_of_Venom_Allergy_IGE_Venom']
def preprocessing_data(df):
    """Turn the raw training frame into a float16 feature matrix plus its targets.

    Steps, in order:
      1. drop 'Food_Type_0', map -1 to 0, then fill missing values with -1;
      2. extract the columns named in the module-level ``liste_of_Targets``;
      3. keep five named descriptive columns plus the positional slice 8:-29;
      4. min-max scale every float64 column holding a non-integer value
         (ignoring the -1 placeholders) and bucket it into ten levels that
         end up as 0.15, 0.2, ..., 1.0 (0 stays 0, missing stays -1);
      5. one-hot encode the multi-valued categorical columns, splitting each
         cell on ',' and '.';
      6. cast everything to float16.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data. It is not modified.

    Returns
    -------
    encode_data : pandas.DataFrame
        float16 feature matrix.
    Targets : pandas.DataFrame
        The target columns, taken after the -1/NaN handling of step 1.
    """
    # drop() and replace() both return new frames, so the caller's frame is untouched.
    df = df.drop('Food_Type_0', axis=1).replace(-1, 0)
    data_noNAN = df.fillna(-1)
    # obtain Targets
    Targets = data_noNAN.loc[:, liste_of_Targets]
    # filter features
    X1 = data_noNAN.loc[:, ['Chip_Type', 'Age', 'Gender', 'French_Residence_Department', 'Blood_Month_sample']]
    X = data_noNAN.iloc[:, 8:-29]
    data = pd.concat([X1, X], axis=1)

    # 'Treatment_of_rhinitis' mixes numbers and strings: render it as text and
    # strip the ".0" that float formatting appends. The dot is escaped and the
    # match is limited to the end of a code; the previous pattern '.0' matched
    # ANY character followed by 0 and therefore erased codes such as "10".
    data['Treatment_of_rhinitis'] = (
        data['Treatment_of_rhinitis'].astype(str).str.replace(r'\.0(?=\s*(?:,|$))', '', regex=True)
    )

    # Preprocessing of numerical data

    ## Transform continuous values of chips features into discrete values
    float_cols = data.select_dtypes(include=['float64'])
    # Columns made only of whole numbers are skipped; all-zero columns keep their value anyway.
    filtered_cols = float_cols.columns[float_cols.apply(lambda x: (x % 1 != 0).any())]

    if len(filtered_cols) > 0:  # MinMaxScaler refuses a frame without columns
        filtered_data = data[filtered_cols]

        ### Hide the -1 placeholders (they become NaN) so they do not affect the scaling range
        mask = filtered_data != -1

        ### Apply Min-Max scaling
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(filtered_data[mask])

        ### Back to a DataFrame; reuse the source index so the write-back below
        ### stays aligned even when `df` does not carry a default RangeIndex.
        scaled_df = pd.DataFrame(scaled_data, columns=filtered_data.columns, index=filtered_data.index)

        thresholds = [0, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 2.5e-2, 5e-2, 1e-1, 5e-1, 1]
        levels = [1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        # The last bin is closed a little above 1 to absorb rounding in the scaled maximum.
        upper_bounds = thresholds[1:-1] + [thresholds[-1] + 0.01]
        # Every level is > 1.01, so a value already bucketed can never fall into a later bin.
        for position, (low, high, level) in enumerate(zip(thresholds[:-1], upper_bounds, levels)):
            # The first bin excludes its lower bound so that an exact 0 stays 0.
            above_low = (scaled_df > low) if position == 0 else (scaled_df >= low)
            scaled_df[above_low & (scaled_df < high)] = level

        ### Rescale the levels to (0, 1] and restore the -1 marker for missing values
        scaled_df = (scaled_df / 10).fillna(-1)

        ### Replace columns in the dataframe of features
        columns_to_update = scaled_df.columns
        data[columns_to_update] = scaled_df[columns_to_update]

    ##  Get_dummies of the 'object' type columns
    columns_to_encode = ['Chip_Type', 'French_Residence_Department', 'French_Region',
                         'Treatment_of_athsma', 'Age_of_onsets',
                         'General_cofactors', 'Treatment_of_atopic_dematitis', 'Treatment_of_rhinitis']

    ### Split the columns using multiple delimiters and create dummy columns
    dummy_dfs = []
    for col in columns_to_encode:
        # One list of stripped tokens per row, split on ',' and '.'
        tokens = data[col].astype(str).apply(lambda x: [i.strip() for i in re.split('[,.]', x)])
        # explode() gives one row per token while keeping the original row label,
        # so summing per label rebuilds a multi-hot row.
        dummy_df = pd.get_dummies(tokens.explode(), prefix=f"{col}", prefix_sep='_').groupby(level=0).sum()
        dummy_dfs.append(dummy_df)

    ### Swap the original categorical columns for their dummy columns
    df_final = pd.concat([data.drop(columns=columns_to_encode)] + dummy_dfs, axis=1)

    # Converting all values into 'float16' type
    encode_data = df_final.astype('float16')
    # info() prints by itself and returns None, so it is not wrapped in print().
    encode_data.info()

    return encode_data, Targets

In [8]:
# Build the encoded feature matrix and the target frame from the raw training data.
encode_data,Targets = preprocessing_data(raw_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2989 entries, 0 to 2988
Columns: 467 entries, Age to Treatment_of_rhinitis_9
dtypes: float16(467)
memory usage: 2.7 MB
None


### Preprocessing for the test set

In [9]:
def preprocessing_data_test(df):
    """Encode the raw test frame into a float16 feature matrix.

    Applies the same steps as the training preprocessing (drop 'Food_Type_0',
    -1 -> 0, NaN -> -1, discretised min-max scaling, one-hot encoding of the
    multi-valued categorical columns, float16 cast). It differs in two ways:
    there are no target columns, and the features are 'Chip_Type' plus every
    column from position 5 onwards.

    NOTE(review): the MinMaxScaler is fitted on the test data itself, so the
    discretisation bins are relative to the test-set range and not to the
    range seen at training time. Confirm that this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        float16 feature matrix. Its dummy columns depend on the categories
        present in `df`, so the caller has to align it with the training columns.
    """
    # drop() and replace() both return new frames, so the caller's frame is untouched.
    df = df.drop('Food_Type_0', axis=1).replace(-1, 0)
    data_test_noNAN = df.fillna(-1)
    # filter features
    X1 = data_test_noNAN.loc[:, ['Chip_Type']]
    X = data_test_noNAN.iloc[:, 5:]
    data = pd.concat([X1, X], axis=1)

    # 'Treatment_of_rhinitis' mixes numbers and strings: render it as text and
    # strip the ".0" that float formatting appends. The dot is escaped and the
    # match is limited to the end of a code; the previous pattern '.0' matched
    # ANY character followed by 0 and therefore erased codes such as "10".
    data['Treatment_of_rhinitis'] = (
        data['Treatment_of_rhinitis'].astype(str).str.replace(r'\.0(?=\s*(?:,|$))', '', regex=True)
    )
    # 'Age_of_onsets' does not have the same format in the test and train files
    data['Age_of_onsets'] = data['Age_of_onsets'].astype(str)

    # Preprocessing of numerical data

    ## Transform continuous values of chips features into discrete values
    float_cols = data.select_dtypes(include=['float64'])
    # Columns made only of whole numbers are skipped; all-zero columns keep their value anyway.
    filtered_cols = float_cols.columns[float_cols.apply(lambda x: (x % 1 != 0).any())]

    if len(filtered_cols) > 0:  # MinMaxScaler refuses a frame without columns
        filtered_data = data[filtered_cols]

        ### Hide the -1 placeholders (they become NaN) so they do not affect the scaling range
        mask = filtered_data != -1

        ### Apply Min-Max scaling
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(filtered_data[mask])

        ### Back to a DataFrame; reuse the source index so the write-back below
        ### stays aligned even when `df` does not carry a default RangeIndex.
        scaled_df = pd.DataFrame(scaled_data, columns=filtered_data.columns, index=filtered_data.index)

        thresholds = [0, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 2.5e-2, 5e-2, 1e-1, 5e-1, 1]
        levels = [1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        # The last bin is closed a little above 1 to absorb rounding in the scaled maximum.
        upper_bounds = thresholds[1:-1] + [thresholds[-1] + 0.01]
        # Every level is > 1.01, so a value already bucketed can never fall into a later bin.
        for position, (low, high, level) in enumerate(zip(thresholds[:-1], upper_bounds, levels)):
            # The first bin excludes its lower bound so that an exact 0 stays 0.
            above_low = (scaled_df > low) if position == 0 else (scaled_df >= low)
            scaled_df[above_low & (scaled_df < high)] = level

        ### Rescale the levels to (0, 1] and restore the -1 marker for missing values
        scaled_df = (scaled_df / 10).fillna(-1)

        ### Replace columns in the dataframe of features
        columns_to_update = scaled_df.columns
        data[columns_to_update] = scaled_df[columns_to_update]

    ##  Get_dummies of the 'object' type columns
    columns_to_encode = ['Chip_Type', 'French_Residence_Department', 'French_Region',
                         'Treatment_of_athsma', 'Age_of_onsets',
                         'General_cofactors', 'Treatment_of_atopic_dematitis', 'Treatment_of_rhinitis']

    ### Split the columns using multiple delimiters and create dummy columns
    dummy_dfs = []
    for col in columns_to_encode:
        # One list of stripped tokens per row, split on ',' and '.'
        tokens = data[col].astype(str).apply(lambda x: [i.strip() for i in re.split('[,.]', x)])
        # explode() gives one row per token while keeping the original row label,
        # so summing per label rebuilds a multi-hot row.
        dummy_df = pd.get_dummies(tokens.explode(), prefix=f"{col}", prefix_sep='_').groupby(level=0).sum()
        dummy_dfs.append(dummy_df)

    ### Swap the original categorical columns for their dummy columns
    df_final = pd.concat([data.drop(columns=columns_to_encode)] + dummy_dfs, axis=1)

    # Converting all values into 'float16' type
    encode_data = df_final.astype('float16')
    # info() prints by itself and returns None, so it is not wrapped in print().
    encode_data.info()

    return encode_data


In [10]:
encode_data_test = preprocessing_data_test(data_test)

# Dummy columns that exist in only one of the two encoded frames.
missing_cols = set(encode_data.columns).symmetric_difference(encode_data_test.columns)
print(missing_cols)

# Give the test matrix exactly the training columns, in the training order:
# categories never seen in the test file become all-zero columns, and
# test-only categories are dropped.
encode_data_test = (
    encode_data_test
    .reindex(columns=encode_data.columns, fill_value=0)
    .astype('float16')
)
encode_data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1282 entries, 0 to 1281
Columns: 448 entries, Age to Treatment_of_rhinitis_9
dtypes: float16(448)
memory usage: 1.1 MB
None
{'French_Region_regionN', 'French_Residence_Department_deptK', 'French_Residence_Department_deptZZZ', 'Treatment_of_atopic_dematitis_7', 'Treatment_of_athsma_8', 'French_Residence_Department_deptNNN', 'French_Residence_Department_deptP', 'French_Residence_Department_deptQQQ', 'French_Residence_Department_deptUU', 'French_Residence_Department_deptW', 'French_Residence_Department_deptAAAA', 'General_cofactors_11', 'French_Residence_Department_deptDD', 'French_Residence_Department_deptCCCC', 'French_Residence_Department_deptIII', 'French_Residence_Department_deptT', 'French_Residence_Department_deptOOO', 'French_Residence_Department_deptMMM', 'French_Residence_Department_deptDDD', 'French_Residence_Department_deptJJJ', 'French_Residence_Department_deptHHH', 'French_Residence_Department_deptRRR', 'French_Residence_Depa

## For one class (each class can use different hyperparameters to achieve the best results, so we explore the targets one by one and keep the best model for each)

## For all targets

#### Transform 9 into 2 for multiclass pytorch

In [11]:
# Class ids must be consecutive (0, 1, 2) for CrossEntropyLoss, so the code 9 becomes 2.
Targets = Targets.replace(9, 2)

#### General definitions

##### The custom dataset used to serve the data as tensors

In [12]:
class CustomDataset(Dataset):
    """Tabular dataset whose last column is the label and the rest are features.

    When `transform` is given it must expose ``fit_resample(X, y)`` (the
    imbalanced-learn sampler interface). It is applied once, at construction,
    to the whole feature matrix and label vector.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

        feature_matrix = data.iloc[:, :-1].values
        label_vector = data.iloc[:, -1].values
        if transform is not None:
            # Resampling may add and/or remove rows.
            feature_matrix, label_vector = transform.fit_resample(feature_matrix, label_vector)

        self.features = feature_matrix
        self.labels = label_vector

    def __len__(self):
        """Number of samples (after resampling, if any)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return sample `idx` as (float32 feature tensor, int64 label tensor)."""
        sample = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return sample, target

    @property
    def num_features(self):
        """Width of the feature matrix."""
        return self.features.shape[1]

    @property
    def num_classes(self):
        """Count of distinct label values present in this dataset."""
        distinct_labels = set(self.labels)
        return len(distinct_labels)

##### The class_weight calculator that will be used after each imblearn transformation

In [13]:
def calculate_class_weights(dataset):
    """Compute balanced class weights, ``n_samples / (n_classes * class_count)``.

    Parameters
    ----------
    dataset : object with a ``labels`` attribute
        `labels` holds one non-negative class id per sample (NumPy array or
        anything ``torch.as_tensor`` accepts). Whole-number floats are
        accepted and cast to integers.

    Returns
    -------
    torch.Tensor
        1-D float tensor with one weight per class id from 0 to the largest
        id present. A class id with no sample gets an infinite weight.
    """
    # torch.bincount only accepts 1-D integer input, while label columns that
    # went through pandas may come back as floats: flatten and cast first.
    labels_tensor = torch.as_tensor(dataset.labels).reshape(-1).to(torch.long)

    # Get the number of samples in each class
    class_counts = torch.bincount(labels_tensor)

    # Calculate the weight for each class as the inverse of its sample count
    total_samples = torch.sum(class_counts)
    class_weights = total_samples / (class_counts * len(class_counts))

    return class_weights

##### The dictionary of sampling techniques from which the data-augmentation method is randomly chosen at each epoch

In [14]:
# Candidate resamplers, keyed by consecutive integers starting at 2.
# SMOTE, SVMSMOTE and TomekLinks(sampling_strategy='auto') were tried and are disabled.
sampling_techniques = dict(enumerate(
    [
        oversampling.RandomOverSampler(),
        oversampling.BorderlineSMOTE(),
        undersampling.TomekLinks(sampling_strategy='all'),
        combination.SMOTETomek(sampling_strategy='auto'),
        combination.SMOTETomek(sampling_strategy='all'),
    ],
    start=2,
))

##### The architecture of the models that are trained and used to generate predictions

In [19]:
class Allergy_Net(nn.Module):
    """Four-layer fully connected classifier that returns raw logits.

    Layer widths: input_size -> hidden_size -> hidden_size/8 -> hidden_size/16
    -> num_class. One shared dropout module follows each of the three hidden
    ReLU layers, and batch normalisation is applied just before the output layer.
    """

    def __init__(self,input_size,hidden_size, num_class,dropout_rate):
        super().__init__()
        mid_width = int(hidden_size/8)
        narrow_width = int(hidden_size/16)

        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, mid_width)
        self.linear3 = nn.Linear(mid_width, narrow_width)
        self.linear4 = nn.Linear(narrow_width, num_class)

        self.dropout1 = nn.Dropout(dropout_rate)
        self.batchnorm1 = nn.BatchNorm1d(narrow_width)

    def forward(self,inputs):
        """Map a (batch, input_size) tensor to (batch, num_class) logits."""
        hidden = inputs
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = self.dropout1(torch.relu(layer(hidden)))

        # No softmax here: CrossEntropyLoss expects unnormalised logits.
        return self.linear4(self.batchnorm1(hidden))

##### The training process, used to check how the model evolves during training and when the hyperparameters change

In [59]:
def _evaluate_model(model, loader):
    """Return (accuracy, macro F1) of `model` over `loader`, in eval mode and without gradients."""
    model.eval()
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for data, labels in loader:
            outputs = model(data.to(device))
            # index of the largest logit = predicted class
            predictions = outputs.argmax(dim=1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())
    accuracy = accuracy_score(true_labels, predicted_labels)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    return accuracy, macro_f1


def training_by_target(column,batch_size=512,learning_rate = 1e-3,dropout_rate= 0.6, weight_decay=1e-2, factor=0.75,
                       test_size=0.25, num_epochs=60, hidden_size=2048):
    """Train an Allergy_Net on one target column and return the best model found.

    Each epoch the training split is resampled with a sampler drawn at random
    from the module-level ``sampling_techniques`` dict, the class weights of
    the loss are recomputed on the resampled data, and the model is scored on
    the held-out split. The weights that maximise
    ``f1_test + 0.5 * f1_train`` are written to 'best_model.pth' and
    reloaded at the end.

    NOTE(review): the held-out split drives both the learning-rate schedule
    and the checkpoint selection, so the scores printed for it are optimistic.

    Parameters
    ----------
    column : str
        Name of the target column in the module-level ``Targets`` frame.
    batch_size : int
        Batch size of every loader.
    learning_rate, weight_decay : float
        Passed to Adam.
    dropout_rate : float
        Dropout probability of the network.
    factor : float
        Multiplicative learning-rate decay of ReduceLROnPlateau (patience 5).
    test_size : float
        Fraction of rows held out for evaluation.
    num_epochs : int
        Number of training epochs.
    hidden_size : int
        Width of the first hidden layer.

    Returns
    -------
    Allergy_Net
        Model in eval mode carrying the best weights (untrained if ``num_epochs`` is 0).
    """
    print(Targets[column].value_counts())
    dataset_panda= pd.concat([encode_data,Targets[column]], axis = 1)

    # Establish the splitting and the Dataset/Loaders for the evaluation part
    train_data, test_data = train_test_split(dataset_panda, test_size=test_size, random_state=123)
    dataset_test=CustomDataset(test_data)
    dataset_all=CustomDataset(dataset_panda)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
    test_all = DataLoader(dataset_all, batch_size=batch_size, shuffle=False)

    # Sizes derived from the data
    input_size= dataset_all.num_features
    num_class = dataset_all.num_classes
    print(input_size, num_class)

    # Call a model and define loss and optimizer
    model= Allergy_Net(input_size,hidden_size,num_class,dropout_rate).to(device)
    criterion= nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, patience=5, verbose=True)

    # Best combined score so far and a private copy of the matching weights.
    # Starting at -inf guarantees the first epoch is saved, so a stale
    # 'best_model.pth' left by a previous target can never be picked up.
    best_f1_score = float('-inf')
    best_model_state = None

    # Draw from whatever the dict currently holds instead of a hard-coded key range.
    techniques = list(sampling_techniques.values())

    # Training loop
    for epoch in range(num_epochs):
        selected_technique = random.choice(techniques)
        print(selected_technique)
        # Apply the sampling technique
        train_dataset = CustomDataset(train_data, transform=selected_technique)
        # BatchNorm1d cannot process a training batch of one sample: drop the
        # trailing batch only when it would contain exactly one row.
        dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                drop_last=(len(train_dataset) % batch_size == 1))

        # Update class weights; they must live on the same device as the logits.
        criterion.weight = calculate_class_weights(train_dataset).to(device)

        true_labels = []
        predicted_labels = []
        model.train()
        for data, labels in dataloader:
            data= data.to(device)
            labels= labels.to(device)

            #forward
            optimizer.zero_grad()
            outputs=model(data)
            loss= criterion(outputs,labels)

            #backward
            loss.backward()
            optimizer.step()

            # Calculate some metrics
            predictions = outputs.argmax(dim=1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

        acc = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='macro')

        # `loss` is the loss of the last batch of the epoch
        print (f'epoch {epoch+1}/{num_epochs}, loss = {loss:.5f}, train_acc = {acc:.4f}, F1 Score_Train = {f1:.4f}')

        # Test
        test_accuracy, f1_test = _evaluate_model(model, test_loader)

        lr_scheduler.step(f1_test + (f1*0.1))

        # Check if the current model is the best one based on f1 score
        if f1_test + (f1*0.5) > best_f1_score:
            best_f1_score = f1_test+(f1*0.5)
            # state_dict() returns references to the live tensors: clone them,
            # otherwise later epochs would overwrite the "best" snapshot.
            best_model_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            torch.save(best_model_state, 'best_model.pth')  # Save the best model
            print('\033[91m'+'MODEL_SAVE')
            print(f'Accuracy_test = {test_accuracy:.4f}, F1 Score_test = {f1_test:.4f}'+'\033[0m')

        print(' ')

    # Rebuild a clean model and restore the best weights
    model = Allergy_Net(input_size, hidden_size, num_class,dropout_rate).to(device)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Test on the held-out split
    test_accuracy, f1_test = _evaluate_model(model, test_loader)
    print(f'Accuracy = {test_accuracy:.3f}')
    print(f'F1 Score = {f1_test:.4f}')

    # Test on the whole dataset (metrics computed once, after all batches)
    data_accuracy, f1 = _evaluate_model(model, test_all)
    print(f'Accuracy = {data_accuracy:.3f}')
    print(f'F1 Score = {f1:.4f}')
    return model

## Target: Type_of_Venom_Allergy_IGE_Venom

In [39]:
# Train on the 'Type_of_Venom_Allergy_IGE_Venom' target; the best weights are also written to 'best_model.pth'.
model=training_by_target('Type_of_Venom_Allergy_IGE_Venom',batch_size=512,learning_rate = 1e-3,dropout_rate= 0.6, weight_decay=1e-2, factor=0.75)

0    2967
1      22
Name: Type_of_Venom_Allergy_IGE_Venom, dtype: int64
467 2
TomekLinks(sampling_strategy='all')
epoch 1/60, loss = 0.74256, train_acc = 0.5047, F1 Score_Train = 0.3370
[91mMODEL_SAVE
Accuracy_test = 0.9920, F1 Score_test = 0.4980[0m
 
SMOTETomek()
epoch 2/60, loss = 0.60843, train_acc = 0.6328, F1 Score_Train = 0.6326
[91mMODEL_SAVE
Accuracy_test = 0.8262, F1 Score_test = 0.4743[0m
 
TomekLinks(sampling_strategy='all')
epoch 3/60, loss = 0.36876, train_acc = 0.5487, F1 Score_Train = 0.3606
 
RandomOverSampler()
epoch 4/60, loss = 0.58863, train_acc = 0.7135, F1 Score_Train = 0.7133
 
TomekLinks(sampling_strategy='all')
epoch 5/60, loss = 0.80822, train_acc = 0.5716, F1 Score_Train = 0.3713
 
SMOTETomek(sampling_strategy='all')
epoch 6/60, loss = 0.55313, train_acc = 0.7535, F1 Score_Train = 0.7530
[91mMODEL_SAVE
Accuracy_test = 0.7406, F1 Score_test = 0.4402[0m
 
SMOTETomek(sampling_strategy='all')
epoch 7/60, loss = 0.45897, train_acc = 0.7773, F1 Score_Train =

In [43]:
# Keep the best checkpoint of this target under its own file name.
# input_size is read from the encoded feature matrix instead of being
# hard-coded, so the cell keeps working when the preprocessing changes the
# number of columns. map_location lets a GPU checkpoint load on CPU.
model_to_save = Allergy_Net(input_size=encode_data.shape[1], hidden_size=2048, num_class=2,dropout_rate=0.6).to(device)
model_to_save.load_state_dict(torch.load('best_model.pth', map_location=device))
torch.save(model_to_save.state_dict(), 'Type_of_Venom_Allergy_IGE_Venom_Pytorch_2.pth')

## Target: Type_of_Venom_Allergy_ATCD_Venom

In [73]:
# Candidate resamplers for this target, keyed by consecutive integers starting at 3.
# SMOTE, SVMSMOTE and both TomekLinks variants are disabled here.
sampling_techniques = dict(enumerate(
    [
        oversampling.RandomOverSampler(),
        oversampling.BorderlineSMOTE(),
        combination.SMOTETomek(sampling_strategy='auto'),
        combination.SMOTETomek(sampling_strategy='all'),
    ],
    start=3,
))
def _evaluate_model(model, loader):
    """Return (accuracy, macro F1) of `model` over `loader`, in eval mode and without gradients."""
    model.eval()
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for data, labels in loader:
            outputs = model(data.to(device))
            # index of the largest logit = predicted class
            predictions = outputs.argmax(dim=1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())
    accuracy = accuracy_score(true_labels, predicted_labels)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    return accuracy, macro_f1


def training_by_target(column,batch_size=512,learning_rate = 1e-3,dropout_rate= 0.6, weight_decay=1e-2, factor=0.75,
                       test_size=0.15, num_epochs=60, hidden_size=2048):
    """Train an Allergy_Net on one target column and return the best model found.

    This version holds out 15 % of the rows by default. Each epoch the
    training split is resampled with a sampler drawn at random from the
    module-level ``sampling_techniques`` dict, the class weights of the loss
    are recomputed on the resampled data, and the model is scored on the
    held-out split. The weights that maximise ``f1_test + 0.5 * f1_train``
    are written to 'best_model.pth' and reloaded at the end.

    NOTE(review): the held-out split drives both the learning-rate schedule
    and the checkpoint selection, so the scores printed for it are optimistic.

    Parameters
    ----------
    column : str
        Name of the target column in the module-level ``Targets`` frame.
    batch_size : int
        Batch size of every loader.
    learning_rate, weight_decay : float
        Passed to Adam.
    dropout_rate : float
        Dropout probability of the network.
    factor : float
        Multiplicative learning-rate decay of ReduceLROnPlateau (patience 5).
    test_size : float
        Fraction of rows held out for evaluation.
    num_epochs : int
        Number of training epochs.
    hidden_size : int
        Width of the first hidden layer.

    Returns
    -------
    Allergy_Net
        Model in eval mode carrying the best weights (untrained if ``num_epochs`` is 0).
    """
    print(Targets[column].value_counts())
    dataset_panda= pd.concat([encode_data,Targets[column]], axis = 1)

    # Establish the splitting and the Dataset/Loaders for the evaluation part
    train_data, test_data = train_test_split(dataset_panda, test_size=test_size, random_state=123)
    dataset_test=CustomDataset(test_data)
    dataset_all=CustomDataset(dataset_panda)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
    test_all = DataLoader(dataset_all, batch_size=batch_size, shuffle=False)

    # Sizes derived from the data
    input_size= dataset_all.num_features
    num_class = dataset_all.num_classes
    print(input_size, num_class)

    # Call a model and define loss and optimizer
    model= Allergy_Net(input_size,hidden_size,num_class,dropout_rate).to(device)
    criterion= nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, patience=5, verbose=True)

    # Best combined score so far and a private copy of the matching weights.
    # Starting at -inf guarantees the first epoch is saved, so a stale
    # 'best_model.pth' left by a previous target can never be picked up.
    best_f1_score = float('-inf')
    best_model_state = None

    # Draw from whatever the dict currently holds instead of a hard-coded key range.
    techniques = list(sampling_techniques.values())

    # Training loop
    for epoch in range(num_epochs):
        selected_technique = random.choice(techniques)
        print(selected_technique)
        # Apply the sampling technique
        train_dataset = CustomDataset(train_data, transform=selected_technique)
        # BatchNorm1d cannot process a training batch of one sample: drop the
        # trailing batch only when it would contain exactly one row.
        dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                drop_last=(len(train_dataset) % batch_size == 1))

        # Update class weights; they must live on the same device as the logits.
        criterion.weight = calculate_class_weights(train_dataset).to(device)

        true_labels = []
        predicted_labels = []
        model.train()
        for data, labels in dataloader:
            data= data.to(device)
            labels= labels.to(device)

            #forward
            optimizer.zero_grad()
            outputs=model(data)
            loss= criterion(outputs,labels)

            #backward
            loss.backward()
            optimizer.step()

            # Calculate some metrics
            predictions = outputs.argmax(dim=1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

        acc = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='macro')

        # `loss` is the loss of the last batch of the epoch
        print (f'epoch {epoch+1}/{num_epochs}, loss = {loss:.5f}, train_acc = {acc:.4f}, F1 Score_Train = {f1:.4f}')

        # Test
        test_accuracy, f1_test = _evaluate_model(model, test_loader)

        lr_scheduler.step(f1_test + (f1*0.1))

        # Check if the current model is the best one based on f1 score
        if f1_test + (f1*0.5) > best_f1_score:
            best_f1_score = f1_test+(f1*0.5)
            # state_dict() returns references to the live tensors: clone them,
            # otherwise later epochs would overwrite the "best" snapshot.
            best_model_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            torch.save(best_model_state, 'best_model.pth')  # Save the best model
            print('\033[91m'+'MODEL_SAVE')
            print(f'Accuracy_test = {test_accuracy:.4f}, F1 Score_test = {f1_test:.4f}'+'\033[0m')

        print(' ')

    # Rebuild a clean model and restore the best weights
    model = Allergy_Net(input_size, hidden_size, num_class,dropout_rate).to(device)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Test on the held-out split
    test_accuracy, f1_test = _evaluate_model(model, test_loader)
    print(f'Accuracy = {test_accuracy:.3f}')
    print(f'F1 Score = {f1_test:.4f}')

    # Test on the whole dataset (metrics computed once, after all batches)
    data_accuracy, f1 = _evaluate_model(model, test_all)
    print(f'Accuracy = {data_accuracy:.3f}')
    print(f'F1 Score = {f1:.4f}')
    return model

In [74]:
# Train on the 'Type_of_Venom_Allergy_ATCD_Venom' target with a smaller batch size, lighter dropout and a stronger LR decay.
model=training_by_target('Type_of_Venom_Allergy_ATCD_Venom',batch_size=64,learning_rate = 1e-3,dropout_rate= 0.25, weight_decay=1e-2, factor=0.1)

0    2968
1      21
Name: Type_of_Venom_Allergy_ATCD_Venom, dtype: int64
467 2
BorderlineSMOTE()
epoch 1/60, loss = 0.22585, train_acc = 0.8542, F1 Score_Train = 0.8534
[91mMODEL_SAVE
Accuracy_test = 0.9332, F1 Score_test = 0.4827[0m
 
SMOTETomek(sampling_strategy='all')
epoch 2/60, loss = 0.55738, train_acc = 0.6375, F1 Score_Train = 0.6350
 
SMOTETomek()
epoch 3/60, loss = 0.50204, train_acc = 0.7315, F1 Score_Train = 0.7292
 
SMOTETomek(sampling_strategy='all')
epoch 4/60, loss = 0.44708, train_acc = 0.7729, F1 Score_Train = 0.7690
 
BorderlineSMOTE()
epoch 5/60, loss = 0.18578, train_acc = 0.9064, F1 Score_Train = 0.9056
[91mMODEL_SAVE
Accuracy_test = 0.9154, F1 Score_test = 0.4779[0m
 
RandomOverSampler()
epoch 6/60, loss = 0.53277, train_acc = 0.6852, F1 Score_Train = 0.6820
 
SMOTETomek()
epoch 7/60, loss = 0.39316, train_acc = 0.8120, F1 Score_Train = 0.8102
 
BorderlineSMOTE()
epoch 8/60, loss = 0.14250, train_acc = 0.9480, F1 Score_Train = 0.9479
[91mMODEL_SAVE
Accuracy_

In [77]:
# Keep the best checkpoint of this target under its own file name.
# input_size is read from the encoded feature matrix instead of being
# hard-coded. dropout_rate now matches the 0.25 used to train this target;
# dropout has no weights, so the checkpoint itself is unaffected.
# map_location lets a GPU checkpoint load on CPU.
model_to_save = Allergy_Net(input_size=encode_data.shape[1], hidden_size=2048, num_class=2,dropout_rate=0.25).to(device)
model_to_save.load_state_dict(torch.load('best_model.pth', map_location=device))
torch.save(model_to_save.state_dict(), 'Type_of_Venom_Allergy_ATCD_Venom_Pytorch_2.pth')

## Target: Type_of_Respiratory_Allergy_IGE_Molds_Yeast

In [103]:
# Full pool of resamplers for this target, keyed 0..9:
# four over-samplers, four under-samplers, then two combined methods.
sampling_techniques = dict(enumerate([
    oversampling.RandomOverSampler(),
    oversampling.SMOTE(),
    oversampling.BorderlineSMOTE(),
    oversampling.SVMSMOTE(),
    undersampling.RandomUnderSampler(),
    undersampling.TomekLinks(),
    undersampling.NearMiss(),
    undersampling.EditedNearestNeighbours(),
    combination.SMOTEENN(),
    combination.SMOTETomek(),
]))
class Allergy_Net(nn.Module):
    """Fully connected classifier with three hidden layers.

    The hidden widths are ``hidden_size``, ``int(hidden_size/8)`` and
    ``int(hidden_size/16)``. Every hidden layer is followed by ReLU and the
    same dropout module; batch normalisation is applied once, right before
    the output layer. ``forward`` returns raw logits of width ``num_class``.
    """

    def __init__(self, input_size, hidden_size, num_class, dropout_rate):
        super().__init__()
        eighth = int(hidden_size / 8)
        sixteenth = int(hidden_size / 16)

        # Attribute names become the state_dict keys of saved checkpoints,
        # and the creation order fixes the seeded weight initialisation.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, eighth)
        self.linear3 = nn.Linear(eighth, sixteenth)
        self.linear4 = nn.Linear(sixteenth, num_class)

        self.dropout1 = nn.Dropout(dropout_rate)
        self.batchnorm1 = nn.BatchNorm1d(sixteenth)

    def forward(self, inputs):
        """Map a batch of feature vectors to per-class logits."""
        x = inputs
        for hidden_layer in (self.linear1, self.linear2, self.linear3):
            x = self.dropout1(torch.relu(hidden_layer(x)))

        # No softmax here: the cross-entropy loss expects raw logits.
        return self.linear4(self.batchnorm1(x))
    
def _evaluate_loader(model, loader, average='macro'):
    """Run ``model`` over ``loader`` in eval mode and return ``(accuracy, f1)``.

    ``average`` is forwarded to ``sklearn.metrics.f1_score``.
    """
    true_labels = []
    predicted_labels = []
    model.eval()
    with torch.no_grad():
        for data, labels in loader:
            data = data.to(device)
            labels = labels.to(device)

            outputs = model(data)

            # index of the largest logit = predicted class
            _, predictions = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

    # Metrics are computed once, after all batches have been collected.
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average=average)
    return accuracy, f1


def training_by_target(column, batch_size=512, learning_rate=1e-3, dropout_rate=0.6, weight_decay=1e-2, factor=0.75):
    """Train an ``Allergy_Net`` for one target column and return the best model.

    At every epoch the training split is re-sampled with a randomly chosen
    entry of ``sampling_techniques``. After each epoch the model is scored on
    the held-out split; whenever ``f1_test + 0.05 * f1_train`` improves, the
    weights are written to ``'best_model.pth'``. At the end that checkpoint is
    reloaded and evaluated on the held-out split and on the whole dataset.

    Relies on notebook globals defined elsewhere: ``Targets``, ``encode_data``,
    ``CustomDataset``, ``sampling_techniques``, ``Allergy_Net`` and ``device``.

    Parameters
    ----------
    column : name of the column of ``Targets`` to predict.
    batch_size : batch size used by every DataLoader.
    learning_rate : initial Adam learning rate.
    dropout_rate : dropout probability passed to ``Allergy_Net``.
    weight_decay : Adam weight decay.
    factor : multiplicative factor of the ``ReduceLROnPlateau`` scheduler.

    Returns
    -------
    The ``Allergy_Net`` instance loaded with the best checkpoint, in eval mode.
    """
    print(Targets[column].value_counts())
    dataset_panda = pd.concat([encode_data, Targets[column]], axis=1)

    # Hold-out split plus loaders for the evaluation part
    train_data, test_data = train_test_split(dataset_panda, test_size=0.2, random_state=123)
    dataset_test = CustomDataset(test_data)
    dataset_all = CustomDataset(dataset_panda)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
    test_all = DataLoader(dataset_all, batch_size=batch_size, shuffle=False)

    # Hyperparameters
    input_size = dataset_all.num_features
    hidden_size = 2048
    num_class = dataset_all.num_classes
    num_epochs = 60
    print(input_size, num_class)

    # Model, loss and optimizer
    model = Allergy_Net(input_size, hidden_size, num_class, dropout_rate).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, patience=5, verbose=True)

    # Start below any reachable score so the first epoch always writes a
    # checkpoint; otherwise a stale 'best_model.pth' left by a previous target
    # could be loaded at the end of this run.
    best_f1_score = float('-inf')

    # Training loop
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        model.train()

        # Draw one resampler for this epoch (keys are 0 .. len-1)
        random_key = random.randint(0, len(sampling_techniques) - 1)
        selected_technique = sampling_techniques[random_key]
        print(selected_technique)

        # Apply the sampling technique
        train_dataset = CustomDataset(train_data, transform=selected_technique)
        # BatchNorm1d raises in training mode on a batch of a single sample,
        # so the trailing batch is dropped only when it would hold one row.
        dataloader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            drop_last=(len(train_dataset) % batch_size == 1),
        )

        true_labels = []
        predicted_labels = []
        for data, labels in dataloader:
            data = data.to(device)
            labels = labels.to(device)

            # forward
            outputs = model(data)
            loss = criterion(outputs, labels)

            # backward
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Collect predictions for the epoch-level training metrics
            _, predictions = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

        acc = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='macro')

        # NOTE: the loss shown is the one of the last mini-batch of the epoch.
        print (f'epoch {epoch+1}/{num_epochs}, loss = {loss.item():.5f}, train_acc = {acc:.4f}, F1 Score_Train = {f1:.4f}')

        # Evaluate on the held-out split
        test_accuracy, f1_test = _evaluate_loader(model, test_loader, average='macro')

        lr_scheduler.step(f1_test + (f1*0.1))

        # Keep the checkpoint with the best (mostly test-driven) F1 score
        if f1_test + (f1*0.05) > best_f1_score:
            best_f1_score = f1_test+(f1*0.05)
            torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
            print('\033[91m'+'MODEL_SAVE')
            print(f'Accuracy_test = {test_accuracy:.4f}, F1 Score_test = {f1_test:.4f}'+'\033[0m')

        print(' ')

    # Final evaluation: rebuild the network and load the best checkpoint.
    model = Allergy_Net(input_size, hidden_size, num_class, dropout_rate).to(device)
    # map_location keeps the load valid whatever device wrote the checkpoint.
    model.load_state_dict(torch.load('best_model.pth', map_location=device))

    # Held-out split.
    # NOTE(review): this report uses the 'weighted' F1 while model selection
    # and the whole-dataset report use 'macro' - confirm this is intended.
    test_accuracy, f1_test = _evaluate_loader(model, test_loader, average='weighted')
    print(f'Accuracy = {test_accuracy:.3f}')
    print(f'F1 Score = {f1_test:.4f}')

    # Whole dataset (includes the rows used for training)
    data_accuracy, f1 = _evaluate_loader(model, test_all, average='macro')
    print(f'Accuracy = {data_accuracy:.3f}')
    print(f'F1 Score = {f1:.4f}')
    return model

In [113]:
# Train the classifier for the molds/yeast IgE respiratory-allergy target.
model = training_by_target(
    'Type_of_Respiratory_Allergy_IGE_Molds_Yeast',
    batch_size=128,
    learning_rate=2e-3,
    dropout_rate=0.3,
    weight_decay=1e-2,
    factor=0.3,
)

2    1482
0    1166
1     341
Name: Type_of_Respiratory_Allergy_IGE_Molds_Yeast, dtype: int64
467 3
RandomUnderSampler()
epoch 1/60, loss = 1.03035, train_acc = 0.4242, F1 Score_Train = 0.4204
[91mMODEL_SAVE
Accuracy_test = 0.6622, F1 Score_test = 0.4976[0m
 
BorderlineSMOTE()
epoch 2/60, loss = 0.71245, train_acc = 0.5992, F1 Score_Train = 0.5745
[91mMODEL_SAVE
Accuracy_test = 0.6706, F1 Score_test = 0.5356[0m
 
BorderlineSMOTE()
epoch 3/60, loss = 0.70223, train_acc = 0.6481, F1 Score_Train = 0.6243
[91mMODEL_SAVE
Accuracy_test = 0.7860, F1 Score_test = 0.6298[0m
 
BorderlineSMOTE()
epoch 4/60, loss = 0.58468, train_acc = 0.7137, F1 Score_Train = 0.7068
[91mMODEL_SAVE
Accuracy_test = 0.7324, F1 Score_test = 0.6342[0m
 
SMOTE()
epoch 5/60, loss = 0.60639, train_acc = 0.7492, F1 Score_Train = 0.7467
 
EditedNearestNeighbours()
epoch 6/60, loss = 0.27077, train_acc = 0.8588, F1 Score_Train = 0.8115
[91mMODEL_SAVE
Accuracy_test = 0.8395, F1 Score_test = 0.7221[0m
 
SMOTE()
epoc

In [115]:
# Re-export the best checkpoint under a target-specific file name:
# 'best_model.pth' is overwritten by every call to training_by_target.
# dropout_rate only configures nn.Dropout, which holds no weights, so it does
# not have to match the value used for training for the state_dict to load.
model_to_save = Allergy_Net(input_size=467, hidden_size=2048, num_class=3,dropout_rate=0.6).to(device)
# map_location keeps the load working even if the checkpoint was written from
# a CUDA session and the current kernel only has a CPU (or vice versa).
model_to_save.load_state_dict(torch.load('best_model.pth', map_location=device))
torch.save(model_to_save.state_dict(), 'Type_of_Respiratory_Allergy_IGE_Molds_Yeast_Pytorch_2.pth')

## Target: Type_of_Food_Allergy_Tree_Nuts

In [118]:
# Train the classifier for the tree-nuts food-allergy target.
model = training_by_target(
    'Type_of_Food_Allergy_Tree_Nuts',
    batch_size=64,
    learning_rate=1e-3,
    dropout_rate=0.2,
    weight_decay=1e-3,
    factor=0.2,
)

0    1463
2    1382
1     144
Name: Type_of_Food_Allergy_Tree_Nuts, dtype: int64
467 3
RandomUnderSampler()
epoch 1/60, loss = 0.95284, train_acc = 0.5510, F1 Score_Train = 0.5383
[91mMODEL_SAVE
Accuracy_test = 0.5535, F1 Score_test = 0.3594[0m
 
TomekLinks()
epoch 2/60, loss = 0.68138, train_acc = 0.7216, F1 Score_Train = 0.5508
[91mMODEL_SAVE
Accuracy_test = 0.8344, F1 Score_test = 0.5642[0m
 
NearMiss()
epoch 3/60, loss = 0.70053, train_acc = 0.6364, F1 Score_Train = 0.5743
[91mMODEL_SAVE
Accuracy_test = 0.8161, F1 Score_test = 0.5723[0m
 
RandomUnderSampler()
epoch 4/60, loss = 1.07907, train_acc = 0.5758, F1 Score_Train = 0.5423
 
SMOTEENN()
epoch 5/60, loss = 0.18240, train_acc = 0.8090, F1 Score_Train = 0.7476
 
RandomUnderSampler()
epoch 6/60, loss = 0.74215, train_acc = 0.6970, F1 Score_Train = 0.6526
 
RandomOverSampler()
epoch 7/60, loss = 0.49269, train_acc = 0.7761, F1 Score_Train = 0.7671
[91mMODEL_SAVE
Accuracy_test = 0.7893, F1 Score_test = 0.6555[0m
 
TomekLink

In [None]:
# Re-export the best checkpoint under a target-specific file name:
# 'best_model.pth' is overwritten by every call to training_by_target.
# dropout_rate only configures nn.Dropout, which holds no weights, so it does
# not have to match the value used for training for the state_dict to load.
model_to_save = Allergy_Net(input_size=467, hidden_size=2048, num_class=3,dropout_rate=0.6).to(device)
# map_location keeps the load working even if the checkpoint was written from
# a CUDA session and the current kernel only has a CPU (or vice versa).
model_to_save.load_state_dict(torch.load('best_model.pth', map_location=device))
torch.save(model_to_save.state_dict(), 'Type_of_Food_Allergy_Tree_Nuts_Pytorch_2.pth')