In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pathlib import Path
import os
import sys
import seaborn as sns
import random
from collections import Counter
from functools import partial
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, BaseCrossValidator, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
import torch
import torch.nn as nn
import torch.nn.functional as F

import sys
sys.path.append('../tools')
import kentai
from models import AttentionMIL, AttentionMIL3class
import miltools
from miltools import split_data_by_pid, undersample_bags, compute_metrics, save_curve_data, plot_curves, save_label_counts, train_mil_model, evaluate_mil_model, evaluate_MIL

In [2]:
# Project root, read from the PROJECT_DIR environment variable
# (a KeyError is raised if the variable is not set).
projectdir = Path(os.environ['PROJECT_DIR'])

# Top-level data directory and its immediate sub-directories.
datadir = projectdir / 'data'
pulsedir = datadir / 'pulse'
featuredir = datadir / 'feature'
resultdir = datadir / 'result'

# Per-class pulse directories.
Infldir = pulsedir / 'Infl20230106'   # Influenza
Coviddir = pulsedir / 'SARS-CoV-2'    # Covid
NCdir = pulsedir / 'NC'               # NC

# Input tables.
featurepath = featuredir / 'features.csv'    # Features file
idlabelspath = featuredir / 'idlabels.csv'   # idlabels file

In [3]:
# Build the Kentai dataset object from the features and idlabels files.
#
# The instance is deliberately kept under the name `kentai` because later
# cells call methods on it, but that name also belongs to the imported
# module. Resolving the module through an alias (looked up in sys.modules,
# not in the notebook namespace) keeps this cell re-runnable: the original
# `kentai.Kentai(...)` only worked on the first execution, while `kentai`
# was still the module.
import kentai as kentai_module
kentai = kentai_module.Kentai(featurepath, idlabelspath)

In [4]:
# Display the per-label summary of the loaded data; the rendered table
# below has one row per label with `pid` and `pulse` count columns.
kentai.count_instance()

Unnamed: 0_level_0,pid,pulse
label,Unnamed: 1_level_1,Unnamed: 2_level_1
covid,40,45742
infl,59,55449
nc,106,70815


In [None]:
# --- Experiment: 'infl' vs. 'nc' ---------------------------------------------
# For each PID random state, the Kentai generator yields one train/test
# division. The instances of each division are chunked with split_data_by_pid,
# converted to float32 tensors and handed to evaluate_MIL, which receives a
# per-division output directory below `experiment_dir`.
experiment_base_name = 'Tsfresh_AttentionMIL_InflCovidNC'
experiment_name = 'InflNC'
experiment_dir = resultdir / experiment_base_name / experiment_name
os.makedirs(experiment_dir, exist_ok=True)

pid_random_states = range(0, 100)  # 100 random states for PID sampling
n_pid_split_dict = {'infl': [29, 30], 'nc': [32, 30]}  # Number of train/test samples per class
labels = list(n_pid_split_dict.keys())  # Class labels
num_class = len(n_pid_split_dict)  # Number of classes
le = LabelEncoder()  # Label encoder (fitted here; not used further in this cell)
le.fit(labels)
label_column = 'label'
label_map = {'infl': 1, 'nc': 0}  # Passed to evaluate_MIL as label_map

chunk_size = 15  # Third argument of split_data_by_pid
epochs = 20000   # Passed to evaluate_MIL as epochs

# One summary entry per division, so the outcome of every split remains
# available in the notebook after the loop instead of being overwritten by
# the next iteration. Reset on every run of this cell.
inflnc_split_summaries = []

for i, (df_X, df_y, splitted_pids, splitted_pidlabels) \
    in enumerate(kentai.generate_dfxypidspidlabels(
            n_pid_split_dict, random_states=pid_random_states, label_column=label_column)):

    pid_random_state = pid_random_states[i]  # Random state that produced this division

    outputdirname = 'ex-division' + str(pid_random_state).zfill(2)  # Name of the directory where results will be saved
    outputdir = experiment_dir / outputdirname  # Directory where results will be saved
    os.makedirs(outputdir, exist_ok=True)       # Create it if it does not exist

    # Every yielded element is a (train, test) pair.
    df_X_train, df_X_test = df_X  # Feature vectors for each instance
    df_y_train, df_y_test = df_y  # Labels for each instance
    pids_train, pids_test = splitted_pids  # PIDs used for train/test
    pidlabels_train, pidlabels_test = splitted_pidlabels  # Labels of PIDs used for train/test

    X_train_chunks, label_train_chunks, pid_train_chunks = \
        split_data_by_pid(df_X_train, df_y_train, chunk_size)
    X_train_chunks_torch = torch.tensor(X_train_chunks, dtype=torch.float32)  # Convert to tensor

    X_test_chunks, label_test_chunks, pid_test_chunks = \
        split_data_by_pid(df_X_test, df_y_test, chunk_size)
    X_test_chunks_torch = torch.tensor(X_test_chunks, dtype=torch.float32)  # Convert to tensor

    # NOTE(review): this path is computed but not passed to evaluate_MIL,
    # which only receives `output_dir` - confirm whether it is still needed.
    output_file_name = f'mil_results_chunk{chunk_size}.json'
    output_file = outputdir / output_file_name

    # Seed torch's global RNG per division so that torch-side randomness in
    # the call below is repeatable across notebook runs.
    # NOTE(review): redundant but harmless if evaluate_MIL seeds internally.
    torch.manual_seed(pid_random_state)

    test_results, best_metrics, best_epoch = \
        evaluate_MIL(
            X_train_chunks_torch, label_train_chunks, pid_train_chunks,
            X_test_chunks_torch, label_test_chunks, pid_test_chunks,
            feature_dim=X_train_chunks_torch.shape[-1],  # Size of the last tensor axis
            epochs=epochs,
            lr=0.001,
            output_dir=outputdir,
            label_map=label_map
        )

    inflnc_split_summaries.append({
        'pid_random_state': pid_random_state,
        'best_epoch': best_epoch,
        'best_metrics': best_metrics,
    })


In [None]:
# --- Experiment: 'covid' vs. 'nc' --------------------------------------------
# For each PID random state, the Kentai generator yields one train/test
# division. The instances of each division are chunked with split_data_by_pid,
# converted to float32 tensors and handed to evaluate_MIL, which receives a
# per-division output directory below `experiment_dir`.
experiment_base_name = 'Tsfresh_AttentionMIL_InflCovidNC'
experiment_name = 'CovidNC'
experiment_dir = resultdir / experiment_base_name / experiment_name
os.makedirs(experiment_dir, exist_ok=True)

pid_random_states = range(0, 100)  # 100 random states for PID sampling
n_pid_split_dict = {'covid': [20, 20], 'nc': [21, 20]}  # Number of train/test samples per class
labels = list(n_pid_split_dict.keys())  # Class labels
num_class = len(n_pid_split_dict)  # Number of classes
le = LabelEncoder()  # Label encoder (fitted here; not used further in this cell)
le.fit(labels)
label_column = 'label'
label_map = {'covid': 1, 'nc': 0}  # Passed to evaluate_MIL as label_map

chunk_size = 15  # Third argument of split_data_by_pid
epochs = 20000   # Passed to evaluate_MIL as epochs

# One summary entry per division, so the outcome of every split remains
# available in the notebook after the loop instead of being overwritten by
# the next iteration. Reset on every run of this cell.
covidnc_split_summaries = []

for i, (df_X, df_y, splitted_pids, splitted_pidlabels) \
    in enumerate(kentai.generate_dfxypidspidlabels(
            n_pid_split_dict, random_states=pid_random_states, label_column=label_column)):

    pid_random_state = pid_random_states[i]  # Random state that produced this division

    outputdirname = 'ex-division' + str(pid_random_state).zfill(2)  # Name of the directory where results will be saved
    outputdir = experiment_dir / outputdirname  # Directory where results will be saved
    os.makedirs(outputdir, exist_ok=True)       # Create it if it does not exist

    # Every yielded element is a (train, test) pair.
    df_X_train, df_X_test = df_X  # Feature vectors for each instance
    df_y_train, df_y_test = df_y  # Labels for each instance
    pids_train, pids_test = splitted_pids  # PIDs used for train/test
    pidlabels_train, pidlabels_test = splitted_pidlabels  # Labels of PIDs used for train/test

    X_train_chunks, label_train_chunks, pid_train_chunks = \
        split_data_by_pid(df_X_train, df_y_train, chunk_size)
    X_train_chunks_torch = torch.tensor(X_train_chunks, dtype=torch.float32)  # Convert to tensor

    X_test_chunks, label_test_chunks, pid_test_chunks = \
        split_data_by_pid(df_X_test, df_y_test, chunk_size)
    X_test_chunks_torch = torch.tensor(X_test_chunks, dtype=torch.float32)  # Convert to tensor

    # NOTE(review): this path is computed but not passed to evaluate_MIL,
    # which only receives `output_dir` - confirm whether it is still needed.
    output_file_name = f'mil_results_chunk{chunk_size}.json'
    output_file = outputdir / output_file_name

    # Seed torch's global RNG per division so that torch-side randomness in
    # the call below is repeatable across notebook runs.
    # NOTE(review): redundant but harmless if evaluate_MIL seeds internally.
    torch.manual_seed(pid_random_state)

    test_results, best_metrics, best_epoch = \
        evaluate_MIL(
            X_train_chunks_torch, label_train_chunks, pid_train_chunks,
            X_test_chunks_torch, label_test_chunks, pid_test_chunks,
            feature_dim=X_train_chunks_torch.shape[-1],  # Size of the last tensor axis
            epochs=epochs,
            lr=0.001,
            output_dir=outputdir,
            label_map=label_map
        )

    covidnc_split_summaries.append({
        'pid_random_state': pid_random_state,
        'best_epoch': best_epoch,
        'best_metrics': best_metrics,
    })
