## 시험 스크립트 1-2 : 스크립트2에서 저장된 데이터셋 10개를 가지고 각각에 대해 10번의 모델 훈련을 수행하며 Precision, Recall, F1-score 과 그 평균 값들을 구해 averages_1481_i.csv 에 저장 (총 100번의 훈련과 각 지표마다 100개의 기록)

# Multi-Class Classification with Transformer Encoder

## 1. Loading helper modules & functions

In [1]:
%load_ext autoreload
%autoreload 2
import torch as T
import torch.nn as nn
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import torchinfo
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
# os.environ["CUDA_VISIBLE_DEVICES"]= "0,1"  # Set the GPU 2 to use
device = T.device("cuda" if T.cuda.is_available() else "cpu")
# device2 = T.device("cuda:1" if T.cuda.is_available() else "cpu")
# device = T.device("cpu")
# from encoding.parallel import DataParallelModel, DataParallelCriterion
from torchmetrics import MetricCollection
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall, MulticlassF1Score, MulticlassConfusionMatrix
metric_collection = MetricCollection([
    # MulticlassAccuracy(num_classes=4, average="none").to(device),
    MulticlassPrecision(num_classes=4, average="none").to(device),
    MulticlassRecall(num_classes=4, average="none").to(device),
    MulticlassF1Score(num_classes=4, average="none").to(device),
])
metric_collection_weighted = MetricCollection([
    # MulticlassAccuracy(num_classes=4, average="weighted").to(device),
    MulticlassPrecision(num_classes=4, average="weighted").to(device),
    MulticlassRecall(num_classes=4, average="weighted").to(device),
    MulticlassF1Score(num_classes=4, average="weighted").to(device),
])
# import sklearn.datasets
from sklearn.model_selection import train_test_split
# import umap
# import umap.plot
# import shap
# import scipy as sp
# import transformers
print("Pytorch version:", T.__version__, "CUDA version:", T.version.cuda, "cuDNN version:", T.backends.cudnn.version())
print("device:", device, T.cuda.get_device_name(0), "*", T.cuda.device_count())

Pytorch version: 1.12.1 CUDA version: 11.3 cuDNN version: 8302
device: cuda Tesla V100-SXM2-32GB * 8


## 2. Implementing backbone of Transformer-based Classifier

In [2]:
# feature 개수 확인 및 설정
hf = h5py.File('STAD_Dataset3_minus_20_exam_0.h5', 'r')
X_DATA = np.array(hf.get('X'))
Y_DATA = np.array(hf.get('Y'))
print('X_DATA.shape :', X_DATA.shape, ', X_DATA.dtype :', X_DATA.dtype)
print('Y_DATA.shape :', Y_DATA.shape, '     , Y_DATA.dtype :', Y_DATA.dtype)
hf.close()

n_feature = X_DATA.shape[1]
print("\nThe number of feature :", n_feature)

X_DATA.shape : (407, 1481) , X_DATA.dtype : float64
Y_DATA.shape : (407,)      , Y_DATA.dtype : int8

The number of feature : 1481


In [3]:
class STAD_Dataset(T.utils.data.Dataset):
    def __init__(self, src_file):
        hf = h5py.File(src_file, 'r')
        if 'X_Train' in hf.keys() :
            # x = np.array(hf.get('X_Train')).astype('int64')
            x = np.array(hf.get('X_Train'))
            y = np.array(hf.get('Y_Train')).astype('int8')
        else :
            # x = np.array(hf.get('X_Test')).astype('int64')
            x = np.array(hf.get('X_Test'))
            y = np.array(hf.get('Y_Test')).astype('int8')
        hf.close()

        # self.x_data = T.tensor(x, dtype=T.int64).to(device)
        self.x_data = T.tensor(x, dtype=T.float32).to(device)
        self.y_data = T.tensor(y, dtype=T.int64).to(device)  

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        x = self.x_data[idx]
        y = self.y_data[idx] 
        return x, y  

In [4]:
class PositionalEncoding(T.nn.Module):  
    def __init__(self, d_model: int, dropout: float=0.1, max_len: int=n_feature):
        super(PositionalEncoding, self).__init__()  
        self.dropout = T.nn.Dropout(p=dropout)
        pe = T.zeros(max_len, d_model)  
        position = T.arange(0, max_len, dtype=T.float32).unsqueeze(1)
        div_term = T.exp(T.arange(0, d_model, 2).float() * (-np.log(10_000.0) / d_model))
        pe[:, 0::2] = T.sin(position * div_term)
        pe[:, 1::2] = T.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)  # 이게 없으면 나중에 forward 에서 pe값을 못 찾아 에러남

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [5]:
# E = number of cases that all features can express in integer (feature=token) 
# D = #Embedding Dimension ; I = #input features ; F = #fc1 layer hidden nodes ; drop = dropout rate
E = 320 ; D = 8 ; I = n_feature ; F = 8 ; drop = 0.1
class Transformer_Classifier(T.nn.Module):  # 1501 -> 12008 -> Transformer -> FC -> 4
    def __init__(self):
        super(Transformer_Classifier, self).__init__() 
        # numeric pseudo-embedding, dim=D
        # self.embed = Special_Embedding_Layer(I, I*D)  # I input features, each goes to D 
        self.embed = T.nn.Embedding(E, D)  # I input features, each goes to D
        self.pos_enc = PositionalEncoding(D, dropout=drop)  # positional embedding
        self.enc_layer = T.nn.TransformerEncoderLayer(d_model=D, nhead=4, dim_feedforward=D, 
                         dropout=drop, batch_first=True)  # d_model should be divisible by nhead
        self.trans_enc = T.nn.TransformerEncoder(self.enc_layer, num_layers=3)  # default 6 layers 
        # STAD Dataset has I input features per each sample
        self.dropout = nn.Dropout(drop)
        # self.fc1 = T.nn.Linear(D*I, F)  # F hidden nodes
        # self.fc2 = T.nn.Linear(F, 4)    # 4 classes
        self.fc3 = T.nn.Linear(D*I, 4)    # 4 classes (GX, G1, G2, G3)

    def forward(self, x):
        # x = I input features, fixed length
        z = self.embed(x.int())  # I inputs to I*D embeds
        z = z.reshape(-1, I, D) 
        z = self.pos_enc(z) 
        z = self.trans_enc(z) 
        z = z.reshape(-1, D*I)  # torch.Size([batch_size, D*I])
        # z = self.dropout(z)
        # z = self.fc1(z)
        # z = T.tanh(z)
        z = self.dropout(z)
        z = self.fc3(z)
        z = T.log_softmax(z, dim=1)  # later followed by NLL Loss()
        return z

## 3. Training model

In [6]:
for j in range(10):
    # feature 개수 확인 및 설정
    hf = h5py.File(f'STAD_Dataset3_minus_20_exam_{j}.h5', 'r')
    X_DATA = np.array(hf.get('X'))
    Y_DATA = np.array(hf.get('Y'))
    hf.close()

    print(f'STAD_Dataset3_minus_20_exam_{j}.h5 dataset loaded\n')

    max_F1 = 0
    precision_list_10 = []
    recall_list_10 = []
    f1score_list_10 = []

    for i in range(10):

        print("############################## Trining", j+1, "-", i+1, "##############################\n")

        ########## 3-0. Data Random Splitting ##########

        # 1. Train : Test 를 레이블 비율을 균등하게 유지하면서 8 : 2로 랜덤하게 나눔
        X_Train, X_Test, Y_Train, Y_Test = train_test_split(X_DATA, Y_DATA, test_size=0.2, stratify=Y_DATA, random_state=(2*i+1))

        # 2. Train 데이터 파일로 저장
        hf = h5py.File('STAD_Dataset3_Train_minus_20_exam.h5', 'w')
        hf.create_dataset('X_Train', data=X_Train)
        hf.create_dataset('Y_Train', data=Y_Train)
        hf.close()
        # 3. Test 데이터 파일로 저장 
        hf = h5py.File('STAD_Dataset3_Test_minus_20_exam.h5', 'w')
        hf.create_dataset('X_Test', data=X_Test)
        hf.create_dataset('Y_Test', data=Y_Test)
        hf.close()

        ########## 3-1. Lodaing Dataset ##########

        # 0. setup
        np.random.seed(1)  
        T.manual_seed(1)  

        # 1. create Dataset
        train_file = "./STAD_Dataset3_Train_minus_20_exam.h5"
        train_ds = STAD_Dataset(train_file)

        test_file = "./STAD_Dataset3_Test_minus_20_exam.h5"
        test_ds = STAD_Dataset(test_file)

        bat_size = 32
        train_ldr = T.utils.data.DataLoader(train_ds, batch_size=bat_size, shuffle=True)
        # test_ldr = T.utils.data.DataLoader(test_ds, batch_size=2, shuffle=False)

        ########## 3-2. Training model ##########

        # net = Transformer_Classifier().to(device)
        net = nn.DataParallel(Transformer_Classifier()).to(device)

        max_epochs = 20
        ep_log_interval = 1
        lrn_rate = 0.0001

        loss_func = T.nn.NLLLoss()  # assumes log-softmax()
        # loss_func = DataParallelCriterion(T.nn.NLLLoss())  # assumes log-softmax()
        optimizer = T.optim.Adam(net.parameters(), lr=lrn_rate, weight_decay=0.1)
        scheduler = T.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lambda epoch: 0.99 ** epoch, last_epoch=-1, verbose=False)

        net.train()  # set mode

        loss_list = []
        accuracy_list = []
        test_accuracy_list = []
        for epoch in range(0, max_epochs):
            ep_loss = 0.0  # for one full epoch
            for (X, y) in train_ldr: # X = genes, y = labels
                optimizer.zero_grad()
                # with T.autocast(device_type="cuda"): # for auto-mixed precision
                oupt = net(X)
                loss_val = loss_func(oupt, y)  # a tensor
                
                ep_loss += loss_val.item()  # accumulate
                loss_val.backward()  # compute grads
                optimizer.step()     # update weights
            if epoch % ep_log_interval == 0:
                print("epoch = %4d  |  loss = %8.4f  |  lr = %8.8f" % (epoch+1, T.cuda.device_count()*ep_loss/bat_size, optimizer.param_groups[0]['lr']))
                # net.eval()
            scheduler.step()

        print("Done ") 

        net.eval()
        with T.no_grad():
            output_test = net(test_ds.x_data)
        target_test = test_ds.y_data
        preds_test  = output_test.argmax(dim=1, keepdim=True).squeeze(1)
        print(metric_collection_weighted(preds_test, target_test),'\n')

        # 시험2를 위한
        precision_list_10.append(MulticlassPrecision(num_classes=4, average="weighted").to(device)(preds_test, target_test))    
        recall_list_10.append(MulticlassRecall(num_classes=4, average="weighted").to(device)(preds_test, target_test))    
        f1score_list_10.append(MulticlassF1Score(num_classes=4, average="weighted").to(device)(preds_test, target_test))    

    ### 10번 훈련에서의 각 precision 값들과 그 평균
    precision_sum = 0
    for p in precision_list_10:
        precision_sum += p
    ### 10번 훈련에서의 각 recall 값들과 그 평균
    recall_sum = 0
    for p in recall_list_10:
        recall_sum += p
    ### 10번 훈련에서의 각 F1-Score 값들과 그 평균
    f1score_sum = 0
    for p in f1score_list_10:
        f1score_sum += p

    ### 각종 값들과 평균 csv 파일로 저장
    precision_list = [p.detach().cpu().numpy() for p in precision_list_10]
    precision_list.append((precision_sum/10).detach().cpu().numpy())
    recall_list = [p.detach().cpu().numpy() for p in recall_list_10]
    recall_list.append((recall_sum/10).detach().cpu().numpy())
    f1score_list = [p.detach().cpu().numpy() for p in f1score_list_10]
    f1score_list.append((f1score_sum/10).detach().cpu().numpy())

    averages_1501_df = pd.DataFrame({'Train':np.arange(1,len(precision_list)+1),
                                    'Precision':precision_list, 
                                    'Recall':recall_list, 
                                    'F1-Score':f1score_list})  
    averages_1501_df.iloc[(-1,0)] = '평균'

    averages_1501_df.to_csv(f'./averages_1481_{j}.csv')
    print(f'averages_1481_{j}.csv saved\n')

STAD_Dataset3_minus_20_exam_0.h5 dataset loaded

############################## Trining 1 - 1 ##############################



epoch =    1  |  loss =   2.6846  |  lr = 0.00010000
epoch =    2  |  loss =   2.6103  |  lr = 0.00009900
epoch =    3  |  loss =   2.4313  |  lr = 0.00009801
epoch =    4  |  loss =   2.0815  |  lr = 0.00009703
epoch =    5  |  loss =   2.0042  |  lr = 0.00009606
epoch =    6  |  loss =   2.0033  |  lr = 0.00009510
epoch =    7  |  loss =   1.7648  |  lr = 0.00009415
epoch =    8  |  loss =   1.7650  |  lr = 0.00009321
epoch =    9  |  loss =   1.4653  |  lr = 0.00009227
epoch =   10  |  loss =   1.4039  |  lr = 0.00009135
epoch =   11  |  loss =   1.2165  |  lr = 0.00009044
epoch =   12  |  loss =   1.2438  |  lr = 0.00008953
epoch =   13  |  loss =   1.1551  |  lr = 0.00008864
epoch =   14  |  loss =   1.0802  |  lr = 0.00008775
epoch =   15  |  loss =   1.0006  |  lr = 0.00008687
epoch =   16  |  loss =   0.9031  |  lr = 0.00008601
epoch =   17  |  loss =   0.9487  |  lr = 0.00008515
epoch =   18  |  loss =   0.8227  |  lr = 0.00008429
epoch =   19  |  loss =   0.8044  |  lr = 0.00