# Set up environment

In [None]:
# Uncomment and run the section below after (re)connecting to a Colab runtime

# !pip install git+https://github.com/haven-jeon/PyKoSpacing.git
# !pip install transformers
# !pip install git+https://github.com/ssut/py-hanspell.git

# from google.colab import drive
# drive.mount('/content/drive')

# %cd drive/MyDrive/MBTI
# !pwd

In [2]:
from dataloader import MBTIDataset

import pandas as pd
pd.set_option('display.width', 180)
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from sklearn.model_selection import KFold
from transformers import DataCollatorWithPadding, BertForSequenceClassification, BertConfig

# Prepare dataset

In [15]:
# Setup
# Single configuration mapping read by the dataset, dataloader and training cells.
env_dict = dict(
    # ---- dataset arguments ----
    train_path='./data/example_train.csv',
    question_path='./data/question_filtered.csv',
    target='E',
    pretrained_url="klue/bert-base",
    padding_per_batch=True,
    # ---- dataloader arguments ----
    # kept off because the DataLoaders in the CV cell are driven by a sampler
    shuffle=False,
    # ---- training arguments ----
    batch_size=64,
    epoch=100,
    lr=1e-4,
)

In [None]:
# Dataset
# Build the training dataset; every argument is taken from env_dict so the
# configuration cell stays the single place to change paths or the target.
train_dataset = MBTIDataset(
    is_train=True,
    data_path=env_dict['train_path'],
    question_path=env_dict['question_path'],
    target_mbti=env_dict['target'],
    pretrained_url=env_dict['pretrained_url'],
    padding_per_batch=env_dict['padding_per_batch'],
)

# Quick sanity check: number of samples, then the first rows of the backing frame.
print(len(train_dataset))
print(train_dataset.data.head())

In [None]:
# define collator function when padding per batch is needed
# TODO: compare performance against torch's sequence packing instead of the data collator
if env_dict['padding_per_batch']:
    # pad each batch using the dataset's own tokenizer
    data_collator = DataCollatorWithPadding(tokenizer=train_dataset.tokenizer)
else:
    # None is passed through to DataLoader as collate_fn
    data_collator = None

# Dataloader
train_dataloder = DataLoader(
    train_dataset,
    collate_fn=data_collator,
    shuffle=env_dict['shuffle'],
    batch_size=env_dict['batch_size'],
)

# Prepare model

In [3]:
# GPU preparation
# Select CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was chosen.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Model
# Load the pretrained checkpoint named in env_dict with a 2-label
# sequence-classification head.
model = BertForSequenceClassification.from_pretrained(env_dict['pretrained_url'], num_labels=2)
# Move the model to the `device` chosen in the GPU-preparation cell. The previous
# unconditional `model.cuda()` raised on machines without CUDA even though that
# cell provides a CPU fallback. On a GPU machine the result is the same.
model.to(device)

# Train & CV

In [None]:
# Training with cross validation (ref : https://velog.io/@pppanghyun/6.-%EA%B5%90%EC%B0%A8-%EA%B2%80%EC%A6%9DCross-Validation)

# Seed the shuffled split so every run produces the same folds and per-fold
# scores stay comparable between runs. Uses env_dict['seed'] when present,
# otherwise falls back to 42.
kfold     = KFold(n_splits=5, shuffle=True, random_state=env_dict.get('seed', 42))
# NOTE(review): MSELoss is a regression loss carried over from the referenced
# example, while the model in this notebook is BertForSequenceClassification with
# num_labels=2. `criterion` is not used by any live code yet -- confirm the
# intended loss before enabling the training loop below.
criterion = torch.nn.MSELoss()

# Collects one validation score per fold (filled by the commented-out loop below).
validation_loss = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(train_dataset)):

    # Make indices for both training and validation
    train_subsampler  = SubsetRandomSampler(train_idx)
    val_subsampler    = SubsetRandomSampler(val_idx)

    # Define dataloader using sampler
    # (env_dict['shuffle'] must stay False: DataLoader rejects shuffle=True
    #  together with an explicit sampler)
    train_dataloder = DataLoader(
        train_dataset,
        batch_size  = env_dict['batch_size'],
        shuffle     = env_dict['shuffle'],
        sampler     = train_subsampler,
        collate_fn  = data_collator
    )
    val_dataloder   = DataLoader(
        train_dataset,
        batch_size  = env_dict['batch_size'],
        shuffle     = env_dict['shuffle'],
        sampler     = val_subsampler,
        collate_fn  = data_collator
    )

    # ===================
    #    Training Loop
    # ===================
    # The block below is the reference implementation from the linked post,
    # kept as a template; it has not been adapted to this notebook yet.

#     model = Regressor()
#     optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

#     for epoch in range(400): # train for 400 epochs

#         for data in trainloader: # batches of 32 randomly shuffled samples arrive one at a time

#             inputs, values = data # data holds X and Y

#             optimizer.zero_grad() # reset the optimizer's gradients

#             outputs = model(inputs) # feed the inputs to the model and compute predictions
#             loss = criterion(outputs, values) # compute the loss
#             loss.backward() # backpropagate from the loss
#             optimizer.step() # apply the gradients and update the weights

#     train_rmse = evaluation(trainloader) # RMSE on the training data
#     val_rmse = evaluation(valloader)
#     print("k-fold", fold," Train Loss: %.4f, Validation Loss: %.4f" %(train_rmse, val_rmse)) 
#     validation_loss.append(val_rmse)

## Calculate validation score

# validation_loss = np.array(validation_loss)
# mean = np.mean(validation_loss)
# std = np.std(validation_loss)
# print("Validation Score: %.4f, ± %.4f" %(mean, std))

# Test