# Set up environment

In [None]:
# Uncomment below section and run in case of re-connecting Colab
# (Colab-only bootstrap: installs the text-preprocessing/model packages, mounts Google Drive
#  and moves into the project folder so the relative './data/...' paths used later resolve.)

!pip install git+https://github.com/haven-jeon/PyKoSpacing.git
!pip install transformers
!pip install git+https://github.com/ssut/py-hanspell.git

from google.colab import drive
drive.mount('/content/drive')

# Yeongdo: you will probably need to change this to match your own Drive path.
# The path is relative, so it only resolves from the directory that contains the 'drive' mount point.
%cd drive/MyDrive/MBTI
!pwd

In [2]:
import time
import datetime
import random
from tqdm import tqdm
import pickle

from dataloader import MBTIDataset

import pandas as pd
pd.set_option('display.width', 180)
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from sklearn.model_selection import KFold
from transformers import DataCollatorWithPadding, BertForSequenceClassification, BertConfig, AdamW

# Environment

In [42]:
# Setup: central dictionary of paths and hyperparameters read by the cells below
env_dict = {
    # ==== Arguments for dataset =====
    'train_path'        : './data/hackathon_train_v1.csv',
    'question_path'     : './data/question_filtered.csv',
    'test_path'         : './data/hackathon_test_for_user.csv',   # trailing comma was missing here (SyntaxError)
    'pretrained_url'    : "klue/bert-base",
    'padding_per_batch' : True,
    # ==== Arguments for dataloader =====
    'shuffle'           : False,            # turn off 'shuffle' since we use sampler in Dataloader
    # ==== Arguments for training =====
    'target'            : 'I/E',
    'lm'                : 'bert',
    'classifier'        : 'mlp',
    'batch_size'        : 16,
    'epoch'             : 5,
    # NOTE(review): 1e-2 is far above the learning rates usually used to fine-tune BERT — confirm this is intended
    'lr'                : 1e-2,
    'decay_rate'        : 1e-7,
    'dropout'           : 0.1,
    'hidden_dim'        : [192, 48, 12]     # provisional sizes (768, the [CLS] token dimension, divided by 4 repeatedly)
}

# Random seed: seed every RNG in use (python, numpy, torch CPU and all CUDA devices)
seed_val = 1234
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [43]:
# Free memory between runs: collect unreachable Python objects first,
# then release the CUDA blocks that PyTorch's caching allocator no longer needs.
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

# Prepare dataset

In [None]:
# Uncomment & run this cell only if there is no preprocessed data

# Dataset
# train_dataset = MBTIDataset(
#     data_path           = env_dict['train_path'],
#     question_path       = env_dict['question_path'],
#     pretrained_url      = env_dict['pretrained_url'],
#     padding_per_batch   = env_dict['padding_per_batch'],
#     txt_preprocess      = True,
#     is_train            = True
# )

# print(len(train_dataset))
# print(train_dataset.data.head())



100%|██████████| 11520/11520 [10:07<00:00, 18.97it/s]




100%|██████████| 11520/11520 [00:00<00:00, 31081.21it/s]




4it [00:00, 65.24it/s]




100%|██████████| 11520/11520 [00:06<00:00, 1737.02it/s]

11520
   Data_ID  User_ID  Gender       Age  MBTI  Q_number                                             Answer  I/E  S/N  T/F  J/P                                        QandA
0        1        1       1 -0.372581  INFP         1  <아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
1        2        1       1 -0.372581  INFP         2  <중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하며...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
2        3        1       1 -0.372581  INFP         3  <그렇다> 감정이입이 잘 되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
3        4        1       1 -0.372581  INFP         4  <중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다일의 변수가 생길 수 있고 변...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
4        5        1       1 -0.372581  INFP         5  <아니다> 평정심을 유지 못하는 편입니다 머릿속은 백지화가 된 상태로 말도 제대로 ...    0    1    1    1  [input_ids, token_




In [5]:
# Uncomment & run this cell only if there is no preprocessed data

# # save preprocessed pd.Dataframe & tokenizer as pickle data format

# data_path stays active (not commented out) because the loading cell below reads the pickle from it.
data_path = './data/' + 'base_data_2.pickle'
# with open(data_path, 'wb') as handle:
#     pickle.dump(train_dataset.data, handle)

# with open('./data/base_tokenizer.pickle', 'wb') as handle:
#     pickle.dump(train_dataset.tokenizer, handle)

In [6]:
# Restore the cached DataFrame and tokenizer from disk, then wrap the frame in a dataset
from dataloader import MBTIDataset

# NOTE: pickle.load can execute arbitrary code; only open pickle files produced by this project.
with open(data_path, 'rb') as handle:
    df = pickle.load(handle)

with open('./data/base_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# The already-loaded DataFrame is handed over through the `data_path` argument.
train_dataset = MBTIDataset(
    is_train            = True,
    padding_per_batch   = env_dict['padding_per_batch'],
    pretrained_url      = env_dict['pretrained_url'],
    question_path       = env_dict['question_path'],
    data_path           = df
)

# Sanity check: number of samples followed by the first rows
print(len(train_dataset), train_dataset.data.head(), sep='\n')

11520
   Data_ID  User_ID  Gender       Age  MBTI  Q_number                                             Answer  I/E  S/N  T/F  J/P                                              QandA
0        1        1       1 -0.372581  INFP         1  <아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...    0    1    1    1  [input_ids, token_type_ids, attention_mask, ca...
1        2        1       1 -0.372581  INFP         2  <중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하며...    0    1    1    1  [input_ids, token_type_ids, attention_mask, ca...
2        3        1       1 -0.372581  INFP         3  <그렇다> 감정이입이 잘 되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...    0    1    1    1  [input_ids, token_type_ids, attention_mask, ca...
3        4        1       1 -0.372581  INFP         4  <중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다일의 변수가 생길 수 있고 변...    0    1    1    1  [input_ids, token_type_ids, attention_mask, ca...
4        5        1       1 -0.372581  INFP         5  <아니다> 평정심을 유지 못하는 편입니다 머릿속은 백지화가 된 상태로 말도 제대로 ...    0    1

In [44]:
# Collate function: pad dynamically per batch when requested, otherwise let DataLoader use its default.
# TODO: compare performance against using torch's sequence packing instead of data_collator
if env_dict['padding_per_batch']:
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
else:
    data_collator = None

# Dataloader over the full training set
train_dataloader = DataLoader(
    dataset     = train_dataset,
    collate_fn  = data_collator,
    shuffle     = env_dict['shuffle'],
    batch_size  = env_dict['batch_size']
)

In [8]:
# Example result from dataloader
# Pulls only the first batch from a fresh iterator and pretty-prints it to inspect the batch keys and tensors.
import pprint
pprint.pprint(next(iter(train_dataloader)))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'I/E': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'J/P': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'S/N': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'T/F': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]),
 'cat_input': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([[    2,  7267, 11187,  ...,     0,     0,     0],
        [    2,  3936,  3641,  ...,     0,     0,     0],
        [    2,  3656,  3611,  ...,     0,     0,     0],
        ...,
        [    2,  4051,  4362,  ...,     0,     0,     0],
        [    2,  3656,  3611,  ...,     0,     0,     0],
        [    2,  3971,  3746,  ...,     0,     0,     0]]),
 'num_input': tensor([-0.3726, -0.3726, -0.3726, 

# Prepare language model

In [9]:
# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [10]:
# Debugging cell
import torch.nn.functional as F

class MLPPClassifier(nn.Module):
    """MBTI binary classifier: a plain multi-layer perceptron head.

    Linear layers are stacked with sizes ``[input_dim] + hidden_dim + [num_classes]``.
    Every layer except the last is followed by ReLU and dropout, so the
    returned tensor holds raw, un-normalised scores.

    Args:
        input_dim: size of the last dimension of the input features.
        n_layers: stored on the instance only; the depth of the network is
            determined by ``len(hidden_dim)``.
        hidden_dim: list (or tuple) with the width of each hidden layer, in order.
        num_classes: width of the output layer.
        dropout: dropout probability applied after each hidden activation.

    Raises:
        AssertionError: if ``hidden_dim`` is neither a list nor a tuple.
    """
    def __init__(self, input_dim, n_layers, hidden_dim, num_classes = 2, dropout = 0.5):
        super(MLPPClassifier, self).__init__()

        # isinstance also admits tuples and list subclasses, which the previous exact type check rejected
        assert isinstance(hidden_dim, (list, tuple)), "hidden_dim should be list type"

        self.n_layers = n_layers            # number of layers (informational, not used below)
        self.hidden_dim = list(hidden_dim)  # hidden dimensions, copied so list concatenation below always works
        self.input_dim = input_dim          # input dimension
        self.num_classes = num_classes      # number of classes
        self.dropout = nn.Dropout(p=dropout, inplace=False)

        # dimension list for all layers
        self.dimensions = [self.input_dim] + self.hidden_dim + [self.num_classes]

        # layer stacks: one Linear per consecutive pair of dimensions
        self.layers = nn.ModuleList(
            [nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(self.dimensions[:-1], self.dimensions[1:])])

    def forward(self, x):
        """Run the features through the MLP and return the output-layer scores."""
        # Cast once up front: the input may be an integer or double tensor
        x = x.to(torch.float32)

        last_index = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)

            # Activation + dropout on every layer except the output layer
            if i != last_index:
                x = self.dropout(F.relu(x))

        return x

In [11]:
# from mlp import MLPClassifier

class BertWithMlp(BertForSequenceClassification):
    """BERT encoder whose sentence feature is concatenated with tabular features and fed to an MLP head.

    The config passed in must carry three custom attributes, ``text_feat_dim``,
    ``cat_feat_dim`` and ``num_feat_dim``; their sum is the MLP input width.

    Args:
        config: BERT config extended with the three feature-size attributes above.
        input_dim: accepted for signature compatibility; not used.
        hidden_dim: list of hidden-layer widths forwarded to ``MLPPClassifier``.
        num_classes: number of output scores produced by the MLP.
        dropout: dropout probability for both the sentence feature and the MLP.
    """
    def __init__(
        self,
        config,
        input_dim = None,
        hidden_dim = None,
        num_classes = 2,
        dropout = 0.1
        ):

        # ====================
        #      BERT Setup
        # ====================

        # resulting BERT model is stored in 'self.bert'.
        super().__init__(config)

        self.num_labels = config.num_labels

        combined_feat_dim = config.text_feat_dim + config.cat_feat_dim + config.num_feat_dim
        print("combined_feat_dim :", combined_feat_dim)

        self.mlp = MLPPClassifier(
            combined_feat_dim,
            None,
            hidden_dim,
            num_classes=num_classes,
            dropout=dropout
        )
        print("mlp :", self.mlp)
        self.dropout = nn.Dropout(p=dropout, inplace=False)
        # Registered but currently not applied in forward() (see the note there)
        self.bn = nn.BatchNorm1d(config.num_feat_dim)

    def forward(
        self,
        input_ids = None,
        attention_mask = None,
        token_type_ids = None,
        position_ids = None,
        head_mask = None,
        inputs_embeds = None,
        labels = None,
        output_attentions = None,
        cat_feats = None,
        num_feats = None
    ):
        """Encode the text, append the tabular features and return the MLP scores.

        ``labels`` and ``output_attentions`` are accepted for signature
        compatibility but are not used: the loss is computed by the caller and
        only the MLP output is returned.

        Args:
            cat_feats: categorical feature(s), one row per sample.
            num_feats: numerical feature(s), one row per sample.

        Returns:
            Tensor of shape (batch, num_classes) with the MLP output scores.
        """
        # ====================
        #     BERT forward
        # ====================
        # Forward every encoder argument this method accepts; previously
        # position_ids / head_mask / inputs_embeds were silently dropped.
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds)

        # Element 1 of the encoder output is used as the sentence-level ([CLS]-based) feature
        cls = self.dropout(encoder_outputs[1])

        # Batch normalization of the numerical features is disabled because it raised an error
        # whose cause was not investigated yet.
        # NOTE(review): probably num_feats arrives 1-D while BatchNorm1d expects (batch, features) — confirm
        # num_feats = self.bn(num_feats)

        # ====================
        #      MLP forward
        # ====================
        # Reshape to (batch, n_features) so more than one categorical/numerical column also works,
        # and align dtypes with the text feature before concatenating.
        batch_size = cls.size(0)
        all_feats = torch.cat(
            (
                cls,
                cat_feats.reshape(batch_size, -1).to(cls.dtype),
                num_feats.reshape(batch_size, -1).to(cls.dtype),
            ),
            dim=1)
        output = self.mlp(all_feats)

        return output


In [12]:
# Update config file: start from the pretrained checkpoint's config and attach custom feature sizes
from transformers import BertConfig

# TODO: find out whether the num_labels argument is actually needed
config = BertConfig.from_pretrained(env_dict['pretrained_url'], num_labels=2)

# Extra attributes read by BertWithMlp to size the MLP input
config.text_feat_dim = config.hidden_size
config.cat_feat_dim = 1
config.num_feat_dim = 1

print(config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "cat_feat_dim": 1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_feat_dim": 1,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "text_feat_dim": 768,
  "transformers_version": "4.27.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}



In [45]:
# Prepare model: load pretrained encoder weights into the custom BERT + MLP architecture
model = BertWithMlp.from_pretrained(
    env_dict['pretrained_url'],
    config      = config,
    hidden_dim  = env_dict['hidden_dim'],
    dropout     = env_dict['dropout']
    )

# Move to the device chosen in the GPU-preparation cell; a hard-coded .cuda() fails on CPU-only machines
model.to(device)

# Apply weight decaying except for bias & layer normalization term
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Prepare optimizer
# NOTE(review): AdamW here is the one imported from transformers, which is deprecated upstream;
# consider torch.optim.AdamW after checking that its defaults suit this run.
optimizer = AdamW(optimizer_grouped_parameters, lr=env_dict['lr'])

# Targets are one-hot encoded in the training loop, so a per-logit binary cross-entropy is used
criterion = nn.BCEWithLogitsLoss()

combined_feat_dim : 770
mlp : MLPPClassifier(
  (dropout): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0): Linear(in_features=770, out_features=192, bias=True)
    (1): Linear(in_features=192, out_features=48, bias=True)
    (2): Linear(in_features=48, out_features=12, bias=True)
    (3): Linear(in_features=12, out_features=2, bias=True)
  )
)


Some weights of the model checkpoint at klue/bert-base were not used when initializing BertWithMlp: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertWithMlp from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertWithMlp from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertWithMlp were not initialized from the model checkpoint at klue/bert-base and are newly initialized: 

In [47]:
# Optionally freeze the encoder so that only the head's parameters receive gradients
free_encoder = False          # observed: switching this to True noticeably hurts training results

if free_encoder:
  for encoder_param in model.base_model.parameters():
    encoder_param.requires_grad = False

# Train

In [15]:
def format_time(elapsed):
    """Render a duration given in seconds as a ``[D day[s], ]H:MM:SS`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [46]:
# Set the target of training right before training loop
# The value is used as the batch key from which the training loop reads its labels.
# env_dict also defines a separate 'target' entry with the same value.
env_dict['target_of_training'] = 'I/E'

In [17]:
def one_hot_embedding(labels, num_classes, device):
    """Convert integer class labels into one-hot rows.

    Args:
      labels: (LongTensor) class labels, sized [N,].
      num_classes: (int) number of classes.
      device: device on which the identity lookup table is created.

    Returns:
      (tensor) one-hot encoded labels, sized [N, #classes].
    """
    # Row k of the identity matrix is the one-hot vector of class k
    lookup = torch.eye(num_classes, device=device)
    encoded = lookup[labels]
    return encoded

In [None]:
# Training loop (for Yeongdo)
model.train()
total_loss = 0.0                                  # running sum over all steps of all epochs
target_key = env_dict['target_of_training']       # batch key that holds the labels

for epoch in range(env_dict['epoch']):
  t0 = time.time()
  epoch_loss = 0.0                                # reset every epoch so the reported average is per-epoch

  # Wrap the dataloader itself (not enumerate) so tqdm knows the total and can draw a progress bar
  for step, batch in enumerate(tqdm(train_dataloader)):

    input_ids       = batch['input_ids'].to(device)
    token_type_ids  = batch['token_type_ids'].to(device)
    attention_mask  = batch['attention_mask'].to(device)
    cat_input       = batch['cat_input'].to(device)
    num_input       = batch['num_input'].to(device)
    label           = batch[target_key].to(device)

    # Clear gradients left over from the previous step (or from an interrupted run)
    optimizer.zero_grad()

    # Forward
    output = model(input_ids        = input_ids,
                    token_type_ids  = token_type_ids,
                    attention_mask  = attention_mask,
                    cat_feats       = cat_input,
                    num_feats       = num_input)

    # Calculate loss
    loss = criterion(output, one_hot_embedding(label, 2, device))
    # Divide by the real batch size: the last batch may hold fewer samples than env_dict['batch_size']
    accu = (torch.argmax(output, dim=1) == label).sum().item() / label.size(0) * 100

    # .item() yields a plain float, so the accumulators do not keep the autograd graph alive
    step_loss = loss.item()
    print("Step loss: {0:.2f}".format(step_loss))
    print("Step accu: ", accu)

    loss.backward()
    optimizer.step()

    epoch_loss += step_loss
    total_loss += step_loss

  avg_train_loss = epoch_loss / len(train_dataloader)
  print("Epoch loss: {0:.2f}".format(avg_train_loss))
  print("Training epoch took: {:}".format(format_time(time.time() - t0)))

# Save trained model

In [55]:
# Save
#TODO: think of a save path whose name reflects the hyperparams
# save_path = './models/' + env_dict['lm'] + 'with' + env_dict['classifier'] + '.pt'

# torch.save({
#             'epoch': env_dict['epoch'],
#             'lr'   : env_dict['lr'],
#             'batch_size' : env_dict['batch_size'],
#             'free_encoder': free_encoder,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict()
#             }, save_path)

# Test

In [None]:
# Build the dataset and dataloader for the held-out test file, using the same settings as for training.
test_dataset = MBTIDataset(
    data_path           = env_dict['test_path'],
    question_path       = env_dict['question_path'],
    pretrained_url      = env_dict['pretrained_url'],
    padding_per_batch   = env_dict['padding_per_batch'],
    is_train            = True    # NOTE(review): is_train=True for the *test* file — confirm against MBTIDataset that this is intended
)

# This rebinds the notebook-level `data_collator` to one built from the test dataset's tokenizer;
# later cells that pass `data_collator` (e.g. the K-fold cell) will pick up this object.
data_collator = DataCollatorWithPadding(tokenizer=test_dataset.tokenizer) if env_dict['padding_per_batch'] else None

# Dataloader
test_dataloader = DataLoader(
    test_dataset,
    batch_size  = env_dict['batch_size'],
    shuffle     = env_dict['shuffle'],
    collate_fn  = data_collator
)

In [None]:
# How to load saved model
# ref : https://pytorch.org/tutorials/beginner/saving_loading_models.html


In [None]:
# Recommended: load the saved model in a separate file and run the test there (creating a test.py is fine too)
# Test cell (for Yeongdo)

# nn.Module has no .test(); eval() is what switches dropout / batch-norm to inference behaviour
model.eval()

target_key = env_dict['target_of_training']       # batch key that holds the labels
# NOTE(review): this assumes the test batches contain that label column — confirm for the test CSV

t0 = time.time()
test_loss = 0.0
n_correct = 0
n_seen = 0

# A single pass with gradients disabled: evaluation must neither build graphs nor update the weights
with torch.no_grad():
  for step, batch in enumerate(tqdm(test_dataloader)):

    input_ids       = batch['input_ids'].to(device)
    token_type_ids  = batch['token_type_ids'].to(device)
    attention_mask  = batch['attention_mask'].to(device)
    cat_input       = batch['cat_input'].to(device)
    num_input       = batch['num_input'].to(device)
    label           = batch[target_key].to(device)

    # Forward
    output = model(input_ids        = input_ids,
                    token_type_ids  = token_type_ids,
                    attention_mask  = attention_mask,
                    cat_feats       = cat_input,
                    num_feats       = num_input)

    # Accumulate loss and the number of correct predictions over the whole test set
    loss = criterion(output, one_hot_embedding(label, 2, device))
    test_loss += loss.item()
    n_correct += (torch.argmax(output, dim=1) == label).sum().item()
    n_seen    += label.size(0)

# max(..., 1) guards against an empty dataloader
avg_test_loss = test_loss / max(len(test_dataloader), 1)
test_accu = n_correct / max(n_seen, 1) * 100
print("Test loss: {0:.2f}".format(avg_test_loss))
print("Test accu: {0:.2f}".format(test_accu))
print("Test took: {:}".format(format_time(time.time() - t0)))

# K-Fold Cross Validation (work in progress)

In [None]:
# Training with cross validation (ref : https://velog.io/@pppanghyun/6.-%EA%B5%90%EC%B0%A8-%EA%B2%80%EC%A6%9DCross-Validation)
#TODO: Scheduler, Gradient Clipping
# NOTE(review): the same `model` instance keeps training across folds, so later folds validate on
# samples the model has already been trained on; re-create the model per fold for a clean estimate.

# Seeded so the fold assignment is reproducible
kfold     = KFold(n_splits=5, shuffle=True, random_state=seed_val)
# Same loss as the main training loop (the reference code's MSELoss was for a regression task)
criterion = nn.BCEWithLogitsLoss()

validation_loss = []
target_key = env_dict['target_of_training']       # batch key that holds the labels

# Split plain sample indices; only the number of samples matters for KFold
for fold, (train_idx, val_idx) in enumerate(kfold.split(np.arange(len(train_dataset)))):

    # Within one fold, train_idx and val_idx are disjoint; the samplers restrict each loader to its subset
    train_subsampler  = SubsetRandomSampler(train_idx)
    val_subsampler    = SubsetRandomSampler(val_idx)

    # Define dataloader using sampler (`shuffle` is omitted: it cannot be combined with a sampler)
    fold_train_dataloader = DataLoader(
        train_dataset,
        batch_size  = env_dict['batch_size'],
        sampler     = train_subsampler,
        collate_fn  = data_collator
    )
    # Built for the validation pass, which is not implemented yet
    fold_val_dataloader   = DataLoader(
        train_dataset,
        batch_size  = env_dict['batch_size'],
        sampler     = val_subsampler,
        collate_fn  = data_collator
    )

    # ===================
    #    Training Loop
    # ===================

    optimizer = AdamW(model.parameters(), lr=env_dict['lr'], weight_decay=env_dict['decay_rate'])

    model.train()

    for epoch in range(env_dict['epoch']):
      t0 = time.time()
      epoch_loss = 0.0

      # Iterate over this fold's loader (previously the full, global train_dataloader was used by mistake)
      for step, batch in enumerate(tqdm(fold_train_dataloader)):

        input_ids       = batch['input_ids'].to(device)
        token_type_ids  = batch['token_type_ids'].to(device)
        attention_mask  = batch['attention_mask'].to(device)
        cat_input       = batch['cat_input'].to(device)     # tabular features must live on the model's device too
        num_input       = batch['num_input'].to(device)
        labels          = batch[target_key].to(device)

        # Clear prior gradients
        model.zero_grad()

        # Forward
        output = model(input_ids      = input_ids,
                       token_type_ids = token_type_ids,
                       attention_mask = attention_mask,
                       cat_feats      = cat_input,
                       num_feats      = num_input)

        # Calculate loss for this step (previously a stale `loss` from another cell was reused)
        loss = criterion(output, one_hot_embedding(labels, 2, device))
        step_loss = loss.item()

        print("Step loss: {0:.2f}".format(step_loss))

        loss.backward()
        optimizer.step()

        epoch_loss += step_loss

      avg_train_loss = epoch_loss / len(fold_train_dataloader)
      print("Epoch loss: {0:.2f}".format(avg_train_loss))
      print("Training epoch took: {:}".format(format_time(time.time() - t0)))

#     train_rmse = evaluation(trainloader) # RMSE on the training data
#     val_rmse = evaluation(valloader)
#     print("k-fold", fold," Train Loss: %.4f, Validation Loss: %.4f" %(train_rmse, val_rmse)) 
#     validation_loss.append(val_rmse)

## Calculate validation score

# validation_loss = np.array(validation_loss)
# mean = np.mean(validation_loss)
# std = np.std(validation_loss)
# print("Validation Score: %.4f, ± %.4f" %(mean, std))