In [1]:
!pip install transformers --q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset

from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
from transformers import BertConfig
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime
from tqdm import tqdm

In [3]:
# load data
train = pd.read_csv('./data/ko_train_label.csv')
test = pd.read_csv('./data/ko_test_label.csv')

In [4]:
test.drop(columns=['Unnamed: 7'], inplace=True)
test.dropna(inplace=True)

In [5]:
len(test), len(train)

(9999, 9999)

In [29]:
# make bert inputs
def make_bert_inputs(data, max_len=128):
    # add [CLS], [SEP] tokens
    sentences = data['document']
    sentences = ['[CLS] ' + str(sentence) + ' [SEP]' for sentence in sentences]
    
    # tokenize
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    
    # padding
    MAX_LEN = max_len
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
    
    # attention mask
    attention_masks = []
    for seq in tqdm(input_ids):
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
        
    return tokenized_texts, input_ids, attention_masks

def make_labels(data, columns):
    labels = torch.tensor(data[columns].values)    
    return labels

def train_split(inputs_ids, labels, attention_masks, random_state, test_size=0.2):
    # split train, validation
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(inputs_ids, labels, random_state=random_state, test_size=test_size)
    
    # split attention mask
    train_masks, validation_masks, _, _ = train_test_split(attention_masks, inputs_ids, random_state=random_state, test_size=test_size)
    
    return train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks

def convert_to_tensor(inputs, labels, masks):
    # convert to tensor
    inputs_to_tensor = torch.tensor(inputs)
    labels_to_tensor = torch.tensor(labels)
    masks_to_tensor = torch.tensor(masks)
    
    return inputs_to_tensor, labels_to_tensor, masks_to_tensor

def custom_dataset(inputs, labels, masks, batch_size):
    # make custom dataset
    BATCH_SIZE = batch_size
    data = TensorDataset(inputs, masks, labels)
    sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)
    
    return dataloader
    

In [43]:
# tokenized_texts, input_ids, attention_masks
train_tokenized_texts, train_input_ids, train_attention_masks = make_bert_inputs(train)
test_tokenized_texts, test_input_ids, test_attention_masks = make_bert_inputs(test)

# make label to tensor
train_labels = make_labels(train, train.columns[2:].tolist())
test_labels = make_labels(test, test.columns[2:].tolist())

# split train, validation
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_split(train_input_ids, train_labels, train_attention_masks, random_state=42)

# convert to tensor train
train_input_tensor, train_label_tensor, train_mask_tensor = convert_to_tensor(train_inputs, train_labels, train_masks)

# convert to tensor test
test_input_tensor, test_label_tensor, test_mask_tensor = convert_to_tensor(test_input_ids, test_labels, test_attention_masks)

# conver to tensor validation
validation_inputs_tensor, validation_labels_tensor, validation_masks_tensor = convert_to_tensor(validation_inputs, validation_labels, validation_masks)

# make custom dataset
train_dataloader = custom_dataset(train_input_tensor, train_label_tensor, train_mask_tensor, batch_size=128)
test_dataloader = custom_dataset(test_input_tensor, test_label_tensor, test_mask_tensor, batch_size=128)
validation_dataloader = custom_dataset(validation_inputs_tensor, validation_labels_tensor, validation_masks_tensor, batch_size=128)

100%|██████████| 9999/9999 [00:00<00:00, 23065.55it/s]
100%|██████████| 9999/9999 [00:00<00:00, 21874.31it/s]
  labels_to_tensor = torch.tensor(labels)


## Train BERT

In [68]:
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

if device_name == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3, 1), 'GB')
else:
    print(device_name)

Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 13.9 GB
Cached:    14.1 GB




In [54]:
# load Bert
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=5)
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [47]:
# run optim
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [48]:
def flat_acc(pred, labels):
    pred_flat = np.argmax(pred, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [65]:
import gc

gc.collect()

8

In [67]:
torch.cuda.empty_cache()

In [None]:
# train
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# initialize Gradient
model.zero_grad()

# initialize loss
total_loss = 0

# train for epoch
for epoch_i in range(0, epochs):
    # ----------------------------
    #          Training
    # ----------------------------
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    
    # start time
    t0 = time.time()
    
    # train mode
    model.train()
    
    # dataloader with batch
    for step, batch in enumerate(train_dataloader):
        # progress update
        if step%500==0 and not step==0:
            elapsed = format_time(time.time() - t0)
            print('        Batch {:>5,} of {:>5,}.          Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        
        # batch to device
        batch = tuple(t.to(device) for t in batch)
        
        # unpack batch
        b_input_ids, b_input_mask, b_labels = batch
        
        # Forward pass
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels
                        )
        
        # loss
        loss = outputs[0]
        
        # calc total loss
        total_loss += loss.item()
        
        # backward pass
        loss.backward()
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # update parameters
        optimizer.step()
        
        # update learning rate
        scheduler.step()
        
        # initialize gradient
        model.zero_grad()
        
    # avg loss
    avg_train_loss = total_loss / len(train_dataloader)
    
    print("")
    print('        Average training loss: {0:.2f}'.format(avg_train_loss))
    print('        Training epoch took: {:}'.format(format_time(time.time() - t0)))
    
    # ----------------------------
    #         Validation
    # ----------------------------
    
    print("")
    print('        Running Validation...')
    
    t0 = time.time()
    
    # eval mode
    model.eval()
    
    # initialize
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    
    # dataloader with batch
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask
                            )
        # loss
        logits = outputs[0]
        
        # move logits, labels to cpu
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # calc accuracy
        tmp_eval_accuracy = flat_acc(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
        
    print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print(" Validation took: {:}".format(format_time(time.time() - t0)))


print("")
print("Training complete!")

In [45]:
for step, batch in enumerate(train_dataloader):
    print(step)
    print(batch)

0
[tensor([[   101,  80956,   8857,  ...,      0,      0,      0],
        [   101,   9659,  22458,  ...,      0,      0,      0],
        [   101,   9708, 119235,  ...,      0,      0,      0],
        ...,
        [   101,   9069,  86580,  ...,      0,      0,      0],
        [   101,  42428,  20595,  ...,      0,      0,      0],
        [   101,   9659, 118959,  ...,      0,      0,      0]]), tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]]), tensor([[1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 