In [1]:
!pip install transformers --q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset

from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
from transformers import BertConfig
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime
from tqdm import tqdm

In [3]:
# load data
train = pd.read_csv('./data/ko_train_label.csv')
test = pd.read_csv('./data/ko_test_label.csv')

In [4]:
test.drop(columns=['Unnamed: 7'], inplace=True)
test.dropna(inplace=True)

In [5]:
len(test), len(train)

(9999, 9999)

In [29]:
# make bert inputs
def make_bert_inputs(data, max_len=128):
    # add [CLS], [SEP] tokens
    sentences = data['document']
    sentences = ['[CLS] ' + str(sentence) + ' [SEP]' for sentence in sentences]
    
    # tokenize
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    
    # padding
    MAX_LEN = max_len
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
    
    # attention mask
    attention_masks = []
    for seq in tqdm(input_ids):
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
        
    return tokenized_texts, input_ids, attention_masks

def make_labels(data, columns):
    labels = torch.tensor(data[columns].values)    
    return labels

def train_split(inputs_ids, labels, attention_masks, random_state, test_size=0.2):
    # split train, validation
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(inputs_ids, labels, random_state=random_state, test_size=test_size)
    
    # split attention mask
    train_masks, validation_masks, _, _ = train_test_split(attention_masks, inputs_ids, random_state=random_state, test_size=test_size)
    
    return train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks

def convert_to_tensor(inputs, labels, masks):
    # convert to tensor
    inputs_to_tensor = torch.tensor(inputs)
    labels_to_tensor = torch.tensor(labels)
    masks_to_tensor = torch.tensor(masks)
    
    return inputs_to_tensor, labels_to_tensor, masks_to_tensor

def custom_dataset(inputs, labels, masks, batch_size):
    # make custom dataset
    BATCH_SIZE = batch_size
    data = TensorDataset(inputs, masks, labels)
    sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)
    
    return dataloader
    

In [30]:
# tokenized_texts, input_ids, attention_masks
train_tokenized_texts, train_input_ids, train_attention_masks = make_bert_inputs(train)
test_tokenized_texts, test_input_ids, test_attention_masks = make_bert_inputs(test)

# make label to tensor
train_labels = make_labels(train, train.columns[2:].tolist())
test_labels = make_labels(test, test.columns[2:].tolist())

# split train, validation
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_split(train_input_ids, train_labels, train_attention_masks, random_state=42)

# convert to tensor
train_input_tensor, train_label_tensor, train_mask_tensor = convert_to_tensor(train_inputs, train_labels, train_masks)



100%|██████████| 9999/9999 [00:00<00:00, 21557.22it/s]
100%|██████████| 9999/9999 [00:00<00:00, 21570.65it/s]
  labels_to_tensor = torch.tensor(labels)


In [23]:
train_labels = make_labels(train, train.columns[2:].tolist())