In [None]:
!pip install transformers

In [2]:
import pandas as pd
import numpy as np
import pickle

from transformers import BertTokenizerFast, BertModel
from torchtext import transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from torch.optim import Adam
from tqdm import tqdm

# Selects which set of data paths the path-configuration cell assigns: 'docker' or 'colab'.
computer = 'colab'

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [3]:
# Resolve the dataset locations for the selected execution environment.
if computer == 'docker':
    # NOTE(review): this branch points at CSV files and never sets valid_data_path,
    # while the loading cell unpickles train/valid/test paths — confirm the docker
    # layout before running with computer == 'docker'.
    train_data_path = '/root/share/data/train.csv'
    test_data_path = '/root/share/data/test.csv'
elif computer == 'colab':
    train_data_path = '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/train_dataset.pkl'
    valid_data_path = '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/valid_dataset.pkl'
    test_data_path = '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/test_dataset.pkl'
else:
    # Fail here, with a clear message, instead of with a NameError on the first
    # use of a path variable that was never assigned.
    raise ValueError(f"Unknown computer setting: {computer!r} (expected 'docker' or 'colab')")

In [4]:
# The tokenizer and the encoder weights are loaded from the same pretrained checkpoint.
pretrained_checkpoint = 'kykim/bert-kor-base'
tokenizer = BertTokenizerFast.from_pretrained(pretrained_checkpoint)
model = BertModel.from_pretrained(pretrained_checkpoint)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
class CustomTrainDataset(Dataset):
  """Labelled text dataset: morpheme-split each document, then tokenize it.

  Args:
    df: DataFrame with a 'data' column (raw text) and a 'category' column (labels).
      It is only read; the caller's frame is left untouched.
    tokenizer: Callable invoked once per document on the space-joined morphemes with
      padding='max_length', max_length=max_length, truncation=True, return_tensors='pt'.
    tag: Object exposing ``morphs(sentence)`` that returns an iterable of strings.
    max_length: Sequence length every document is padded / truncated to.

  Only ``labels`` and ``sentence`` are stored on the instance, and the methods read
  nothing else.
  """

  def __init__(self, df, tokenizer, tag, max_length=300):
    self.labels = list(df['category'])
    # Build the pre-processed text in a local list instead of assigning it back to
    # df['data']: writing into the caller's frame made construction non-idempotent
    # (a second construction from the same df would re-split already-joined text).
    joined = [' '.join(self.tokenizer(text, tag=tag)) for text in df['data']]
    self.sentence = [
        tokenizer(sentence, padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')
        for sentence in joined
    ]

  def tokenizer(self, sentence, tag):
    """Split ``sentence`` into morphemes using ``tag.morphs``."""
    return tag.morphs(sentence)

  def __len__(self):
    """Number of labelled examples."""
    return len(self.labels)

  def get_batch_labels(self, idx):
    """Return the label at ``idx`` wrapped in a NumPy array."""
    return np.array(self.labels[idx])

  def get_batch_sentences(self, idx):
    """Return the tokenizer output stored for the document at ``idx``."""
    return self.sentence[idx]

  def __getitem__(self, idx):
    """Return ``(tokenized_document, label)`` for position ``idx``."""
    batch_sentence = self.get_batch_sentences(idx)
    batch_y = self.get_batch_labels(idx)

    return batch_sentence, batch_y

class CustomTestDataset(Dataset):
  """Unlabelled text dataset: morpheme-split each document, then tokenize it.

  Args:
    df: DataFrame with a 'data' column holding raw text. It is only read; the
      caller's frame is left untouched.
    tokenizer: Callable invoked once per document on the space-joined morphemes with
      padding='max_length', max_length=max_length, truncation=True, return_tensors='pt'.
    tag: Object exposing ``morphs(sentence)`` that returns an iterable of strings.
    max_length: Sequence length every document is padded / truncated to.

  Only ``sentence`` is stored on the instance, and the methods read nothing else.
  """

  def __init__(self, df, tokenizer, tag, max_length=300):
    # Build the pre-processed text in a local list instead of assigning it back to
    # df['data']: writing into the caller's frame made construction non-idempotent
    # (a second construction from the same df would re-split already-joined text).
    joined = [' '.join(self.tokenizer(text, tag=tag)) for text in df['data']]
    self.sentence = [
        tokenizer(sentence, padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')
        for sentence in joined
    ]

  def tokenizer(self, sentence, tag):
    """Split ``sentence`` into morphemes using ``tag.morphs``."""
    return tag.morphs(sentence)

  def __len__(self):
    """Number of documents."""
    return len(self.sentence)

  def get_batch_sentences(self, idx):
    """Return the tokenizer output stored for the document at ``idx``."""
    return self.sentence[idx]

  def __getitem__(self, idx):
    """Return the tokenized document at position ``idx``."""
    batch_sentence = self.get_batch_sentences(idx)

    return batch_sentence

In [6]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only point
# these paths at files you trust.
def _read_pickle(path):
  """Return the object unpickled from the binary file at ``path``."""
  with open(path, 'rb') as handle:
    return pickle.load(handle)

train_dataset = _read_pickle(train_data_path)
valid_dataset = _read_pickle(valid_data_path)
test_dataset = _read_pickle(test_data_path)

In [7]:
class BertClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a 3-way linear head.

  ``forward`` returns raw, unnormalized class scores (logits). The training loop
  in this notebook feeds the output to ``nn.CrossEntropyLoss``, which applies
  log-softmax itself; applying Softmax here as well squashed the scores twice and
  flattened the gradients. ``argmax`` over logits selects the same class as
  ``argmax`` over their softmax, so prediction code is unaffected.

  Args:
    dropout: Drop probability applied to the pooled encoder output.
  """

  def __init__(self, dropout=0.5):
    super().__init__()

    self.bert = BertModel.from_pretrained('kykim/bert-kor-base')
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, 3)
    # Retained so callers can still obtain probabilities explicitly via
    # ``model.softmax(logits)``; dim=1 normalizes over the class axis and avoids
    # the deprecated implicit-dim behavior of ``nn.Softmax()``.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input, mask):
    """Return class logits for token ids ``input`` under attention mask ``mask``."""
    # With return_dict=False the encoder returns a tuple; the second element holds
    # the information of the [CLS] token.
    _, pooled = self.bert(input, attention_mask=mask, return_dict=False)
    pooled = self.dropout(pooled)
    return self.linear(pooled)

In [8]:
def train(model, train_data, val_data, lr, epochs):
  """Fine-tune ``model`` and print loss / accuracy for both splits after each epoch.

  Args:
    model: Module called as ``model(input_ids, attention_mask)`` that returns one
      score per class for each example (the class is taken with ``argmax(dim=1)``).
    train_data: Dataset yielding ``(encoding, label)`` pairs, where ``encoding`` has
      'input_ids' and 'attention_mask' entries carrying a singleton dimension at
      index 1 after batching (it is squeezed away here).
    val_data: Dataset with the same item structure, used for evaluation only.
    lr: Learning rate for the Adam optimizer.
    epochs: Number of passes over ``train_data``.

  Returns:
    None. One summary line is printed per epoch.
  """
  train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
  valid_dataloader = DataLoader(val_data, batch_size=16, shuffle=False)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  model = model.to(device)
  loss_func = nn.CrossEntropyLoss().to(device)
  optimizer = Adam(model.parameters(), lr=lr)

  for epoch in range(epochs):
    total_train_accuracy = 0.0
    total_train_loss = 0.0

    # Enable dropout for the optimization pass (validation switches it off below).
    model.train()
    for batch, label in tqdm(train_dataloader):
      label = label.to(device)
      mask = batch['attention_mask'].squeeze(1).to(device)
      input_id = batch['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)

      batch_loss = loss_func(output, label)
      # The criterion returns the batch mean; weight it by the batch size so that
      # dividing by the dataset length below yields a true per-example average.
      total_train_loss += batch_loss.item() * label.size(0)

      total_train_accuracy += (output.argmax(dim=1) == label).sum().item()

      optimizer.zero_grad()
      batch_loss.backward()
      optimizer.step()

    total_val_accuracy = 0.0
    total_val_loss = 0.0

    # Disable dropout so validation metrics are deterministic for a given model.
    model.eval()
    with torch.no_grad():
      for batch, label in valid_dataloader:
        label = label.to(device)
        mask = batch['attention_mask'].squeeze(1).to(device)
        input_id = batch['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = loss_func(output, label)
        total_val_loss += batch_loss.item() * label.size(0)

        total_val_accuracy += (output.argmax(dim=1) == label).sum().item()

    print(
      f'Epochs: {epoch + 1} | Train Loss: {total_train_loss / len(train_data): .3f} \
      | Train Accuracy: {total_train_accuracy / len(train_data): .3f} \
      | Val Loss: {total_val_loss / len(val_data): .3f} \
      | Val Accuracy: {total_val_accuracy / len(val_data): .3f}')

In [None]:
# Hyperparameters for the fine-tuning run.
EPOCHS = 5
lr = 1e-6

# Build a fresh classifier and fine-tune it on the train / validation splits.
model = BertClassifier()
train(model, train_dataset, valid_dataset, lr=lr, epochs=EPOCHS)

In [24]:
def infer(model, test_data):
  """Predict a class index for every item of ``test_data``.

  Args:
    model: Module called as ``model(input_ids, attention_mask)`` that returns one
      score per class for each example.
    test_data: Dataset yielding encodings with 'input_ids' and 'attention_mask'
      entries carrying a singleton dimension at index 1 after batching.

  Returns:
    List of predicted class indices, in dataset order (the loader does not shuffle).
  """
  test_dataloader = DataLoader(test_data, batch_size=16, shuffle=False)
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = model.to(device)

  # Dropout must be disabled for prediction, otherwise repeated calls can return
  # different classes; remember the current mode so it can be restored afterwards.
  was_training = model.training
  model.eval()

  res = []
  try:
    # No gradients are needed here; skipping the autograd graph saves memory/time.
    with torch.no_grad():
      for batch in tqdm(test_dataloader):
        mask = batch['attention_mask'].squeeze(1).to(device)
        input_id = batch['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)

        res.extend(output.argmax(dim=1).tolist())
  finally:
    model.train(was_training)
  return res

In [25]:
# Predicted class index for every test example, in dataset order.
res = infer(model, test_dataset)

  
100%|██████████| 313/313 [01:43<00:00,  3.03it/s]


In [30]:
# Fill the sample submission's 'category' column with the predictions, then save it.
submission = pd.read_csv(
    '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/sample_submission.csv'
).assign(category=res)
submission.to_csv(
    '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/bert_submission.csv',
    index=False,
)