In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.4 MB/s[0m eta [36m0:00:0

In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

## Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
data_dir = '/content/drive/MyDrive/data/ethic'
train = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t', header=None)
test = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t', header=None)

In [5]:
print(len(train))
train = train.dropna()
print(len(train))

363154
363153


In [6]:
train_sen = list(train[0])
train_label = list(train[1])
train_score = list(train[2])

test_sen = list(test[0])
test_label = list(test[1])
test_score = list(test[2])

In [7]:
model_name = 'kykim/bert-kor-base'

In [8]:
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

In [9]:
max_len = 0
for sentence in train_sen:
  tokens = tokenizer.tokenize(sentence)
  length = len(tokens)
  if length > max_len:
    max_len = length

max_len

99

In [10]:
train_sen, valid_sen, train_label, valid_label, train_score, valid_score = train_test_split(train_sen, train_label, train_score, test_size=0.1, random_state=42)

In [11]:
len(train_sen), len(valid_sen)

(326837, 36316)

## DataLoader

In [12]:
class TextDataset(Dataset):
  def __init__(self, sen, label, score):
    self.text = sen
    self.label = label
    self.score = score

  def __len__(self):
    return len(self.text)

  def __getitem__(self, idx):
    text = self.text[idx]
    label = self.label[idx]
    score = self.score[idx]

    return text, label, score

In [13]:
train_dataset = TextDataset(train_sen, train_label, train_score)
valid_dataset = TextDataset(valid_sen, valid_label, valid_score)
test_dataset = TextDataset(test_sen, test_label, test_score)

In [14]:
batch = 64

train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=False)

## Model

In [15]:
class Classifier(nn.Module):
  def __init__(self, model_name, hidden_dim, output_dim, dropout=0.2, device='cuda'):
    super(Classifier, self).__init__()
    self.bert = BertModel.from_pretrained(model_name)
    self.fc1 = nn.Linear(self.bert.config.hidden_size, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, output_dim)

    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.dropout = nn.Dropout(dropout)

    self.device = device

  def forward(self, x):
    y = self.sigmoid(self.bert(**x).pooler_output)
    y = self.dropout(y)

    y = self.sigmoid(self.fc1(y))
    y = self.dropout(y)

    y = self.sigmoid(self.fc2(y))

    return y

## Train

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [17]:
model = Classifier(model_name, 200, 1).to(device)
lr = 1e-5

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss()

num_epochs = 15
early_stop = 3

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [18]:
train_label[0]

True

In [18]:
from tqdm import tqdm
from time import time

In [19]:
torch.cuda.empty_cache()

In [22]:
min_loss = np.inf
stop_count = 0
for epoch in range(1, num_epochs+1):
  start = time()
  train_loss = 0.0
  valid_loss = 0.0
  model.train()
  pbar = tqdm(train_loader, desc=f'Epoch {epoch}')
  for idx, batch in enumerate(pbar):
    text, label, score = batch
    label = torch.tensor(label, dtype=torch.float).to(device)
    # score = torch.tensor(score, dtype=torch.float).to(device)
    encoding = tokenizer(text, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt').to(device)

    output = model(encoding)

    loss = criterion(output.squeeze(), label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    pbar.set_postfix(loss = loss.item())

    train_loss += loss.item()

  train_loss /= idx

  with torch.no_grad():
    model.eval()
    pbar = tqdm(valid_loader, desc=f'Epoch {epoch}')

    for idx, batch in enumerate(pbar):
      text, label, score = batch
      label = torch.tensor(label, dtype=torch.float).to(device)
      # score = torch.tensor(score, dtype=torch.float).to(device)
      encoding = tokenizer(text, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt').to(device)

      output = model(encoding)
      loss = criterion(output.squeeze(), label)

      pbar.set_postfix(loss = loss.item())

      valid_loss += loss.item()

    valid_loss /= idx
    if valid_loss <= min_loss:
      stop_count = 0
      min_loss = valid_loss
      best_epoch = epoch
      torch.save(model.state_dict(), os.path.join(data_dir, 'best_model.pt'))
    else:
      stop_count += 1
      if stop_count == early_stop:
        print("Early stop!")
        print(f'Best epoch : {best_epoch}')
        break

  print(f'Epoch: {epoch} | Elapsed time: {time()-start} | Training Loss: {train_loss:.3f} | Valid Loss: {valid_loss:.3f}')

  label = torch.tensor(label, dtype=torch.float).to(device)
Epoch 1: 100%|██████████| 5107/5107 [1:22:37<00:00,  1.03it/s, loss=0.374]
  label = torch.tensor(label, dtype=torch.float).to(device)
Epoch 1: 100%|██████████| 568/568 [03:25<00:00,  2.76it/s, loss=0.182]


Epoch: 1 | Elapsed time: 5165.651659727097 | Training Loss: 0.402 | Valid Loss: 0.347


Epoch 2: 100%|██████████| 5107/5107 [1:22:42<00:00,  1.03it/s, loss=0.351]
Epoch 2: 100%|██████████| 568/568 [03:26<00:00,  2.76it/s, loss=0.165]


Epoch: 2 | Elapsed time: 5175.261150598526 | Training Loss: 0.326 | Valid Loss: 0.334


Epoch 3: 100%|██████████| 5107/5107 [1:22:43<00:00,  1.03it/s, loss=0.293]
Epoch 3: 100%|██████████| 568/568 [03:25<00:00,  2.76it/s, loss=0.125]


Epoch: 3 | Elapsed time: 5168.65065741539 | Training Loss: 0.288 | Valid Loss: 0.343


Epoch 4: 100%|██████████| 5107/5107 [1:22:39<00:00,  1.03it/s, loss=0.145]
Epoch 4: 100%|██████████| 568/568 [03:25<00:00,  2.76it/s, loss=0.13]


Epoch: 4 | Elapsed time: 5165.114332675934 | Training Loss: 0.251 | Valid Loss: 0.360


Epoch 5: 100%|██████████| 5107/5107 [1:22:39<00:00,  1.03it/s, loss=0.179]
Epoch 5: 100%|██████████| 568/568 [03:25<00:00,  2.76it/s, loss=0.074]

Early stop!
Best epoch : 2





## Evaluate

In [47]:
pbar = tqdm(test_loader)
precision = 0.0
recall = 0.0
accuracy = 0.0

test_loss = 0.0

model.load_state_dict(torch.load(os.path.join(data_dir, 'best_model.pt')))

with torch.no_grad():
  model.eval()
  for idx, batch in enumerate(pbar):
    text, label, score = batch
    label = torch.tensor(label, dtype=torch.float).to(device)
    score = torch.tensor(score, dtype=torch.float).to(device)
    encoding = tokenizer(text, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt').to(device)

    output = model(encoding)

    y = torch.where(output.squeeze()<0.5, 0, 1)

    accuracy += (torch.eq(y, label).sum().item()/len(label))
    precision += (torch.logical_and(y, label).sum().item()/y.sum().item())
    recall += (torch.logical_and(y, label).sum().item()/label.sum().item())

  precision /= idx
  recall /= idx
  accuracy /= idx
  f1_score = 2*(precision*recall)/(precision+recall)

print(f'Precision : {precision:.3f} | Recall : {recall:.3f} | Accuracy : {accuracy:.3f} | F1 score : {f1_score:.3f}')

  label = torch.tensor(label, dtype=torch.float).to(device)
  score = torch.tensor(score, dtype=torch.float).to(device)
100%|██████████| 707/707 [04:03<00:00,  2.90it/s]

Precision : 0.864 | Recall : 0.892 | Accuracy : 0.863 | F1 score : 0.878



