<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/Korean_hate_speach/Korean_hate_speach_classification_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets

In [2]:
import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm

from datasets import load_dataset

from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          get_linear_schedule_with_warmup,
                          AdamW)
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [3]:
seed =42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [4]:
dataset = load_dataset("jeanlee/kmhas_korean_hate_speech")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 78977
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 8776
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 21939
    })
})

class_label:
*   0: origin (출신차별)
*   1: physical (외모차별)
*   2: politics (정치성향차별)
*   3: profanity (혐오욕설)
*   4: age (연령차별)
*   5: gender (성차별)
*   6: race (인종차별)
*   7: religion (종교차별)
*   8: not_hate_speech (혐오아님)

In [5]:
id2label ={0: 'origin',
           1: 'physical',
           2: 'politics',
           3: 'profanity',
           4: 'age',
           5: 'gender',
           6: 'race',
           7: 'religion',
           8: 'not_hate_speech'}
label2id = {v:k for k, v in id2label.items()}
label2id

{'origin': 0,
 'physical': 1,
 'politics': 2,
 'profanity': 3,
 'age': 4,
 'gender': 5,
 'race': 6,
 'religion': 7,
 'not_hate_speech': 8}

In [6]:
labels = [id for id in id2label.keys()]
labels

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [7]:
train = dataset['train']
valid = dataset['validation']
test = dataset['test']

In [8]:
# Multi label classification
print(train[0]['label'])
print(train[1]['label'])
print(train[2]['label'])

[2, 4]
[8]
[2]


In [9]:
MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
max_len = 64

enc = MultiLabelBinarizer()
enc.fit(train[:]['label'])

def preprocessing_data(examples):
  text = examples['text']
  tokenized = tokenizer(text, padding='max_length',
                        truncation=True,
                        max_length=max_len,
                        return_tensors='pt',
                        add_special_tokens=True)

  enc_label = enc.transform(examples['label'])
  float_arr = np.vstack(enc_label[:]).astype(float)
  tokenized['labels'] = float_arr.tolist()
  return tokenized['input_ids'], tokenized['attention_mask'], torch.tensor(tokenized['labels'])

In [11]:
input_ids_train, attention_masks_train, labels_train  = preprocessing_data(train)
input_ids_valid, attention_masks_valid, labels_valid  = preprocessing_data(valid)
input_ids_test, attention_masks_test, labels_test  = preprocessing_data(test)

train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
valid_dataset = TensorDataset(input_ids_valid, attention_masks_valid, labels_valid)
test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [12]:
len(train_dataset), len(valid_dataset), len(test_dataset)

(78977, 8776, 21939)

In [13]:
BATCH_SIZE = 16
trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
# for i, data in enumerate(trainloader):
#   if i==1:
#     print(data[0].shape)
#     input_ids_train, attention_masks_train, labels_train = data
#     print("input_ids_train", input_ids_train)
#     print("attention_masks_train", attention_masks_train)
#     print("labels_train", labels_train)
#     break

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           num_labels= len(id2label),
                                                           problem_type="multi_label_classification")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def multi_label_metrics(predictions, labels, threshold=0.5):
  sigmoid = nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions))
  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs >= threshold)] = 1
  y_true = labels

  accuracy = accuracy_score(y_true, y_pred)
  f1_macro_average = f1_score(y_true=y_true, y_pred= y_pred, average='macro', zero_division=0)
  f1_micro_average = f1_score(y_true=y_true, y_pred= y_pred, average='micro', zero_division=0)
  f1_weighted_average = f1_score(y_true=y_true, y_pred= y_pred, average='weighted', zero_division=0)
  roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')

  metrics = {'accuracy': accuracy,
             'f1_macro': f1_macro_average,
             'f1_micro': f1_micro_average,
             'f1_weighted': f1_weighted_average,
             'roc_auc': roc_auc}
  return metrics

In [17]:
Learning_rate = 2e-5
epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = AdamW(model.parameters(), lr = Learning_rate, eps=1e-8)
total_steps = len(trainloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
model = model.to(device)



In [18]:
import time
import datetime
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [19]:
class Trainer():
  def __init__(self, trainloader, validloader, testloader, model, optimizer, scheduler, device):
    self.trainloader = trainloader
    self.validloader = validloader
    self.testloader = testloader
    self.model = model
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.device = device

  def train(self, epochs=1):
    self.model.train()
    for epoch in range(1, epochs+1):
      running_loss = 0.0
      t0 = time.time()
      print('=========================Epoch {:}/{:}========================='.format(epoch, epochs))
      for step, batch in tqdm(enumerate(self.trainloader)):
        if step % 1000 == 0 and not step == 0:
          elapsed = format_time(time.time()-t0)
          print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(self.trainloader), elapsed))

        self.optimizer.zero_grad()
        batch = tuple(b.to(self.device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                  }
        outputs = model(**inputs)

        loss = outputs[0]
        running_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        self.optimizer.step()
        self.scheduler.step()

      train_loss = running_loss/len(self.trainloader)
      valid_loss = self.validate()
      tqdm.write('Train loss:{:.5f}, Validation loss:{:.5f}'.format(train_loss,  valid_loss))

  def validate(self):
    self.model.eval()
    t0 = time.time()
    predictions, true_vals = [], []
    with torch.no_grad():
      running_loss = 0.0
      for batch in self.validloader:
        batch = tuple(b.to(self.device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                  }
        outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    valid_loss = running_loss/len(self.validloader)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    results = multi_label_metrics(predictions, true_vals)
    print("Accuracy: {0:.4f}".format(results['accuracy']))
    print("F1 (Macro) Score: {0:.4f}".format(results['f1_macro']))
    print("F1 (Micro) Score: {0:.4f}".format(results['f1_micro']))
    print("F1 (Weighted) Score: {0:.4f}".format(results['f1_weighted']))
    print("ROC-AUC: {0:.4f}".format(results['roc_auc']))
    return valid_loss

  def test(self):
    self.model.eval()
    predictions, true_vals = [], []
    with torch.no_grad():
      for batch in self.testloader:
        batch = tuple(b.to(self.device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                  }
        outputs = model(**inputs)
        logits= outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    results = multi_label_metrics(predictions, true_vals)
    print("Accuracy: {0:.4f}".format(results['accuracy']))
    print("F1 (Macro) Score: {0:.4f}".format(results['f1_macro']))
    print("F1 (Micro) Score: {0:.4f}".format(results['f1_micro']))
    print("F1 (Weighted) Score: {0:.4f}".format(results['f1_weighted']))
    print("ROC-AUC: {0:.4f}".format(results['roc_auc']))

In [20]:
trainer = Trainer(trainloader= trainloader,
                  validloader = validloader,
                  testloader = testloader,
                  model = model,
                  optimizer = optimizer,
                  scheduler = scheduler,
                  device = device)
trainer.train(epochs=epochs)



0it [00:00, ?it/s]

  Batch 1,000  of  4,937.    Elapsed: 0:01:12.
  Batch 2,000  of  4,937.    Elapsed: 0:02:25.
  Batch 3,000  of  4,937.    Elapsed: 0:03:37.
  Batch 4,000  of  4,937.    Elapsed: 0:04:57.
Accuracy: 0.7918
F1 (Macro) Score: 0.7240
F1 (Micro) Score: 0.8457
F1 (Weighted) Score: 0.8444
ROC-AUC: 0.9081
Train loss:0.12887, Validation loss:0.00000


0it [00:00, ?it/s]

  Batch 1,000  of  4,937.    Elapsed: 0:01:11.
  Batch 2,000  of  4,937.    Elapsed: 0:02:22.
  Batch 3,000  of  4,937.    Elapsed: 0:03:33.
  Batch 4,000  of  4,937.    Elapsed: 0:04:43.
Accuracy: 0.8075
F1 (Macro) Score: 0.7527
F1 (Micro) Score: 0.8543
F1 (Weighted) Score: 0.8514
ROC-AUC: 0.9121
Train loss:0.07613, Validation loss:0.00000


0it [00:00, ?it/s]

  Batch 1,000  of  4,937.    Elapsed: 0:01:11.
  Batch 2,000  of  4,937.    Elapsed: 0:02:22.
  Batch 3,000  of  4,937.    Elapsed: 0:03:34.
  Batch 4,000  of  4,937.    Elapsed: 0:04:45.
Accuracy: 0.8079
F1 (Macro) Score: 0.7582
F1 (Micro) Score: 0.8521
F1 (Weighted) Score: 0.8506
ROC-AUC: 0.9121
Train loss:0.03692, Validation loss:0.00000


In [21]:
trainer.test()

Accuracy: 0.8059
F1 (Macro) Score: 0.7686
F1 (Micro) Score: 0.8525
F1 (Weighted) Score: 0.8514
ROC-AUC: 0.9122
