In [1]:
# !pip install transformers
# !pip install emoji
# !pip install soynlp
# !pip install accelerate -U

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from pprint import pprint

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR

from transformers import AutoTokenizer, AutoModelForTokenClassification

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import emoji
from soynlp.normalizer import repeat_normalize

In [2]:
import re
import ast
import emoji
from soynlp.normalizer import repeat_normalize

def change(txt):
    result = re.sub('\'', '', txt)[1:-1].split()
    return (result)

def change2(txt):
    return (ast.literal_eval(txt))

pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣]+')
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x):
    x = pattern.sub(' ', x)
    x = emoji.replace_emoji(x, replace='') #emoji 삭제
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

In [3]:
import json
MODEL_NAME = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

with open('./all_ner_tags_tag2id.json', 'r') as f:
  tag_to_index = json.load(f)

with open('./all_ner_tags_id2tag.json', 'r') as f:
  index_to_tag = json.load(f)

pad_token_id = tokenizer.pad_token_id
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_label_id = tag_to_index['O']
cls_token_label_id = tag_to_index['O']
sep_token_label_id = tag_to_index['O']

In [4]:
with open('./all_ner_tag_info.json', 'r') as f:
  raw_data = json.load(f)

train_sentence_data =[]
for idx1, s in enumerate(raw_data['sentence']):
  tmp = []
  remove_idx = []
  for idx2, w in enumerate(s):
    clean_text = clean(w)
    if len(clean(w)) != 0:
      tmp.append(clean_text)
    else:
      remove_idx.append(idx2)

  add_idx=0
  for idx2 in remove_idx:
    raw_data['tag'][idx1].pop(idx2 + add_idx)
    add_idx-=1

  train_sentence_data.append(tmp)

In [5]:
len(train_sentence_data)

464545

In [6]:
split_point = int(len(raw_data['sentence']) * 0.8)

train_sentence = train_sentence_data[:split_point]
train_label = raw_data['tag'][:split_point]

test_sentence = train_sentence_data[split_point:]
test_label = raw_data['tag'][split_point:]


In [46]:
len(train_sentence), len(test_sentence)

(371636, 92909)

In [7]:
import tensorflow as tf

def convert_features(examples, labels, max_seq_len, tokenizer, pad_token_id_for_segment = 0, pad_token_id_for_label = -100):

    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    tokenizer_data, data_labels = [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        tokens = []
        labels_ids = []
        for one_word, label_token in zip(example, label):
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)

            labels_ids.extend([tag_to_index[label_token]] + [pad_token_id_for_label] * (len(subword_tokens) - 1))

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            labels_ids = labels_ids[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        labels_ids += [pad_token_id_for_label]

        tokens = [cls_token] + tokens
        labels_ids = [pad_token_id_for_label] + labels_ids

        input_id = tokenizer.convert_tokens_to_ids(tokens)

        attention_mask = [1] * len(input_id)
        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)

        token_type_id = [pad_token_id_for_segment] * max_seq_len

        label = labels_ids + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "input length Error {} vs {}".format(len(input_id), max_seq_len)
        assert len(input_id) == max_seq_len, "attention mask Error {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(input_id) == max_seq_len, "type token Error {} vs {}".format(len(label), max_seq_len)

        tokenizer_data.append({
        'input_ids' : input_id,
        'attention_mask' : attention_mask,
        'token_type_ids' : token_type_id
        })
        data_labels.append(label)

    return tokenizer_data, data_labels


In [8]:
# tokenizer_train_sentence, tokenizer_train_labels =convert_features(train_sentence, train_label, 128, tokenizer=tokenizer)
tokenizer_test_sentence, tokenizer_test_labels =convert_features(test_sentence, test_label, 128, tokenizer=tokenizer)

100%|██████████| 92909/92909 [00:22<00:00, 4192.43it/s]


In [11]:
class TokenDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encoding = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key : torch.tensor(val) for key, val in self.encoding[idx].items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [12]:
# train_data_set = TokenDataset(tokenizer_train_sentence, tokenizer_train_labels)
test_data_set = TokenDataset(tokenizer_test_sentence, tokenizer_test_labels)

In [11]:
import torch
# device = torch.device('cuda:0')

In [12]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import math
traing_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 5,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 32,
    logging_dir = './loss',
    logging_steps = 500,
    learning_rate = 3e-5,
    weight_decay = 0.01,
    save_total_limit = 2,
    save_strategy = 'steps',
    evaluation_strategy = 'steps',
    save_steps= 500,
    eval_steps= 500,
    warmup_steps= math.ceil(len(train_data_set) * 5 / 64 * 0.1),
    seed=15,
    load_best_model_at_end=True,
)

In [14]:
from transformers import Trainer
# 손실함수를 따로 정의해줄 필요가 있어서 기본 트레이너에서 손실함수만 수정한 부분
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fn = torch.nn.CrossEntropyLoss().to(device = device)

        active_loss = torch.reshape(labels, (-1,)) != -100 
        tensor_size = active_loss.size()[0]
        active_logits_loss = active_loss.reshape(tensor_size, 1).expand(tensor_size, logits.size()[2])

        reduced_logits = torch.masked_select(torch.reshape(logits, (-1, logits.size()[2])), active_logits_loss)
        reduced_logits = reduced_logits.reshape(-1, logits.size()[2])
        labels = torch.masked_select(torch.reshape(labels, (-1,)), active_loss)

        loss = loss_fn(reduced_logits, labels)

        return (loss, outputs) if return_outputs else loss



In [15]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels = len(tag_to_index))
# model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base-v2022 were not used when initializing ElectraForTokenClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.weight', 'classifier.bias']

In [16]:
trainer = CustomTrainer(
    model = model,
    args = traing_args,
    train_dataset=train_data_set,
    eval_dataset=test_data_set,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [17]:
trainer.train()



Step,Training Loss,Validation Loss
500,1.3152,0.126782
1000,0.2028,0.072038
1500,0.1512,0.050982
2000,0.1227,0.043243
2500,0.1042,0.037843
3000,0.0935,0.034663
3500,0.0832,0.031754
4000,0.076,0.02988
4500,0.0761,0.031226
5000,0.0704,0.028179


TrainOutput(global_step=10500, training_loss=0.1343307604108538, metrics={'train_runtime': 13382.6613, 'train_samples_per_second': 138.85, 'train_steps_per_second': 1.085, 'total_flos': 8.779748238901555e+16, 'train_loss': 0.1343307604108538, 'epoch': 3.62})

In [1]:
trainer.evaluate()

NameError: name 'trainer' is not defined

In [15]:
from transformers import Trainer
import torch
device = torch.device('cuda:0')
model = AutoModelForTokenClassification.from_pretrained('./results/checkpoint-8000')
model.to(device)
trainer = Trainer(
    model=model
)

In [16]:
y_pred = trainer.predict(test_data_set)



In [17]:
preds = np.argmax(y_pred.predictions, axis=-1)

In [18]:
preds[109]

array([0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
index_to_tag[-100] = tokenizer.pad_token

In [27]:
index_to_tag2 = {key : 'B-'+value for key, value in index_to_tag.items()}

In [29]:
index_to_tag2['0'] = 'O'
index_to_tag2[-100] = tokenizer.pad_token

In [61]:
tokenizer.decode(test_data_set[107]['input_ids'][:6])

'[CLS] 우리도 나중에 멀지 않아서 통일'

In [57]:
preds[109], y_pred.label_ids[107]

(array([0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([-100,    0,    0,    0,    0,    0,    0, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100

In [31]:
pred_lst = []
for pred_labels, pred_pred in zip(y_pred.label_ids, preds):
    tmp =[]
    for idx in range(128):
        if pred_labels[idx] != -100:
            tmp.append(index_to_tag2[str(pred_pred[idx])])
        else:
            tmp.append(index_to_tag2[-100])
    pred_lst.append(tmp)

In [32]:
true_y = []
for l in tokenizer_test_labels:
  tmp = []
  for i, l_id in enumerate(l):
    if l_id != -100:
      tmp.append(index_to_tag2[str(l_id)])
    else:
          tmp.append(index_to_tag2[-100])
  true_y.append(tmp)


In [42]:
tmp_pred, tmp_turey = [], []
for tmp_p, tmp_y in zip(pred_lst, true_y):
    tmp1 = [value for value in tmp_p if value != '[PAD]']
    tmp2 = [value for value in tmp_y if value != '[PAD]']

    tmp_pred.append(tmp1), tmp_turey.append(tmp2)

In [44]:
len(tmp_pred), len(tmp_turey), tmp_pred[0]

(92909, 92909, ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer
true_y = MultiLabelBinarizer().fit_transform(true_y)
pred_lst = MultiLabelBinarizer().fit_transform(pred_lst)

In [None]:
score = f1_score(true_y, pred_lst, average='micro')

In [45]:
from seqeval.metrics import classification_report
print(classification_report(tmp_turey, tmp_pred))

              precision    recall  f1-score   support

          AF       0.82      0.80      0.81       164
         AFA       0.81      0.75      0.78       425
         AFW       0.05      0.14      0.08        14
          AM       0.85      0.86      0.85       461
          CV       0.84      0.90      0.87      2805
          DT       0.93      0.95      0.94       993
          EV       0.73      0.64      0.68        74
          FD       0.63      0.61      0.62       112
          LC       0.67      0.60      0.63        90
         LCG       0.83      0.89      0.86       105
         LCP       0.90      0.92      0.91       548
          MT       0.62      0.67      0.64        27
         OGG       0.70      0.70      0.70       304
          PS       0.83      0.83      0.83       483
          PT       0.72      0.64      0.68        73
          QT       0.87      0.94      0.90      1470
          TI       0.87      0.94      0.91       155
          TM       0.71    

In [None]:
def seq2tag(label_ids, pred_ids):
    label_list = []
    pred_list = []
    for i in range(0, len(label_ids)): 
        label_tag = []
        pred_tag = []
        for label_index, pred_index in zip(label_ids[i], pred_ids[i]):
            if label_index != -100:
                label_tag.append(index_to_tag[str(label_index)])
                pred_tag.append(index_to_tag[str(pred_index)])
            label_list.append(label_tag)
            pred_list.append(pred_tag)
    return label_list, pred_list

label_list, pred_list = seq2tag(tokenizer_test_labels, preds)
        
print(classification_report(label_list, pred_list))

In [None]:
label_list[0], pred_list[0]

In [33]:
tag_to_index.keys()

dict_keys(['O', 'CV', 'AFA', 'DT', 'OGG', 'QT', 'AM', 'LCP', 'TMM', 'MT', 'TI', 'TMI', 'PS', 'PT', 'AF', 'AFW', 'TMIG', 'TR', 'LCG', 'TM', 'FD', 'LC', 'EV'])

In [None]:
label_list[0]

In [None]:
from seqeval.metrics import classification_report
print(classification_report(label_list, pred_list))

In [36]:
tag_to_index.keys()

dict_keys(['O', 'CV', 'AFA', 'DT', 'OGG', 'QT', 'AM', 'LCP', 'TMM', 'MT', 'TI', 'TMI', 'PS', 'PT', 'AF', 'AFW', 'TMIG', 'TR', 'LCG', 'TM', 'FD', 'LC', 'EV'])