Last Update @ 2020.12.04

- Huggingface Transformers 4.0.0  버전 반영

# Package 설치 & 데이터 받기

In [1]:
!pip install -q transformers pytorch_lightning emoji soynlp

In [2]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df = pd.read_excel('/content/drive/MyDrive/NLP Project/한국어_단발성_대화_데이터셋_최종.xlsx')
df.head()

Unnamed: 0,Sentence,Emotion,New
0,언니 동생으로 부르는게 맞는 일인가요..??,공포,불안
1,그냥 내 느낌일뿐겠지?,공포,불안
2,아직너무초기라서 그런거죠?,공포,불안
3,유치원버스 사고 낫다던데,공포,중립
4,근데 원래이런거맞나요,공포,불안


In [6]:
df.dropna(inplace = True)
print(df.isnull().sum())

Sentence    0
Emotion     0
New         0
dtype: int64


In [7]:
df.drop('Emotion', axis = 1, inplace = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19538 entries, 0 to 33164
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  19538 non-null  object
 1   New       19538 non-null  object
dtypes: object(2)
memory usage: 457.9+ KB
None


In [9]:
df.loc[(df['New'] == '행복'), 'New'] = 0
df.loc[(df['New'] == '중립'), 'New'] = 1
df.loc[(df['New'] == '불안'), 'New'] = 2
df.loc[(df['New'] == '슬픔'), 'New'] = 3
df.loc[(df['New'] == '분노'), 'New'] = 4

In [10]:
#데이터 비율 맞춰주기, 2000개 문장으로 먼저 학습진행
df_sample = pd.DataFrame(columns = ['Sentence', 'New'])
for i in df['New'].unique():
  sent = df.query('New==@i').sample(n = 2000)
  df_sample = pd.concat([df_sample, sent])#행방향으로 concat

df_sample = df_sample.reset_index(drop=True)
df_sample['New'].value_counts()

2    2000
1    2000
4    2000
0    2000
3    2000
Name: New, dtype: int64

In [20]:
df_sample.rename(columns = {'Sentence':'document', 'New':'label'}, inplace = True)

In [21]:
df_sample.head()

Unnamed: 0,document,label
0,이게 거의 폭설급인대 ㅡㅡ....,2
1,그렇게 친근하게 다가 오니 좀 의심되더라구요..,2
2,내가불안해서꾸는걸까..,2
3,뭘 할지도 모르겠고.,2
4,요즘 TV만 틀면 메르스 기사인지라정말이지 조바심도 나고 어찌해야 할지..,2


In [11]:
# data_list = []
# for ques, label in zip(df_sample['Sentence'], df_sample['New'])  :
#     data = []   
#     data.append(ques)
#     data.append(str(label))

#     data_list.append(data)

In [26]:
#train & test 데이터로 나누기
from sklearn.model_selection import train_test_split
                                                         
dataset_train, dataset_test = train_test_split(df_sample, test_size=0.25, random_state=0, stratify = df_sample['label'])
print(len(dataset_train))
print(len(dataset_test))

7500
2500


In [27]:
dataset_test['label'].value_counts()

4    500
3    500
1    500
2    500
0    500
Name: label, dtype: int64

In [28]:
dataset_test.head()

Unnamed: 0,document,label
4501,인맥축구 그만하고 실력껏 뽑아라,4
8601,제발 너무 힘들어요 정말,3
3245,그래서 언제 올지 모르는 기약없는 기다림을 해야하는데지금 무지 잠이 쏟아지고 있음....,1
9386,난 이제 그 사람 번호가 없거든요,3
2052,야구선수는 실력으로 보여주면 된다 ㅎㅎ,1


In [86]:
dataset_train.to_excel('/content/drive/MyDrive/NLP Project/train_data.xlsx')
dataset_test.to_excel('/content/drive/MyDrive/NLP Project/test_data.xlsx')

# 패키지 import & 기본 Args 설정

In [10]:
import os
import pandas as pd

from pprint import pprint

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR, CosineAnnealingWarmRestarts

from pytorch_lightning import LightningModule, Trainer, seed_everything

from transformers import BertForSequenceClassification, BertTokenizer, AdamW

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import emoji
from soynlp.normalizer import repeat_normalize

In [6]:
# !pip uninstall emoji
# !pip install emoji==1.7.0

Found existing installation: emoji 2.2.0
Uninstalling emoji-2.2.0:
  Would remove:
    /usr/local/lib/python3.8/dist-packages/emoji-2.2.0.dist-info/*
    /usr/local/lib/python3.8/dist-packages/emoji/*
Proceed (Y/n)? y
  Successfully uninstalled emoji-2.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji==1.7.0
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 KB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=9255302f5541098fda721eec9d27b1526b5f783b89b4e4b639e707d0b3da11a4
  Stored in directory: /root/.cache/pip/wheels/5e/8c/80/c3646df8201ba6f5070297fe3779a4b70265d0bfd961c15302
Successfully built emoji
Installing c

In [4]:
emoji.__version__

'1.7.0'

## 기본 학습 Arguments

In [5]:
class Arg:
    random_seed: int = 42  # Random Seed
    pretrained_model: str = 'beomi/kcbert-base'  # Transformers PLM name
    pretrained_tokenizer: str = ''  # Optional, Transformers Tokenizer Name. Overrides `pretrained_model`
    auto_batch_size: str = 'power'  # Let PyTorch Lightening find the best batch size 
    batch_size: int = 0  # Optional, Train/Eval Batch Size. Overrides `auto_batch_size` 
    lr: float = 5e-6  # Starting Learning Rate
    epochs: int = 10  # Max Epochs
    max_length: int = 150  # Max Length input size
    report_cycle: int = 100  # Report (Train Metrics) Cycle
    train_data_path: str = '/content/drive/MyDrive/NLP Project/train_data.xlsx'  # Train Dataset file , 학습 데이터 파일 바꾸기
    val_data_path: str = '/content/drive/MyDrive/NLP Project/test_data.xlsx'  # Validation Dataset file 
    cpu_workers: int = os.cpu_count()  # Multi cpu workers
    test_mode: bool = False  # Test Mode enables `fast_dev_run`
    optimizer: str = 'AdamW'  # AdamW vs AdamP
    lr_scheduler: str = 'cos'  # ExponentialLR vs CosineAnnealingWarmRestarts
    fp16: bool = False  # Enable train on FP16
    tpu_cores: int = 0  # Enable TPU with 1 core or 8 cores

args = Arg()

## 기본값을 Override 하고싶은 경우 아래와 같이 수정

In [6]:
!nvidia-smi

Wed Jan 25 13:41:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    30W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

위에서 GPU가 V100/P100이면 아래 `batch_size`  를 32 이상으로 하셔도 됩니다.

In [6]:
# args.tpu_cores = 8  # Enables TPU
args.fp16 = True  # Enables GPU FP16
args.batch_size = 32  # Force setup batch_size

# Model 만들기 with Pytorch Lightning

In [7]:
class Model(LightningModule):
    def __init__(self, options):
        super().__init__()
        self.args = options
        #수정 num_labels = 5
        self.bert = BertForSequenceClassification.from_pretrained(self.args.pretrained_model, num_labels = 5)
        self.tokenizer = BertTokenizer.from_pretrained(
            self.args.pretrained_tokenizer
            if self.args.pretrained_tokenizer
            else self.args.pretrained_model
        )

    def forward(self, **kwargs):
        return self.bert(**kwargs)

    def training_step(self, batch, batch_idx):
        data, labels = batch
        output = self(input_ids=data, labels=labels)

        # Transformers 4.0.0+
        loss = output.loss
        logits = output.logits
        
        preds = logits.argmax(dim=-1)

        y_true = labels.cpu().numpy()
        y_pred = preds.cpu().numpy()

        # Acc, Precision, Recall, F1
        metrics = [
            metric(y_true=y_true, y_pred=y_pred, average = 'micro')
            for metric in
            (precision_score, recall_score, f1_score)
        ]

        tensorboard_logs = {
            'train_loss': loss.cpu().detach().numpy().tolist(),
            'train_precision': metrics[0],
            'train_recall': metrics[1],
            'train_f1': metrics[2],
        }
        if (batch_idx % self.args.report_cycle) == 0:
            print()
            pprint(tensorboard_logs)
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        data, labels = batch
        output = self(input_ids=data, labels=labels)

        # Transformers 4.0.0+
        loss = output.loss
        logits = output.logits

        preds = logits.argmax(dim=-1)

        y_true = list(labels.cpu().numpy())
        y_pred = list(preds.cpu().numpy())

        return {
            'loss': loss,
            'y_true': y_true,
            'y_pred': y_pred,
        }

    def validation_epoch_end(self, outputs):
        loss = torch.tensor(0, dtype=torch.float)
        for i in outputs:
            loss += i['loss'].cpu().detach()
        _loss = loss / len(outputs)

        loss = float(_loss)
        y_true = []
        y_pred = []

        for i in outputs:
            y_true += i['y_true']
            y_pred += i['y_pred']

        # Acc, Precision, Recall, F1
        metrics = [
            metric(y_true=y_true, y_pred=y_pred, average = 'micro')
            for metric in
            (precision_score, recall_score, f1_score)
        ]

        tensorboard_logs = {
            'val_loss': loss,
            'val_precision': metrics[0],
            'val_recall': metrics[1],
            'val_f1': metrics[2],
        }

        print()
        pprint(tensorboard_logs)
        return {'loss': _loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        if self.args.optimizer == 'AdamW':
            optimizer = AdamW(self.parameters(), lr=self.args.lr)
        elif self.args.optimizer == 'AdamP':
            from adamp import AdamP
            optimizer = AdamP(self.parameters(), lr=self.args.lr)
        else:
            raise NotImplementedError('Only AdamW and AdamP is Supported!')
        if self.args.lr_scheduler == 'cos':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
        elif self.args.lr_scheduler == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=0.5)
        else:
            raise NotImplementedError('Only cos and exp lr scheduler is Supported!')
        return {
            'optimizer': optimizer,
            'scheduler': scheduler,
        }

    def read_data(self, path):
        if path.endswith('xlsx'):
            return pd.read_excel(path)
        elif path.endswith('csv'):
            return pd.read_csv(path)
        elif path.endswith('tsv') or path.endswith('txt'):
            return pd.read_csv(path, sep='\t')
        else:
            raise NotImplementedError('Only Excel(xlsx)/Csv/Tsv(txt) are Supported')

    def preprocess_dataframe(self, df):
        emojis = ''.join(emoji.UNICODE_EMOJI.keys())
        pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

        def clean(x):
            x = pattern.sub(' ', x)
            x = url_pattern.sub('', x)
            x = x.strip()
            x = repeat_normalize(x, num_repeats=2)
            return x

        df['document'] = df['document'].map(lambda x: self.tokenizer.encode(
            clean(str(x)),
            padding='max_length',
            max_length=self.args.max_length,
            truncation=True,
        ))
        return df

    def train_dataloader(self):
        df = self.read_data(self.args.train_data_path)
        df = self.preprocess_dataframe(df)

        dataset = TensorDataset(
            torch.tensor(df['document'].to_list(), dtype=torch.long),
            torch.tensor(df['label'].to_list(), dtype=torch.long),
        )
        return DataLoader(
            dataset,
            batch_size=self.args.batch_size or self.batch_size,
            shuffle=True,
            num_workers=self.args.cpu_workers,
        )

    def val_dataloader(self):
        df = self.read_data(self.args.val_data_path)
        df = self.preprocess_dataframe(df)

        dataset = TensorDataset(
            torch.tensor(df['document'].to_list(), dtype=torch.long),
            torch.tensor(df['label'].to_list(), dtype=torch.long),
        )
        return DataLoader(
            dataset,
            batch_size=self.args.batch_size or self.batch_size,
            shuffle=False,
            num_workers=self.args.cpu_workers,
        )

In [8]:
def main():
    print("Using PyTorch Ver", torch.__version__)
    print("Fix Seed:", args.random_seed)
    seed_everything(args.random_seed)
    model = Model(args)

    print(":: Start Training ::")
    trainer = Trainer(
        max_epochs=args.epochs,
        fast_dev_run=args.test_mode,
        num_sanity_val_steps=None if args.test_mode else 0,
        auto_scale_batch_size=args.auto_batch_size if args.auto_batch_size and not args.batch_size else False,
        # For GPU Setup
        deterministic=torch.cuda.is_available(),
        gpus=-1 if torch.cuda.is_available() else None,
        precision=16 if args.fp16 else 32,
        # For TPU Setup
        tpu_cores=args.tpu_cores if args.tpu_cores else None,
    )
    trainer.fit(model)

# 학습!

> 주의: 1epoch별로 GPU-P100기준 약 1-2시간, GPU V100기준 ~30분이 걸립니다.

> Update @ 2020.09.01
> 최근 Colab Pro에서 V100이 배정됩니다.

```python
# 1epoch 기준 아래 score가 나옵니다.
{'val_acc': 0.90522,
 'val_f1': 0.9049023739289227,
 'val_loss': 0.23429009318351746,
 'val_precision': 0.9143146796431468,
 'val_recall': 0.8956818813808446}
```

In [9]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "-1"

In [None]:
main()

INFO:lightning_fabric.utilities.seed:Global seed set to 42


Using PyTorch Ver 1.13.1+cu116
Fix Seed: 42


Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

:: Start Training ::


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
INFO:pytorch_lightning.callbacks.model_summary:
  | Name | Type                          | Params
-------------------------------------------------------
0 | bert | BertForSequenceClassification | 108 M 
-------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
217.845   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]


{'train_f1': 0.1875,
 'train_loss': 1.728912353515625,
 'train_precision': 0.1875,
 'train_recall': 0.1875}

{'train_f1': 0.5625,
 'train_loss': 1.372344970703125,
 'train_precision': 0.5625,
 'train_recall': 0.5625}

{'train_f1': 0.59375,
 'train_loss': 1.2631149291992188,
 'train_precision': 0.59375,
 'train_recall': 0.59375}


Validation: 0it [00:00, ?it/s]


{'val_f1': 0.7072,
 'val_loss': 0.8016902804374695,
 'val_precision': 0.7072,
 'val_recall': 0.7072}

{'train_f1': 0.6875,
 'train_loss': 0.7636184692382812,
 'train_precision': 0.6875,
 'train_recall': 0.6875}

{'train_f1': 0.8125,
 'train_loss': 0.5947599411010742,
 'train_precision': 0.8125,
 'train_recall': 0.8125}

{'train_f1': 0.84375,
 'train_loss': 0.4520282745361328,
 'train_precision': 0.84375,
 'train_recall': 0.84375}


Validation: 0it [00:00, ?it/s]


{'val_f1': 0.7312,
 'val_loss': 0.7238824963569641,
 'val_precision': 0.7312,
 'val_recall': 0.7312}

{'train_f1': 0.78125,
 'train_loss': 0.6294317245483398,
 'train_precision': 0.78125,
 'train_recall': 0.78125}

{'train_f1': 0.65625,
 'train_loss': 0.8144717216491699,
 'train_precision': 0.65625,
 'train_recall': 0.65625}

{'train_f1': 0.6875,
 'train_loss': 0.7557592391967773,
 'train_precision': 0.6875,
 'train_recall': 0.6875}


Validation: 0it [00:00, ?it/s]


{'val_f1': 0.736,
 'val_loss': 0.7225971221923828,
 'val_precision': 0.736,
 'val_recall': 0.736}

{'train_f1': 0.8125,
 'train_loss': 0.5438685417175293,
 'train_precision': 0.8125,
 'train_recall': 0.8125}

{'train_f1': 0.71875,
 'train_loss': 0.47316503524780273,
 'train_precision': 0.71875,
 'train_recall': 0.71875}

{'train_f1': 0.84375,
 'train_loss': 0.3368723392486572,
 'train_precision': 0.84375,
 'train_recall': 0.84375}


Validation: 0it [00:00, ?it/s]


{'val_f1': 0.7376,
 'val_loss': 0.7420414090156555,
 'val_precision': 0.7376,
 'val_recall': 0.7376}
