In [None]:
try:
    import transformers, emoji, soynlp, pytorch_lightning
except:
    !pip install -U -q transformers emoji soynlp pytorch-lightning

[K     |████████████████████████████████| 3.8 MB 7.3 MB/s 
[K     |████████████████████████████████| 175 kB 58.1 MB/s 
[K     |████████████████████████████████| 416 kB 63.3 MB/s 
[K     |████████████████████████████████| 527 kB 63.0 MB/s 
[K     |████████████████████████████████| 895 kB 53.1 MB/s 
[K     |████████████████████████████████| 67 kB 6.3 MB/s 
[K     |████████████████████████████████| 596 kB 50.3 MB/s 
[K     |████████████████████████████████| 6.5 MB 50.1 MB/s 
[K     |████████████████████████████████| 829 kB 63.2 MB/s 
[K     |████████████████████████████████| 134 kB 68.9 MB/s 
[K     |████████████████████████████████| 398 kB 61.1 MB/s 
[K     |████████████████████████████████| 952 kB 55.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 50.9 MB/s 
[K     |████████████████████████████████| 144 kB 71.8 MB/s 
[K     |████████████████████████████████| 94 kB 4.6 MB/s 
[K     |████████████████████████████████| 271 kB 74.8 MB/s 
[?25h  Building wheel for em

In [None]:
import os
import pandas as pd

from pprint import pprint

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR

from pytorch_lightning import LightningModule, Trainer, seed_everything

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import emoji
from soynlp.normalizer import repeat_normalize

In [None]:
# Run configuration: unpacked into `Model(**args)` and also read when building the Trainer.
args = dict(
    random_seed=42,  # Random Seed
    pretrained_model='beomi/KcELECTRA-base',  # Transformers PLM name
    pretrained_tokenizer='',  # Optional, Transformers Tokenizer Name. Overrides `pretrained_model`
    batch_size=64,
    lr=5e-6,  # Starting Learning Rate
    epochs=4,  # Max Epochs
    max_length=150,  # Max Length input size
    train_data_path="DB_train.csv",  # Train Dataset file
    val_data_path="DB_test.csv",  # Validation Dataset file
    test_mode=False,  # Test Mode enables `fast_dev_run`
    optimizer='AdamW',  # AdamW vs AdamP
    lr_scheduler='exp',  # ExponentialLR vs CosineAnnealingWarmRestarts
    fp16=True,  # Enable train on FP16(if GPU)
    tpu_cores=0,  # Enable TPU with 1 core or 8 cores
    cpu_workers=os.cpu_count(),  # Number of DataLoader worker processes
)

In [None]:
!nvidia-smi

Sat Mar 26 03:42:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
class Model(LightningModule):
    """Sequence-classification LightningModule built on a Hugging Face pretrained model.

    The module owns the classifier, its tokenizer, the text cleaning/encoding
    pipeline, the train/validation dataloaders and the per-epoch metric logging.
    Every keyword argument passed to the constructor is stored in ``self.hparams``.
    """

    # Compiled regexes used by `clean`; built lazily once and shared by all instances.
    _keep_pattern = None
    _url_pattern = None

    def __init__(self, **kwargs):
        """Store the hyperparameters and load the pretrained classifier and tokenizer."""
        super().__init__()
        self.save_hyperparameters()  # stores the kwargs above in self.hparams

        self.clsfier = AutoModelForSequenceClassification.from_pretrained(self.hparams.pretrained_model)
        # An explicitly configured tokenizer name takes precedence over the model name.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.hparams.pretrained_tokenizer
            if self.hparams.pretrained_tokenizer
            else self.hparams.pretrained_model
        )

    def forward(self, **kwargs):
        """Forward all keyword arguments to the wrapped classifier and return its output."""
        return self.clsfier(**kwargs)

    def step(self, batch, batch_idx):
        """Run one batch through the model.

        Args:
            batch: ``(input_ids, labels)`` pair as produced by `dataloader`.
            batch_idx: index of the batch (unused).

        Returns:
            dict with the ``loss`` tensor plus ``y_true`` / ``y_pred`` lists of labels.
        """
        data, labels = batch
        # NOTE(review): only input_ids are passed (no attention_mask) although `encode`
        # pads every sample to max_length — confirm this is intended.
        output = self(input_ids=data, labels=labels)

        # Transformers 4.0.0+
        loss = output.loss
        logits = output.logits

        preds = logits.argmax(dim=-1)

        y_true = list(labels.detach().cpu().numpy())
        y_pred = list(preds.detach().cpu().numpy())

        return {
            'loss': loss,
            'y_true': y_true,
            'y_pred': y_pred,
        }

    def training_step(self, batch, batch_idx):
        """Training step; delegates to `step`."""
        return self.step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        """Validation step; delegates to `step`."""
        return self.step(batch, batch_idx)

    def epoch_end(self, outputs, state='train'):
        """Aggregate the step outputs of one epoch, then log and print the metrics.

        Args:
            outputs: list of dicts returned by `step`.
            state: prefix for the logged metric names (``'train'`` or ``'val'``).

        Returns:
            dict holding the mean ``loss`` over the epoch.
        """
        loss = torch.tensor(0, dtype=torch.float)
        for i in outputs:
            loss += i['loss'].cpu().detach()
        loss = loss / len(outputs)

        y_true = []
        y_pred = []
        for i in outputs:
            y_true += i['y_true']
            y_pred += i['y_pred']

        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        rec = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        self.log(state+'_loss', float(loss), on_epoch=True, prog_bar=True)
        self.log(state+'_acc', acc, on_epoch=True, prog_bar=True)
        self.log(state+'_precision', prec, on_epoch=True, prog_bar=True)
        self.log(state+'_recall', rec, on_epoch=True, prog_bar=True)
        self.log(state+'_f1', f1, on_epoch=True, prog_bar=True)
        print(f'[Epoch {self.trainer.current_epoch} {state.upper()}] Loss: {loss}, Acc: {acc}, Prec: {prec}, Rec: {rec}, F1: {f1}')
        return {'loss': loss}

    def training_epoch_end(self, outputs):
        """Log the aggregated training metrics for the epoch."""
        self.epoch_end(outputs, state='train')

    def validation_epoch_end(self, outputs):
        """Log the aggregated validation metrics for the epoch."""
        self.epoch_end(outputs, state='val')

    def configure_optimizers(self):
        """Build the optimizer and LR scheduler selected in the hyperparameters.

        Returns:
            dict with the ``optimizer`` and its ``lr_scheduler``.

        Raises:
            NotImplementedError: for an unsupported optimizer or scheduler name.
        """
        if self.hparams.optimizer == 'AdamW':
            optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        elif self.hparams.optimizer == 'AdamP':
            from adamp import AdamP
            optimizer = AdamP(self.parameters(), lr=self.hparams.lr)
        else:
            raise NotImplementedError('Only AdamW and AdamP is Supported!')
        if self.hparams.lr_scheduler == 'cos':
            # Imported here because the notebook's import cell only brings in ExponentialLR.
            from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
        elif self.hparams.lr_scheduler == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=0.5)
        else:
            raise NotImplementedError('Only cos and exp lr scheduler is Supported!')
        # Lightning reads the scheduler from the 'lr_scheduler' key; under any other
        # key it is ignored and the learning rate never changes.
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
        }

    def read_data(self, path):
        """Load a dataset file into a DataFrame, choosing the reader by file extension.

        Raises:
            NotImplementedError: if the extension is not xlsx, csv, tsv or txt.
        """
        if path.endswith('xlsx'):
            return pd.read_excel(path)
        elif path.endswith('csv'):
            return pd.read_csv(path)
        elif path.endswith('tsv') or path.endswith('txt'):
            return pd.read_csv(path, sep='\t')
        else:
            raise NotImplementedError('Only Excel(xlsx)/Csv/Tsv(txt) are Supported')

    @staticmethod
    def _emoji_chars():
        """Return the regex-escaped set of code points used by the emoji known to `emoji`."""
        table = getattr(emoji, 'EMOJI_DATA', None)  # emoji >= 2.0: {emoji: metadata}
        if table is None:
            table = emoji.UNICODE_EMOJI
            # emoji 1.x nests the table per language ({lang: {emoji: name}}), so the
            # top-level keys are language codes rather than emoji.
            if any(isinstance(value, dict) for value in table.values()):
                table = {symbol for per_lang in table.values() for symbol in per_lang}
        return re.escape(''.join(sorted(set(''.join(table)))))

    @classmethod
    def _clean_patterns(cls):
        """Return the (keep, url) regex pair, compiling and caching it on first use."""
        if cls._keep_pattern is None or cls._url_pattern is None:
            emojis = cls._emoji_chars()
            cls._keep_pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
            cls._url_pattern = re.compile(
                r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
        return cls._keep_pattern, cls._url_pattern

    def clean(self, x):
        """Normalize one text sample.

        Characters outside the allowed set (ASCII, Hangul, a few symbols, emoji) are
        replaced by a space, URLs are removed, surrounding whitespace is stripped and
        repeated characters are collapsed via ``repeat_normalize(num_repeats=2)``.
        """
        pattern, url_pattern = self._clean_patterns()
        x = pattern.sub(' ', x)
        x = url_pattern.sub('', x)
        x = x.strip()
        x = repeat_normalize(x, num_repeats=2)
        return x

    def encode(self, x, **kwargs):
        """Clean `x` and tokenize it, padded/truncated to ``hparams.max_length``.

        Extra keyword arguments are forwarded to ``tokenizer.encode``.
        """
        return self.tokenizer.encode(
            self.clean(str(x)),
            padding='max_length',
            max_length=self.hparams.max_length,
            truncation=True,
            **kwargs,
        )

    def preprocess_dataframe(self, df):
        """Replace the ``message`` column of `df` (in place) with its encoded form."""
        df['message'] = df['message'].map(self.encode)
        return df

    def dataloader(self, path, shuffle=False):
        """Build a DataLoader over the ``message`` / ``label`` columns of the file at `path`."""
        df = self.read_data(path)
        df = self.preprocess_dataframe(df)

        dataset = TensorDataset(
            torch.tensor(df['message'].to_list(), dtype=torch.long),
            torch.tensor(df['label'].to_list(), dtype=torch.long),
        )
        # Scale the batch by the TPU core count when TPUs are configured; without the
        # parentheses the conditional expression replaced the batch size by the core count.
        cores = self.hparams.tpu_cores if self.hparams.tpu_cores else 1
        return DataLoader(
            dataset,
            batch_size=self.hparams.batch_size * cores,
            shuffle=shuffle,
            num_workers=self.hparams.cpu_workers,
        )

    def train_dataloader(self):
        """Shuffled DataLoader over ``hparams.train_data_path``."""
        return self.dataloader(self.hparams.train_data_path, shuffle=True)

    def val_dataloader(self):
        """Unshuffled DataLoader over ``hparams.val_data_path``."""
        return self.dataloader(self.hparams.val_data_path, shuffle=False)

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint

# Checkpointing policy: track `val_acc` and keep the three checkpoints with the highest value.
checkpoint_callback = ModelCheckpoint(
    monitor='val_acc',
    mode='max',
    save_top_k=3,
    # The template already spells out the metric names, so automatic insertion is switched off.
    auto_insert_metric_name=False,
    filename='epoch{epoch}-val_acc{val_acc:.4f}',
)

In [None]:
!nvidia-smi

Sat Mar 26 03:42:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Report the environment, fix the seeds, then build the model from the run configuration.
print("Using PyTorch Ver", torch.__version__)
print("Fix Seed:", args['random_seed'])
seed_everything(args['random_seed'])
model = Model(**args)

print(":: Start Training ::")
# All GPU-dependent Trainer options below derive from this availability check.
use_gpu = torch.cuda.is_available()
trainer = Trainer(
    max_epochs=args['epochs'],
    callbacks=[checkpoint_callback],
    fast_dev_run=args['test_mode'],
    num_sanity_val_steps=0 if not args['test_mode'] else None,
    # GPU setup: use the GPU at index 0 when one is available
    gpus=[0] if use_gpu else None,
    deterministic=use_gpu,
    precision=16 if (args['fp16'] and use_gpu) else 32,
    # TPU setup (disabled)
    #tpu_cores=args['tpu_cores'] if args['tpu_cores'] else None,
)
trainer.fit(model)

Global seed set to 42


Using PyTorch Ver 1.10.0+cu111
Fix Seed: 42


Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.weight', 'classifier.

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/387k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


:: Start Training ::



  | Name    | Type                             | Params
-------------------------------------------------------------
0 | clsfier | ElectraForSequenceClassification | 124 M 
-------------------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
249.093   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

[Epoch 0 VAL] Loss: 0.6400181651115417, Acc: 0.6508333333333334, Prec: 0.8175438596491228, Rec: 0.3883333333333333, F1: 0.5265536723163841
[Epoch 0 TRAIN] Loss: 0.6824734807014465, Acc: 0.5544791666666666, Prec: 0.5663200608673599, Rec: 0.46520833333333333, F1: 0.510808646917534


Validating: 0it [00:00, ?it/s]

[Epoch 1 VAL] Loss: 0.2972770929336548, Acc: 0.89, Prec: 0.8887043189368771, Rec: 0.8916666666666667, F1: 0.8901830282861898
[Epoch 1 TRAIN] Loss: 0.4269288182258606, Acc: 0.8305208333333334, Prec: 0.8377687885884607, Rec: 0.8197916666666667, F1: 0.8286827419185004


Validating: 0it [00:00, ?it/s]

[Epoch 2 VAL] Loss: 0.2426200658082962, Acc: 0.9129166666666667, Prec: 0.9188503803888419, Rec: 0.9058333333333334, F1: 0.9122954259336971
[Epoch 2 TRAIN] Loss: 0.26733192801475525, Acc: 0.9065625, Prec: 0.9138918345705196, Rec: 0.8977083333333333, F1: 0.9057277982133474


Validating: 0it [00:00, ?it/s]

[Epoch 3 VAL] Loss: 0.22179995477199554, Acc: 0.92625, Prec: 0.9612263300270514, Rec: 0.8883333333333333, F1: 0.9233434387180598
[Epoch 3 TRAIN] Loss: 0.2072506546974182, Acc: 0.9266666666666666, Prec: 0.9342663273960984, Rec: 0.9179166666666667, F1: 0.9260193358554015


In [None]:
from glob import glob

# Look in every Lightning run directory rather than only `version_0`, so a re-run
# (which writes to version_1, version_2, ...) does not select a stale checkpoint.
ckpt_paths = glob('./lightning_logs/*/checkpoints/*.ckpt')
if not ckpt_paths:
    raise FileNotFoundError('No checkpoint found under ./lightning_logs/*/checkpoints/')
# Choose by modification time: sorting the names would place 'epoch10' before 'epoch9'.
latest_ckpt = max(ckpt_paths, key=os.path.getmtime)
latest_ckpt

'./lightning_logs/version_0/checkpoints/epoch3-val_acc0.9263.ckpt'

In [None]:
# Rebuild a Model from the selected checkpoint file; this rebinds `model` to the loaded instance.
model = Model.load_from_checkpoint(latest_ckpt)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.weight', 'classifier.

In [None]:
import numpy as np

raw = pd.read_csv('DB_labeling.csv')

# Inference mode: disable dropout so that predictions are deterministic.
model.eval()
device = next(model.parameters()).device

logits_per_row = []
preds_per_row = []
# No gradients are needed for labeling; skipping autograd saves memory and time.
with torch.no_grad():
    for idx, sent in enumerate(raw.message):
        # Apply the same cleaning as in training; str() guards against non-string cells (e.g. NaN).
        inputs = model.tokenizer(
            model.clean(str(sent)),
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        )
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        result = model(**inputs).logits

        logits_per_row.append(result.tolist()[0])
        preds_per_row.append(result.argmax(dim=1).item())

        # Report progress periodically instead of once per row.
        if idx % 1000 == 0:
            print(idx, '번째 ...')

# Assign each column once; element-wise chained assignment (`raw.col[idx] = ...`)
# raises SettingWithCopyWarning and may write to a temporary copy.
raw['label_logits'] = pd.Series(logits_per_row, index=raw.index, dtype=object)
raw['label_KcELECTRA'] = pd.Series(preds_per_row, index=raw.index, dtype='int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
135494 번째 ...
135495 번째 ...
135496 번째 ...
135497 번째 ...
135498 번째 ...
135499 번째 ...
135500 번째 ...
135501 번째 ...
135502 번째 ...
135503 번째 ...
135504 번째 ...
135505 번째 ...
135506 번째 ...
135507 번째 ...
135508 번째 ...
135509 번째 ...
135510 번째 ...
135511 번째 ...
135512 번째 ...
135513 번째 ...
135514 번째 ...
135515 번째 ...
135516 번째 ...
135517 번째 ...
135518 번째 ...
135519 번째 ...
135520 번째 ...
135521 번째 ...
135522 번째 ...
135523 번째 ...
135524 번째 ...
135525 번째 ...
135526 번째 ...
135527 번째 ...
135528 번째 ...
135529 번째 ...
135530 번째 ...
135531 번째 ...
135532 번째 ...
135533 번째 ...
135534 번째 ...
135535 번째 ...
135536 번째 ...
135537 번째 ...
135538 번째 ...
135539 번째 ...
135540 번째 ...
135541 번째 ...
135542 번째 ...
135543 번째 ...
135544 번째 ...
135545 번째 ...
135546 번째 ...
135547 번째 ...
135548 번째 ...
135549 번째 ...
135550 번째 ...
135551 번째 ...
135552 번째 ...
135553 번째 ...
135554 번째 ...
135555 번째 ...
135556 번째 ...
135557 번째 ...
135558 번째 ...
135559 번째 ...
135560 번째 ...
135561 번째 ..

In [None]:
# Build the export frame as one chain so nothing is mutated in place on a frame
# derived from `raw` (the source of SettingWithCopyWarning).
# NOTE(review): the five empty column names look like placeholders — confirm the real names.
labeling2 = (
    raw[['', '', '', '', '', 'label', 'label_logits', 'label_KcELECTRA']]
    .rename(columns={'label': 'label_lexicon'})
    .assign(label_KcELECTRA=lambda frame: frame['label_KcELECTRA'].astype(int))
)
# utf-8-sig writes a BOM at the start of the file.
labeling2.to_csv('DB_labeling_final.csv', encoding='utf-8-sig', index=False)
labeling2

In [None]:
from google.colab import files

# Hand the exported CSV to Colab's `files.download` helper.
files.download('DB_labeling_final.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>