### Import & Setting

In [1]:
# Show which GPU this session was given; a P100 is faster than a T4.
!nvidia-smi

Thu Apr  7 13:58:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/industry_classification

/content/drive/MyDrive/industry_classification


In [4]:
# Install the third-party packages used by this notebook (-qq keeps pip quiet).
# NOTE(review): no versions are pinned; the model-config log printed further down in this
# notebook reports transformers 4.18.0 - consider pinning to keep runs reproducible.
!pip install transformers -qq
!pip install datasets -qq
!pip install wandb -qq
!pip install scikit-learn -qq

[K     |████████████████████████████████| 4.0 MB 4.1 MB/s 
[K     |████████████████████████████████| 77 kB 7.4 MB/s 
[K     |████████████████████████████████| 895 kB 60.7 MB/s 
[K     |████████████████████████████████| 596 kB 85.0 MB/s 
[K     |████████████████████████████████| 6.5 MB 64.7 MB/s 
[K     |████████████████████████████████| 325 kB 4.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 72.1 MB/s 
[K     |████████████████████████████████| 136 kB 80.0 MB/s 
[K     |████████████████████████████████| 212 kB 86.2 MB/s 
[K     |████████████████████████████████| 127 kB 56.1 MB/s 
[K     |████████████████████████████████| 271 kB 79.3 MB/s 
[K     |████████████████████████████████| 94 kB 126 kB/s 
[K     |████████████████████████████████| 144 kB 75.2 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires foliu

In [25]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm, tqdm_notebook
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import TrainingArguments, Trainer

In [26]:
from logger import get_logger
from preprocess import Preprocess
from model import Model
from loss import FocalLoss
from dataset import IndustryDataset
from label_encoder import get_label_encoder

In [27]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, the current CUDA device and all CUDA devices), and switches cuDNN
    to deterministic, non-benchmarking mode.

    Args:
        seed: Seed value handed to every generator.
    """
    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, every CUDA device (multi-GPU).
    for torch_seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        torch_seed_fn(seed)

    # Give up cuDNN autotuning in exchange for deterministic kernels.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.benchmark = False
    cudnn_backend.deterministic = True
# Fix all RNG seeds before anything stochastic runs.
seed_everything(42)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda:0


In [28]:
'''
#############
Set the fold number to train here! (0~4)
#############
'''
fold_num = 0

# Fail fast on a bad value: the notebook builds 5 folds, so anything outside 0..4 would
# otherwise only surface later as an IndexError/TypeError when the fold is selected.
if not (isinstance(fold_num, int) and 0 <= fold_num <= 4):
    raise ValueError(f"fold_num must be an integer between 0 and 4, got {fold_num!r}")

In [29]:
# root logger setting
import logging
FORMAT = '%(asctime)s - %(name)s | %(levelname)s - %(message)s'

# logging.basicConfig() silently does nothing when the root logger already has handlers
# (for example when this cell was already run with another fold_num, or when the hosting
# environment installed a handler of its own). Detach the existing root handlers first so
# the per-fold log file configured below is always the one that is used.
_root_logger = logging.getLogger()
for _handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_handler)
    _handler.close()

logging.basicConfig(filename=f"roberta-base_{fold_num}.log", format=FORMAT, level=logging.INFO)

### Prepare Data

In [30]:
# Read the raw training file: '|'-separated text, CP949-encoded.
train = pd.read_csv('data/1. 실습용자료.txt', sep='|', encoding='cp949')

In [31]:
# Run the project's training-data preprocessing on the loaded frame.
# NOTE(review): `train` is overwritten with the preprocessed result, so re-running this
# cell feeds already-preprocessed data back in - re-run the load cell first.
preprocesser = Preprocess()
train = preprocesser.train_preprocess(train)

2022-04-07 14:08:35,777 - preprocess | INFO - Success train data Preprocessing
2022-04-07 14:08:35,777 - preprocess | INFO - Success train data Preprocessing


In [32]:
# 80/20 hold-out split stratified on the `digit_1` column; each part gets a fresh 0..n-1 index.
train_dataset, eval_dataset = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train,
        test_size=0.2,
        stratify=train["digit_1"],
        random_state=42,
    )
)

### Label Encoding

In [33]:
# Build the project's label encoder and encode the `label` column of the training frame.
label_encoder = get_label_encoder()
train_encoded = label_encoder.transform(train["label"])
# Bare last expression: shows the encoded array as the cell output.
train_encoded

array([224, 122, 119, ..., 124, 208, 145])

### Load Pretrained Model, Tokenizer

In [34]:
# Obtain the model and its tokenizer for the chosen checkpoint name through the
# project's `Model` wrapper.
model_name = "klue/roberta-base"
model_info = Model(model_name)
model = model_info.get_model()
tokenizer = model_info.get_tokenizer()

loading configuration file https://huggingface.co/klue/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a96469ca2a238496d435a0e9e202f261119c146a0326444b6d68ae1adc35e04f.85b0b02ba2a483f3adb8a60ab70dbd875768fcd5e6cdb21a593c6e02fdffac3a
Model config RobertaConfig {
  "_name_or_path": "klue/roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,


### Train

In [35]:
from sklearn.model_selection import KFold, StratifiedKFold
from torch.utils.data.dataset import Subset

# Stratified 5-fold splitter; without shuffling the folds are the same on every run.
kfd = StratifiedKFold(n_splits=5, shuffle=False)

# Reuse the encoding already computed in the label-encoding cell (`train_encoded`) instead of
# calling `label_encoder.transform(train["label"])` a second time: once the column has been
# replaced, transforming it again would feed encoded integers back into the encoder, so the
# previous form of this cell could not be re-run safely.
train["label"] = train_encoded
train_data = IndustryDataset(train, tokenizer)

In [36]:
# Create the folds, stratified on digit_1.
# The loop variables deliberately do NOT reuse the name `train`: the previous
# `for (train, val) in ...` rebound `train` from the DataFrame to an index array,
# which broke re-running this cell and any later use of the `train` frame.
train_idx = []
val_idx = []
for fold_train_idx, fold_val_idx in kfd.split(train["text"], train["digit_1"]):
  train_idx.append(fold_train_idx)
  val_idx.append(fold_val_idx)

In [37]:
# Wrap the selected fold's train / validation index arrays as Subsets of the full dataset.
train_set, val_set = (
    Subset(train_data, fold_indices[fold_num])
    for fold_indices in (train_idx, val_idx)
)

In [38]:
# https://huggingface.co/course/chapter3/3?fw=pt
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
# Note: applying two or more metrics at once did not work properly.

from datasets import load_metric
from sklearn.metrics import f1_score, accuracy_score

# acuracy_metric = load_metric('accuracy')
# f1_metric = load_metric('f1')


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (scores whose last axis is
            arg-maxed to obtain the predicted class id) and `label_ids`
            (the reference class ids).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)

    # Macro-F1 averages over classes, so enumerate one index per score column.
    # The previous code used range(len(labels)), i.e. one "class" per evaluation
    # sample; every index that never occurs contributes an F1 of 0 to the macro
    # average, which pushed the reported score towards zero.
    label_indices = list(range(np.shape(logits)[-1]))

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro", labels=label_indices)
    return {"accuracy": accuracy, "f1": f1}

In [39]:
class CustomTrainer(Trainer):
  """`Trainer` subclass that trains with the project's `FocalLoss`."""

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # Build the criterion once instead of on every training / evaluation step.
    self.focal_loss = FocalLoss()

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Return the focal loss of `model` on one batch.

    Args:
      model: Model called as `model(**inputs)` after the labels are removed.
      inputs: Batch mapping; its "labels" entry is popped and used as the target.
      return_outputs: When true, return `(loss, outputs)` instead of just `loss`.
      num_items_in_batch: Accepted and ignored. Newer `transformers` releases pass
        this keyword to `compute_loss`; without the parameter training fails with
        a TypeError on those versions.

    Returns:
      The loss, or `(loss, outputs)` when `return_outputs` is true.
    """
    # Raises KeyError when the batch has no "labels" entry, so the fallback branch
    # below only runs for a batch whose "labels" value is None.
    labels = inputs.pop("labels")
    outputs = model(**inputs)

    if labels is not None:
      loss = self.focal_loss(outputs.get('logits'), labels)
      # Reduce to a scalar (presumably FocalLoss may return per-sample values - TODO confirm).
      loss = loss.mean()
    else:
      loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

    return (loss, outputs) if return_outputs else loss

In [40]:
# Ask PyTorch to release its cached, currently unused GPU memory before the trainer is built.
torch.cuda.empty_cache()

In [41]:
import wandb
wandb.login()

# Training configuration for the selected fold; the output directory and the W&B run name
# both carry the fold number.
training_args = TrainingArguments(
    # identity / reporting
    output_dir=f"./roberta-base_{fold_num}",
    run_name=f"./roberta-base_{fold_num}",
    report_to="wandb",
    seed=42,
    # optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # evaluate and checkpoint once per epoch, keep at most 3 checkpoints,
    # and reload the best one when training ends
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True,
    # logging
    logging_dir="./",
    logging_first_step=True,
    logging_steps=100,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices


In [42]:
# Ask PyTorch to release its cached, currently unused GPU memory right before training starts.
torch.cuda.empty_cache()

In [None]:
# Run training, then log and save the metrics reported by the training run.
train_result = trainer.train() 
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# Save `model` under this fold's result directory and record completion via the root logger.
model.save_pretrained(f"./roberta-base_{fold_num}/result/best_model")
logging.info("Success model trained")

***** Running training *****
  Num examples = 800000
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 31250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
