### Import & Setting

In [1]:
# Show which GPU this session was assigned. P100 is faster than T4.
!nvidia-smi

Sat Apr  2 10:58:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Mount Google Drive at /content/drive (the import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Switch to the project folder on the mounted Drive; the relative paths used
# later in this notebook (data/..., ./test, ./roberta-small/...) resolve from here.
cd /content/drive/MyDrive/industry_classification

/content/drive/MyDrive/industry_classification


In [5]:
# Install the third-party dependencies quietly (-qq).
# NOTE(review): no versions are pinned, so a fresh run may resolve to different
# releases; the recorded run reports transformers 4.17.0 in its model config output.
!pip install transformers -qq
!pip install datasets -qq
!pip install wandb -qq
!pip install scikit-learn -qq

[K     |████████████████████████████████| 3.8 MB 7.2 MB/s 
[K     |████████████████████████████████| 67 kB 6.2 MB/s 
[K     |████████████████████████████████| 895 kB 67.6 MB/s 
[K     |████████████████████████████████| 596 kB 64.8 MB/s 
[K     |████████████████████████████████| 6.5 MB 55.7 MB/s 
[K     |████████████████████████████████| 325 kB 7.8 MB/s 
[K     |████████████████████████████████| 212 kB 67.3 MB/s 
[K     |████████████████████████████████| 136 kB 76.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 69.0 MB/s 
[K     |████████████████████████████████| 127 kB 86.5 MB/s 
[K     |████████████████████████████████| 94 kB 4.0 MB/s 
[K     |████████████████████████████████| 144 kB 68.7 MB/s 
[K     |████████████████████████████████| 271 kB 78.9 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires foliu

In [6]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm, tqdm_notebook
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import TrainingArguments, Trainer

In [7]:
from logger import get_logger
from preprocess import Preprocess
from model import Model
from loss import FocalLoss
from dataset import IndustryDataset
from label_encoder import get_label_encoder

In [8]:
def seed_everything(seed) :
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, the current CUDA device and all CUDA devices), then switches cuDNN
    to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed handed unchanged to each library.

    Returns:
        None.
    """
    # Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then the CUDA ones.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU

    # Deterministic cuDNN kernels, no autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
# Fix all RNG seeds before any stochastic step runs.
seed_everything(42)

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [9]:
# root logger setting
# Configure the root logger: INFO level, records laid out per FORMAT
# (timestamp - logger name | level - message) and written to run.log.
import logging
FORMAT = '%(asctime)s - %(name)s | %(levelname)s - %(message)s'
logging.basicConfig(filename="run.log", format=FORMAT, level=logging.INFO)

### Prepare Data

In [10]:
# Load the training data: a pipe-delimited text file encoded as cp949.
train = pd.read_csv('data/1. 실습용자료.txt', sep='|', encoding='cp949')

In [11]:
# Run the project's training-data preprocessing; the result replaces the raw
# frame bound to `train`.
# NOTE(review): because `train` is rebound, re-running this cell feeds the
# already-preprocessed frame back into train_preprocess — confirm that is safe.
preprocesser = Preprocess()
train = preprocesser.train_preprocess(train)

2022-04-02 10:59:28,326 - preprocess | INFO - Success train data Preprocessing


In [12]:
# Hold out 20% of the rows for evaluation, stratified on the "digit_1" column,
# with a fixed random_state so the split is reproducible.
train_dataset, eval_dataset = train_test_split(train, test_size=0.2, stratify=train["digit_1"], random_state=42)
# Renumber both splits from 0, discarding the original row index.
train_dataset = train_dataset.reset_index(drop=True)
eval_dataset = eval_dataset.reset_index(drop=True)

### Label Encoding

In [13]:
# Fetch the project's label encoder and encode the "label" column of the full
# training frame. The encoded array is only displayed here; it is not
# referenced again in the visible cells.
label_encoder = get_label_encoder()
train_encoded = label_encoder.transform(train["label"])
train_encoded

array([224, 122, 119, ..., 124, 208, 145])

### Load Pretrained Model, Tokenizer

In [43]:
# "monologg/kobert", "monologg/kodistilbert": an error was observed while saving the vocab after evaluation
model_name = "klue/roberta-small"
# `Model` is the project wrapper imported from model.py; it hands back both the
# model and the matching tokenizer for `model_name`.
model_info = Model(model_name)
model = model_info.get_model()
tokenizer = model_info.get_tokenizer()

loading configuration file https://huggingface.co/klue/roberta-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6aa0817a5b48e36ec821c3d9cb8267f2dda41a32b317311688f06c742b6b2a1d.7f009d6b27d54554dce063a5fc8273742e9a198f6f627aada4bc5ea2dbbc0313
Model config RobertaConfig {
  "_name_or_path": "klue/roberta-small",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.17.0",
  "type_vocab_size": 1,
  "use_cache": true,

### Dataset

In [15]:
 # Overwrite the "label" column of each split with its encoded value.
 # NOTE(review): this mutates the frames in place, so the cell is not idempotent —
 # running it a second time passes already-encoded values to label_encoder.transform.
 train_dataset["label"] = label_encoder.transform(train_dataset["label"])
 eval_dataset["label"] = label_encoder.transform(eval_dataset["label"])

In [16]:
# Wrap each split, together with the tokenizer, in the project's IndustryDataset.
# The DataFrame names are rebound to the dataset objects from here on.
train_dataset = IndustryDataset(train_dataset, tokenizer)
eval_dataset = IndustryDataset(eval_dataset, tokenizer)

### Train

In [44]:
# https://huggingface.co/course/chapter3/3?fw=pt
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
# Applying two or more metrics at once did not work correctly.

from datasets import load_metric
from sklearn.metrics import f1_score, accuracy_score

# acuracy_metric = load_metric('accuracy')
# f1_metric = load_metric('f1')


def compute_metrics(eval_pred):
    """Compute accuracy and micro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (class logits whose last axis
            indexes the classes, or a tuple whose first element is those
            logits) and ``label_ids`` (the integer class id of every sample).

    Returns:
        dict: ``{"accuracy": <float>, "f1": <float>}``.
    """
    logits = eval_pred.predictions
    # Some models return extra arrays next to the logits; the logits are assumed
    # to come first in that case — TODO confirm for any model other than the
    # sequence-classification head used in this notebook.
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)

    # The candidate classes are given by the width of the logits, NOT by the
    # number of evaluated samples: using len(labels) would drop high class ids
    # on a small evaluation set and pass a huge bogus label list on a large one.
    num_classes = np.shape(logits)[-1]
    label_indices = list(range(num_classes))

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="micro", labels=label_indices)
    return {"accuracy": accuracy, "f1": f1}

In [45]:
class CustomTrainer(Trainer):
    """``Trainer`` that scores batches with the project's ``FocalLoss``.

    Construction is inherited unchanged from ``Trainer``; only the loss
    computation is overridden.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the focal loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Dict of model inputs. The "labels" entry, when present, is
                removed from the dict before the forward pass so the model does
                not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases that pass this keyword; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # A default of None keeps the label-free fallback below reachable
        # instead of failing with KeyError when the batch carries no labels.
        labels = inputs.pop("labels", None)
        outputs = model(**inputs)

        if labels is not None:
            # Without labels in the forward call the first positional output is
            # assumed to be the logits when the model returns a plain tuple.
            logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]
            custom_loss = FocalLoss()
            loss = custom_loss(logits, labels).mean()
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss


In [46]:
# Release cached, currently unused GPU memory held by PyTorch before the trainer is built.
torch.cuda.empty_cache()

In [47]:
import wandb
wandb.login()

# Hyper-parameters for a single-epoch run; arguments are grouped by purpose.
training_args = TrainingArguments(
    # Run identity and experiment tracking
    output_dir="./test",
    run_name="test",
    report_to="wandb",
    seed=42,
    # Optimisation
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Evaluate and checkpoint once per epoch, keeping at most three checkpoints
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True,
    # Logging
    logging_dir="./",
    logging_first_step=True,
    logging_steps=100,
)

# Trainer using the focal-loss override and the accuracy / micro-F1 metrics.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices


In [48]:
# Release cached, currently unused GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [25]:
# Run training, then log and persist the metrics returned by the run.
train_result = trainer.train() 
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# Save the model separately from the Trainer's own checkpoints under ./test.
# NOTE(review): only `model` is saved here; the tokenizer is not written to this directory.
model.save_pretrained('./roberta-small/result/best_model')
logging.info("Success model trained")

***** Running training *****
  Num examples = 800000
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 6250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2278,0.211098,0.91495,0.91495


***** Running Evaluation *****
  Num examples = 200000
  Batch size = 128
Saving model checkpoint to ./test/checkpoint-6250
Configuration saved in ./test/checkpoint-6250/config.json
Model weights saved in ./test/checkpoint-6250/pytorch_model.bin
tokenizer config file saved in ./test/checkpoint-6250/tokenizer_config.json
Special tokens file saved in ./test/checkpoint-6250/special_tokens_map.json
Deleting older checkpoint [test/checkpoint-3] due to args.save_total_limit
Deleting older checkpoint [test/checkpoint-6] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./test/checkpoint-6250 (score: 0.21109837293624878).
Configuration saved in ./roberta-small/result/best_model/config.json
Model weights saved in ./roberta-small/result/best_model/pytorch_model.bin


### Inference

In [None]:
# Base name of the submission CSV and the checkpoint directory to load for inference.
# NOTE(review): this path matches neither output written by the training cells
# (checkpoints under "./test", model under './roberta-small/result/best_model') —
# confirm the checkpoint exists before running.
submission_name = "roberta-small_test"
inference_model = './roberta-small/checkpoint-30000'

In [None]:
from inference import Inference

# Load the data to predict on and the answer-sheet template (both cp949-encoded).
test = pd.read_csv('data/2. 모델개발용자료.txt', sep='|', encoding='cp949')
submission = pd.read_csv("data/답안 작성용 파일.csv", encoding='cp949')

# Inference(test, path to the inference model, submission DataFrame)
# NOTE(review): `inference_model` is rebound from the path string to the
# Inference object, so re-running this cell passes the object where a path is expected.
inference_model = Inference(test, inference_model, submission)

In [None]:
# If an UnboundLocalError occurs, please run this cell once more
result = inference_model.inference()

2022-04-02 01:33:36,414 - model | INFO - Success Loading Model: ./roberta-small/checkpoint-30000
2022-04-02 01:33:36,414 - model | INFO - Success Loading Model: ./roberta-small/checkpoint-30000
2022-04-02 01:33:37,840 - model | INFO - Success Loading tokenizer: ./roberta-small/checkpoint-30000
2022-04-02 01:33:37,840 - model | INFO - Success Loading tokenizer: ./roberta-small/checkpoint-30000
2022-04-02 01:33:50,649 - preprocess | INFO - Success test data Preprocessing
***** Running Prediction *****
  Num examples = 100000
  Batch size = 8


2022-04-02 01:39:53,017 - inference | INFO - Success decoding
2022-04-02 01:39:53,019 - inference | INFO - Success inference: ./roberta-small/checkpoint-30000


In [None]:
# Write the predictions to "<submission_name>.csv" without the index column.
result.to_csv(f"{submission_name}.csv", index=False)