## Import & Setting

In [1]:
# T4보다 P100이 더 빠릅니다.
!nvidia-smi

Sun Apr 10 14:10:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
# Mount Google Drive under /content/drive; the project directory this notebook changes into lives there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd /content/drive/MyDrive/industry_classification

/content/drive/MyDrive/industry_classification


In [4]:
!pip install transformers -qq
!pip install datasets -qq
!pip install wandb -qq
!pip install scikit-learn -qq

In [5]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm, tqdm_notebook
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import TrainingArguments, Trainer

In [6]:
from logger import get_logger
from preprocess import Preprocess
from model import Model
from loss import FocalLoss
from dataset import IndustryDataset
from label_encoder import get_label_encoder

In [7]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds PyTorch (CPU, the current CUDA device and all CUDA devices),
    NumPy's global RNG and Python's ``random`` module with the same value,
    and sets cuDNN to deterministic mode with benchmark mode turned off.

    Args:
        seed: Seed value handed unchanged to each library.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Seed all RNGs covered by seed_everything with a fixed value.
seed_everything(42)

# Use the first CUDA GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [8]:
# Root logger setup: records at INFO level and above are written to run.log
# in the "<time> - <logger name> | <level> - <message>" layout defined by FORMAT.
import logging
FORMAT = '%(asctime)s - %(name)s | %(levelname)s - %(message)s'
logging.basicConfig(filename="run.log", format=FORMAT, level=logging.INFO)

## Prepare Data

In [None]:
# Load the training data; the file is pipe-delimited text encoded as CP949.
train = pd.read_csv('data/1. 실습용자료.txt', sep='|', encoding='cp949')

In [None]:
# Apply the project's train-time preprocessing (Preprocess.train_preprocess) to the loaded frame.
# NOTE(review): `train` is rebound to the result, so re-running this cell feeds the
# already-preprocessed frame back in — confirm train_preprocess tolerates that.
preprocesser = Preprocess()
train = preprocesser.train_preprocess(train)

2022-04-07 13:26:26,303 - preprocess | INFO - Success train data Preprocessing
2022-04-07 13:26:26,303 - preprocess | INFO - Success train data Preprocessing
2022-04-07 13:26:26,303 - preprocess | INFO - Success train data Preprocessing


In [None]:
# Hold out 20% of the rows for evaluation, stratified on the "digit_1" column
# with a fixed random_state, and renumber both partitions from 0.
train_dataset, eval_dataset = (
    part.reset_index(drop=True)
    for part in train_test_split(train, test_size=0.2, stratify=train["digit_1"], random_state=42)
)

## Label Encoding

In [None]:
# Build the project's label encoder and encode the "label" column of `train`.
# The trailing expression displays the encoded array as the cell output.
label_encoder = get_label_encoder()
train_encoded = label_encoder.transform(train["label"])
train_encoded

array([224, 122, 119, ..., 124, 208, 145])

## Load Pretrained Model, Tokenizer

In [None]:
# Note: with "monologg/kobert" and "monologg/kodistilbert", an error was observed
# while saving the vocab after evaluation.
# Model is the project wrapper that supplies both the model and its tokenizer for model_name.
model_name = "klue/roberta-small"
model_info = Model(model_name)
model = model_info.get_model()
tokenizer = model_info.get_tokenizer()

Downloading:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

2022-04-07 13:25:13,849 - model | INFO - Success Loading tokenizer: klue/roberta-small


## Dataset

In [None]:
 # Replace the "label" column of each split with the label encoder's output.
 # NOTE(review): the column is overwritten in place, so running this cell a second time
 # passes already-encoded values to the encoder — confirm before re-running.
 train_dataset["label"] = label_encoder.transform(train_dataset["label"])
 eval_dataset["label"] = label_encoder.transform(eval_dataset["label"])

In [None]:
# Wrap each split, together with the tokenizer, in the project's IndustryDataset.
# The names train_dataset / eval_dataset are rebound to the dataset objects from here on.
train_dataset = IndustryDataset(train_dataset, tokenizer)
eval_dataset = IndustryDataset(eval_dataset, tokenizer)

## Train

In [None]:
# https://huggingface.co/course/chapter3/3?fw=pt
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
# 2개 이상 metric 정상 적용 X

from datasets import load_metric
from sklearn.metrics import f1_score, accuracy_score

# acuracy_metric = load_metric('accuracy')
# f1_metric = load_metric('f1')


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores with the
            class axis last) and ``label_ids`` (true class ids).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)

    # Class ids are the positions along the last axis of the logits.
    # The earlier range(len(labels)) enumerated *samples* instead of classes,
    # so the macro average included one zero-score phantom class per extra
    # sample and the reported F1 was heavily deflated.
    label_indices = list(range(np.shape(logits)[-1]))

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro", labels=label_indices)
    return {"accuracy": accuracy, "f1": f1}

In [None]:
class CustomTrainer(Trainer):
  """Trainer variant that computes the training loss with the project's FocalLoss."""

  def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
    """Return the focal loss for one batch.

    Args:
      model: Model called as ``model(**inputs)`` once the labels are removed.
      inputs: Mapping of batch inputs; the ``"labels"`` entry, if present,
        is popped and used as the loss target.
      return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
      **kwargs: Accepted and ignored, so Trainer versions that pass extra
        keyword arguments (e.g. ``num_items_in_batch``) still work.

    Returns:
      The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
    """
    # Default to None: a plain pop("labels") raised KeyError on batches
    # without labels, which made the fallback branch below unreachable.
    labels = inputs.pop("labels", None)
    outputs = model(**inputs)

    if labels is not None:
      custom_loss = FocalLoss()
      loss = custom_loss(outputs.get('logits'), labels).mean()
    else:
      # No labels supplied: fall back to whatever loss the model itself returned.
      loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

    return (loss, outputs) if return_outputs else loss

In [None]:
# Release cached, currently unused GPU memory before the trainer is built.
torch.cuda.empty_cache()

In [None]:
import wandb
wandb.login()

# One-epoch run that evaluates and checkpoints at the end of every epoch,
# logs every 100 steps and reports to Weights & Biases under the run name "test".
training_args = TrainingArguments(output_dir="./test",
                                  num_train_epochs=1,
                                  learning_rate=5e-5,
                                  save_total_limit=3,
                                  per_device_train_batch_size=128,
                                  per_device_eval_batch_size=128,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  logging_first_step=True,
                                  logging_dir="./",
                                  logging_steps=100,
                                  seed=42,
                                  weight_decay=0.01,
                                  load_best_model_at_end=True,
                                  report_to="wandb",
                                  run_name="test")

# The tokenized splits are bound to train_dataset / eval_dataset in the
# "Dataset" section. The previously referenced train_set / val_set are never
# defined in this notebook and raised NameError on a fresh kernel.
trainer = CustomTrainer(model=model,
                        args=training_args,
                        train_dataset=train_dataset,
                        eval_dataset=eval_dataset,
                        tokenizer=tokenizer,
                        compute_metrics=compute_metrics)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Run training, then log and persist the training metrics through the Trainer helpers.
train_result = trainer.train() 
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# Save the model with save_pretrained to a fixed directory and record completion in the root logger.
model.save_pretrained('./test/result/best_model')
logging.info("Success model trained")

***** Running training *****
  Num examples = 800000
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 6250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mjdg4661[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

## Inference or Ensemble

### Ensemble

In [9]:
# Base name (without ".csv") of the submission file written later on.
submission_name = "roberta-base_ensemble"

# Only needed for ensemble inference: the checkpoints to combine,
# one per run directory roberta-base_0 ... roberta-base_4.
ensemble_entry = [f'./roberta-base_{run_idx}/checkpoint-25000' for run_idx in range(5)]

In [10]:
from ensemble import Ensemble

# Load the data to predict on (pipe-delimited, CP949) and the answer-sheet file (CP949 CSV).
# NOTE(review): the displayed submission carries an "Unnamed: 0" column, which suggests the
# answer-sheet file has an unnamed first column — confirm the required submission format expects it.
test = pd.read_csv('data/2. 모델개발용자료.txt', sep='|', encoding='cp949')
submission = pd.read_csv("data/답안 작성용 파일.csv", encoding='cp949')

# Hand the test data, the submission frame and the checkpoint list to the project's Ensemble helper.
ensemble_model = Ensemble(test, submission, ensemble_entry)

In [11]:
# Run soft-ensemble inference over the configured checkpoints; `submission` is rebound to the result.
submission = ensemble_model.soft_ensemble()

2022-04-10 14:11:18,189 - model | INFO - Success Loading Model: ./roberta-base_0/checkpoint-25000
2022-04-10 14:11:18,218 - model | INFO - Success Loading tokenizer: ./roberta-base_0/checkpoint-25000
2022-04-10 14:11:24,074 - preprocess | INFO - Success test data Preprocessing
***** Running Prediction *****
  Num examples = 100000
  Batch size = 128


loading configuration file ./roberta-base_1/checkpoint-25000/config.json
Model config RobertaConfig {
  "_name_or_path": "./roberta-base_1/checkpoint-25000",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25":

loading configuration file ./roberta-base_2/checkpoint-25000/config.json
Model config RobertaConfig {
  "_name_or_path": "./roberta-base_2/checkpoint-25000",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25":

loading configuration file ./roberta-base_3/checkpoint-25000/config.json
Model config RobertaConfig {
  "_name_or_path": "./roberta-base_3/checkpoint-25000",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25":

loading configuration file ./roberta-base_4/checkpoint-25000/config.json
Model config RobertaConfig {
  "_name_or_path": "./roberta-base_4/checkpoint-25000",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25":

2022-04-10 14:49:32,762 - inference | INFO - Success decoding
2022-04-10 14:49:32,764 - inference | INFO - Success ensemble and inference


In [12]:
# Write the ensemble predictions to "<submission_name>.csv" without the DataFrame index.
submission.to_csv(f"{submission_name}.csv", index=False)

In [13]:
submission

Unnamed: 0,AI_id,digit_1,digit_2,digit_3,text_obj,text_mthd,text_deal
0,id_000001,I,56,561,치킨전문점에서,고객의주문에의해,치킨판매
1,id_000002,G,46,466,산업공구,다른 소매업자에게,철물 수공구
2,id_000003,S,94,949,절에서,신도을 대상으로,불교단체운영
3,id_000004,C,30,302,영업장에서,고객요구로,자동차튜닝
4,id_000005,I,56,562,실내포장마차에서,접객시설을 갖추고,"소주,맥주제공"
...,...,...,...,...,...,...,...
99995,id_099996,G,47,472,사업장에서,일반인대상으로,버섯농장
99996,id_099997,Q,86,862,한의원에서,외래환자위주고,치료
99997,id_099998,G,47,478,일반점포에서,소비자에게,그림판매
99998,id_099999,R,90,902,사업장에서,일반인.학생대상으로,학습공간제공


In [21]:
# Count the rows whose "digit_3" prediction differs from the one stored in koelectra-base.csv.
compare = pd.read_csv("koelectra-base.csv")

# Compare every row positionally instead of looping over a hard-coded
# range(100000): the fixed range fails when a frame is shorter and silently
# ignores rows when it is longer.
assert len(compare) == len(submission), "both submissions must have the same number of rows"
count = int((compare["digit_3"].to_numpy() != submission["digit_3"].to_numpy()).sum())
count

6031

### Inference

In [None]:
from inference import Inference

test = pd.read_csv('data/2. 모델개발용자료.txt', sep='|', encoding='cp949')
submission = pd.read_csv("data/답안 작성용 파일.csv", encoding='cp949')

# Location of the checkpoint to run inference with. The cell used to pass
# `inference_model` itself here, which is undefined on a fresh kernel and is
# an Inference object (not a path) on any re-run. The value below is the
# checkpoint shown in this notebook's recorded run log — change it as needed.
inference_model_path = "./roberta-small/checkpoint-30000"

# Inference(test data, location of the model to run, submission DataFrame)
inference_model = Inference(test, inference_model_path, submission)

In [None]:
# If an UnboundLocalError is raised, run this cell one more time.
result = inference_model.inference()

2022-04-02 01:33:36,414 - model | INFO - Success Loading Model: ./roberta-small/checkpoint-30000
2022-04-02 01:33:36,414 - model | INFO - Success Loading Model: ./roberta-small/checkpoint-30000
2022-04-02 01:33:37,840 - model | INFO - Success Loading tokenizer: ./roberta-small/checkpoint-30000
2022-04-02 01:33:37,840 - model | INFO - Success Loading tokenizer: ./roberta-small/checkpoint-30000
2022-04-02 01:33:50,649 - preprocess | INFO - Success test data Preprocessing
***** Running Prediction *****
  Num examples = 100000
  Batch size = 8


2022-04-02 01:39:53,017 - inference | INFO - Success decoding
2022-04-02 01:39:53,019 - inference | INFO - Success inference: ./roberta-small/checkpoint-30000


In [None]:
# Write the single-model predictions to "<submission_name>.csv" without the DataFrame index.
# NOTE(review): submission_name is only assigned in the Ensemble section, where the ensemble
# output is written to the same file name — set a distinct name before running this cell.
result.to_csv(f"{submission_name}.csv", index=False)