<a href="https://colab.research.google.com/github/sw6820/kostat/blob/main/model/one_to_one_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/industry_classification

/content/drive/MyDrive/industry_classification


In [None]:
!pip install transformers -qq
!pip install datasets -qq
!pip install wandb -qq
!pip install scikit-learn -qq

[K     |████████████████████████████████| 3.8 MB 14.5 MB/s 
[K     |████████████████████████████████| 596 kB 68.1 MB/s 
[K     |████████████████████████████████| 6.5 MB 50.9 MB/s 
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
[K     |████████████████████████████████| 895 kB 63.4 MB/s 
[K     |████████████████████████████████| 325 kB 13.7 MB/s 
[K     |████████████████████████████████| 134 kB 80.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 69.3 MB/s 
[K     |████████████████████████████████| 212 kB 97.1 MB/s 
[K     |████████████████████████████████| 127 kB 73.6 MB/s 
[K     |████████████████████████████████| 94 kB 3.2 MB/s 
[K     |████████████████████████████████| 271 kB 74.6 MB/s 
[K     |████████████████████████████████| 144 kB 97.8 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires fol

In [None]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm, tqdm_notebook
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from transformers import TrainingArguments, Trainer

In [None]:
def seed_everything(seed) :
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed handed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for repeatable kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Fix every random source up front.
seed_everything(42)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


### Prepare Data

In [None]:
# Load the '|'-separated train and test files and the answer-sheet CSV; all three are read as cp949.
train = pd.read_csv('data/1. 실습용자료.txt', sep='|', encoding='cp949')
test = pd.read_csv('data/2. 모델개발용자료.txt', sep='|', encoding='cp949')
submission = pd.read_csv("data/답안 작성용 파일.csv", encoding='cp949')

In [None]:
# The digit columns are classification codes, so convert them from the numeric dtypes (int, float)
# to str, the dtype used here for categorical values.
# NOTE(review): astype(str) renders a missing value as the literal string 'nan'; confirm that is
# acceptable for any digit column that can be empty.

train.loc[:,'digit_2'] = train.loc[:,'digit_2'].astype(str)
train.loc[:,'digit_3'] = train.loc[:,'digit_3'].astype(str)
test.loc[:,'digit_1'] = test.loc[:,'digit_1'].astype(str)
test.loc[:,'digit_2'] = test.loc[:,'digit_2'].astype(str)
test.loc[:,'digit_3'] = test.loc[:,'digit_3'].astype(str)

In [None]:
# NaN cannot take part in string concatenation, so replace it with an empty string.

train = train.fillna('')
test = test.fillna('')

In [None]:
# Combine columns: digit_3 becomes the label, and the three text fields are
# concatenated (with no separator) into a single text column.

train.loc[:,"label"] = train.loc[:,"digit_3"]
train.loc[:,"text"] = train.loc[:,"text_obj"] + train.loc[:,"text_mthd"] + train.loc[:,"text_deal"]
test.loc[:,"text"] = test.loc[:,"text_obj"] + test.loc[:,"text_mthd"] + test.loc[:,"text_deal"]

In [None]:
# Keep only the id, the combined text and the label, then preview five random rows.
columns = ["AI_id", "text", "label"]
train = train.loc[:,columns]
train.sample(5)

Unnamed: 0,AI_id,text,label
769808,id_0769809,건설기계 대여,426
896283,id_0896284,"가게에서접객시설을갖추고소주,맥주",562
889979,id_0889980,개인택시로일반인을 대상으로승객운송서비스,492
341288,id_0341289,고객의 가정에서센터에서영유아및초등학생돌봄,872
666328,id_0666329,대학교학생들 대상으로전문적 학문 교육서비스,853


In [None]:
# Per-label counts of non-null values; the AI_id column is used to read the count per label.
train_count = train.groupby(by=["label"]).count()
train_count["AI_id"]

label
101      729
102      985
103     1177
104      769
105       64
       ...  
951     1508
952    12142
953     6911
961    43040
969    13170
Name: AI_id, Length: 225, dtype: int64

In [None]:
# Keep only Latin letters and Hangul syllables; every other character (digits, spaces, punctuation) is removed.
# The identical cleanup is applied to test so that inference sees text in the same form the model is trained on.
train['text'] = train['text'].str.replace("[^a-zA-Z가-힣]","", regex=True)
test['text'] = test['text'].str.replace("[^a-zA-Z가-힣]","", regex=True)

In [None]:
# Strip leading spaces from the train text.
# NOTE(review): the preceding cleanup removes every character outside a-z, A-Z and 가-힣 (spaces included),
# so this is expected to be a no-op when the cells run in order -- confirm before relying on it.
train['text'] = train['text'].str.replace('^ +', "", regex=True)

In [None]:
# text is the concatenation of text_obj, text_mthd and text_deal, so a row is usable if any one of them is present.
# There were no rows in the train dataset where text_obj, text_mthd and text_deal are all empty strings.

train[train["text"]==""]

Unnamed: 0,AI_id,text,label


In [None]:
# Number of training rows.
len(train)

1000000

In [None]:
# Number of distinct labels that occur in train.
len(train["label"].unique())

225

In [None]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the split does not depend on
# how much of the global NumPy random stream earlier cells (e.g. a DataFrame.sample preview) have consumed.
train_dataset, eval_dataset = train_test_split(train, test_size=0.2, shuffle=True, random_state=42)
# Renumber both frames 0..n-1 so row position and index label coincide.
train_dataset = train_dataset.reset_index(drop=True)
eval_dataset = eval_dataset.reset_index(drop=True)

In [None]:
# Character length of the longest train text.
len(max(train["text"], key=len))

69

In [None]:
# Character length of the longest test text.
len(max(test["text"], key=len))

82

### Label Encoding

In [None]:
# Load the class table and display it (the recorded output shows code and chunk_text columns).
ctd = pd.read_csv("data/class_table_chunk_kor.csv")
ctd

Unnamed: 0,code,chunk_text
0,11,"채소작물 재배업 채소, 화훼작물 및 종묘 재배업 과실작물 재배업 시설작물 재배업 화..."
1,12,양계업 기타 축산업 소 사육업 축산업 양돈업 육우 사육업 가금류 및 조류 사육업 농...
2,13,작물재배 및 축산 복합농업 농업
3,14,"작물재배 관련 서비스업 작물재배 및 축산 관련 서비스업 농업 농산물 건조, 선별 및..."
4,15,수렵 및 관련 서비스업 농업
...,...,...
227,969,세탁업 기타 개인 서비스업 가정용 세탁업 산업용 세탁업 개인 간병 및 유사 서비스업...
228,970,가구 내 고용활동
229,981,자가 소비를 위한 가사 생산 활동 달리 분류되지 않은 자가소비를 위한 가구의 재화 ...
230,982,달리 분류되지 않은 자가소비를 위한 가구의 재화 및 서비스 생산활동 자가 소비를 위...


In [None]:
# Store the class-table codes as strings.
ctd.loc[:,"code"] = ctd.loc[:,"code"].astype(str)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit on the class-table codes, so the integer ids cover every code listed in the table.
label_encoder = LabelEncoder()
label_encoder.fit(ctd["code"])
# Encode the train labels for inspection; train_encoded is not referenced again in the visible cells.
train_encoded = label_encoder.transform(train["label"])
train_encoded

array([224, 122, 119, ..., 124, 208, 145])

### Load Pretrained Model, Tokenizer

In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One output class per distinct code in the class table.
config = AutoConfig.from_pretrained(model_name, num_labels=len(ctd["code"].unique()))
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
model = model.to(device)

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

### Dataset

In [None]:
 # Replace the code strings in the label column with their integer class ids.
 # NOTE(review): the encoder was fitted on code strings, so rerunning this cell on already-encoded
 # labels is expected to fail -- confirm, and rerun the split cell first if needed.
 train_dataset["label"] = label_encoder.transform(train_dataset["label"])
 eval_dataset["label"] = label_encoder.transform(eval_dataset["label"])

In [None]:
class IndustryDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes one DataFrame row per access.

  Args:
    dataset: DataFrame with a "text" column and, when ``is_train`` is true,
      a "label" column holding integer class ids.
    is_train: When true, every item also carries a ``labels`` tensor.
    tokenizer: Callable used to encode a text. When omitted, the
      notebook-level ``tokenizer`` variable is looked up at item access time.
    max_length: Length every encoded sequence is padded or truncated to.
  """

  def __init__(self, dataset, is_train=True, tokenizer=None, max_length=100):
    self.dataset = dataset
    self.text = self.dataset["text"]
    self.is_train = is_train
    self.tokenizer = tokenizer
    self.max_length = max_length
    if is_train:
      self.labels = self.dataset["label"]

  def __getitem__(self, idx):
    """Return the encoded row at position ``idx`` (plus ``labels`` when training)."""
    # Resolve lazily so a tokenizer defined after construction is still picked up.
    encode = self.tokenizer if self.tokenizer is not None else tokenizer
    # Positional access: a DataLoader hands out positions 0..len-1, which only
    # match index labels when the frame's index happens to be 0..n-1.
    text = self.text.iloc[idx]
    item = encode(
        text,
        max_length = self.max_length,
        padding = "max_length",
        truncation=True,
        return_tensors = "pt",
        add_special_tokens=True,
        return_token_type_ids=False
        )
    if self.is_train:
      item['labels'] = torch.tensor(int(self.labels.iloc[idx]), dtype=torch.long)
    # return_tensors="pt" yields a leading batch axis of size 1; drop it so
    # the collator can stack items into a batch.
    item["input_ids"] = item["input_ids"].squeeze(0)
    item["attention_mask"] = item["attention_mask"].squeeze(0)
    return item

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.dataset)

In [None]:
# Wrap both splits in IndustryDataset. The names are rebound, so the underlying DataFrames are
# only reachable through the wrappers afterwards.
train_dataset = IndustryDataset(train_dataset)
eval_dataset = IndustryDataset(eval_dataset)

In [None]:
# Inspect one encoded training example.
train_dataset[0]

{'input_ids': tensor([    0,  7103,  2373,  2286, 20446,  2700,  2144,  2425,     2,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 

### Train

In [None]:
def compute_metrics(eval_pred):
    """Compute top-1 accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy, so the cell does not depend on
    ``datasets.load_metric``, which recent ``datasets`` releases no longer provide.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, classes on
            the last axis) and ``label_ids`` (true class ids).

    Returns:
        dict: ``{"accuracy": <fraction of rows whose arg-max class equals the label>}``.
    """
    logits = eval_pred.predictions
    labels = np.asarray(eval_pred.label_ids)
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}

In [None]:
# Release cached GPU memory that PyTorch is holding but not using.
torch.cuda.empty_cache()

In [None]:
# Load a saved checkpoint to resume training partway through a run
# model_name = "./checkpoint-12000"
# config = AutoConfig.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config).to(device)

In [None]:
import wandb

# Authenticate with Weights & Biases before the Trainer starts reporting to it.
wandb.login()

training_args = TrainingArguments(
    output_dir="./roberta",
    run_name="bert-base",
    report_to="wandb",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Evaluate and checkpoint on the same 3000-step schedule.
    evaluation_strategy='steps',
    eval_steps=3000,
    save_steps=3000,
    save_total_limit=3,
    # metric_for_best_model / greater_is_better are intentionally left unset.
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

[34m[1mwandb[0m: Currently logged in as: [33mjdg4661[0m (use `wandb login --relogin` to force relogin)


In [None]:
# Release cached GPU memory that PyTorch is holding but not using, right before training starts.
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning, then write the weights and config held by `model` to the result folder.
trainer.train()
model.save_pretrained('./roberta/result/best_model')

***** Running training *****
  Num examples = 800000
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 18750
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy
3000,0.4439,0.412586,0.89728
6000,0.3749,0.351092,0.909455
9000,0.3182,0.331955,0.91501
12000,0.2954,0.312545,0.917915
15000,0.2459,0.305876,0.92079


***** Running Evaluation *****
  Num examples = 200000
  Batch size = 128
Saving model checkpoint to ./roberta/checkpoint-3000
Configuration saved in ./roberta/checkpoint-3000/config.json
Model weights saved in ./roberta/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in ./roberta/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ./roberta/checkpoint-3000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200000
  Batch size = 128
Saving model checkpoint to ./roberta/checkpoint-6000
Configuration saved in ./roberta/checkpoint-6000/config.json
Model weights saved in ./roberta/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in ./roberta/checkpoint-6000/tokenizer_config.json
Special tokens file saved in ./roberta/checkpoint-6000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200000
  Batch size = 128
Saving model checkpoint to ./roberta/checkpoint-9000
Configuration saved in ./roberta/checkpoint-900

Step,Training Loss,Validation Loss,Accuracy
3000,0.4439,0.412586,0.89728
6000,0.3749,0.351092,0.909455
9000,0.3182,0.331955,0.91501
12000,0.2954,0.312545,0.917915
15000,0.2459,0.305876,0.92079
18000,0.2422,0.295553,0.922535


Saving model checkpoint to ./roberta/checkpoint-18000
Configuration saved in ./roberta/checkpoint-18000/config.json
Model weights saved in ./roberta/checkpoint-18000/pytorch_model.bin
tokenizer config file saved in ./roberta/checkpoint-18000/tokenizer_config.json
Special tokens file saved in ./roberta/checkpoint-18000/special_tokens_map.json
Deleting older checkpoint [roberta/checkpoint-9000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./roberta/checkpoint-18000 (score: 0.29555314779281616).
Configuration saved in ./roberta/result/best_model/config.json
Model weights saved in ./roberta/result/best_model/pytorch_model.bin


### Inference

In [None]:
# Build an unlabeled dataset from the test frame and run the trainer's model over it.
test_dataset = IndustryDataset(test, is_train=False)
outputs = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 100000
  Batch size = 128


In [None]:
# Take the highest-scoring class id per row and map it back through the label encoder to its code.
outputs = label_encoder.inverse_transform(np.argmax(outputs.predictions, axis=-1))

In [None]:
# Attach the predicted codes to the test rows.
test["label"] = outputs

In [None]:
# Display the test frame together with its predicted labels.
test

Unnamed: 0,AI_id,digit_1,digit_2,digit_3,text_obj,text_mthd,text_deal,text,label
0,id_000001,,,,치킨전문점에서,고객의주문에의해,치킨판매,치킨전문점에서고객의주문에의해치킨판매,561
1,id_000002,,,,산업공구,다른 소매업자에게,철물 수공구,산업공구다른 소매업자에게철물 수공구,466
2,id_000003,,,,절에서,신도을 대상으로,불교단체운영,절에서신도을 대상으로불교단체운영,949
3,id_000004,,,,영업장에서,고객요구로,자동차튜닝,영업장에서고객요구로자동차튜닝,952
4,id_000005,,,,실내포장마차에서,접객시설을 갖추고,"소주,맥주제공","실내포장마차에서접객시설을 갖추고소주,맥주제공",562
...,...,...,...,...,...,...,...,...,...
99995,id_099996,,,,사업장에서,일반인대상으로,버섯농장,사업장에서일반인대상으로버섯농장,472
99996,id_099997,,,,한의원에서,외래환자위주고,치료,한의원에서외래환자위주고치료,862
99997,id_099998,,,,일반점포에서,소비자에게,그림판매,일반점포에서소비자에게그림판매,478
99998,id_099999,,,,사업장에서,일반인.학생대상으로,학습공간제공,사업장에서일반인.학생대상으로학습공간제공,902


In [None]:
# Save the test frame, including the predicted label column, without the index.
test.to_csv("roberta_submission.csv", index=False)

In [None]:
# Re-read the predictions from the file the notebook itself writes (roberta_submission.csv), so a fresh
# top-to-bottom run does not depend on a leftover file. The CSV round trip also lets pandas re-infer a
# numeric dtype for the label column, which the integer arithmetic further down needs.
test = pd.read_csv("roberta_submission.csv")

In [None]:
# Map mid-level (2-digit) classification codes to their top-level section letter.

categories = {}
def set_keys(alpha, code_start, code_end):
  """Register ``alpha`` as the section letter of every integer code in [code_start, code_end]."""
  categories.update(dict.fromkeys(range(code_start, code_end + 1), alpha))

In [None]:
# (section letter, first 2-digit code, last 2-digit code), both ends inclusive.
section_ranges = [
  ("A", 1, 3),
  ("B", 5, 8),
  ("C", 10, 34),
  ("D", 35, 35),
  ("E", 36, 39),
  ("F", 41, 42),
  ("G", 45, 47),
  ("H", 49, 52),
  ("I", 55, 56),
  ("J", 58, 63),
  ("K", 64, 66),
  ("L", 68, 68),
  ("M", 70, 73),
  ("N", 74, 76),
  ("O", 84, 84),
  ("P", 85, 85),
  ("Q", 86, 87),
  ("R", 90, 91),
  ("S", 94, 96),
  ("T", 97, 98),
  ("U", 99, 99),
]
for section_letter, first_code, last_code in section_ranges:
  set_keys(section_letter, first_code, last_code)

In [None]:
# Copy the predicted 3-digit code into the answer sheet; floor-dividing by 10 drops its last digit,
# giving the 2-digit code.
submission["digit_3"] = test["label"]
submission["digit_2"] = submission["digit_3"] // 10

In [None]:
# Look up the section letter for every 2-digit code in one vectorized pass instead of one
# scalar .loc write per row.
section_letters = submission["digit_2"].map(categories)
unmapped_codes = submission.loc[section_letters.isna(), "digit_2"].unique()
if len(unmapped_codes) > 0:
  # An unknown code must fail loudly rather than leave a blank in the submission.
  raise KeyError(f"no section letter registered for digit_2 codes: {list(unmapped_codes)}")
submission["digit_1"] = section_letters
submission

Unnamed: 0,AI_id,digit_1,digit_2,digit_3,text_obj,text_mthd,text_deal
0,id_000001,I,56,561,치킨전문점에서,고객의주문에의해,치킨판매
1,id_000002,G,46,466,산업공구,다른 소매업자에게,철물 수공구
2,id_000003,S,94,949,절에서,신도을 대상으로,불교단체운영
3,id_000004,S,95,952,영업장에서,고객요구로,자동차튜닝
4,id_000005,I,56,562,실내포장마차에서,접객시설을 갖추고,"소주,맥주제공"
...,...,...,...,...,...,...,...
99995,id_099996,G,47,472,사업장에서,일반인대상으로,버섯농장
99996,id_099997,Q,86,862,한의원에서,외래환자위주고,치료
99997,id_099998,G,47,478,일반점포에서,소비자에게,그림판매
99998,id_099999,R,90,902,사업장에서,일반인.학생대상으로,학습공간제공


In [None]:
# Write the completed answer sheet without the index.
submission.to_csv("bert_base_epoch2.csv", index=False)