# [모의 캐글 - 게임] 비매너 댓글 식별 

- 자연어 multi label classification 과제

참고 논문 : 
- [BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding](https://arxiv.org/pdf/1810.04805.pdf)
- [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf)

# 1. 환경 설정 및 라이브러리 불러오기

In [3]:
#!pip install -r requirements.txt
#!pip install attrdict



In [4]:
import pandas as pd
import os
import json
import numpy as np
import shutil

from sklearn.metrics import f1_score
from datetime import datetime, timezone, timedelta
import random
from tqdm import tqdm


from attrdict import AttrDict
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils import *
from torch.optim import Adam, AdamW

from transformers import logging, get_linear_schedule_with_warmup


from transformers import ( 
    BertConfig,
    ElectraConfig,
    ElectraConfig
)

from transformers import (
    BertTokenizer,  
    AutoTokenizer,
    ElectraTokenizer,
)

from transformers import (
    BertModel,
    AutoModel, 
    ElectraForSequenceClassification,
    BertForSequenceClassification
)


In [7]:
# 사용할 GPU 지정
print("number of GPUs: ", torch.cuda.device_count())
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
use_cuda = torch.cuda.is_available()
print("Does GPU exist? : ", use_cuda)
DEVICE = torch.device("cuda" if use_cuda else "cpu")

number of GPUs:  1
Does GPU exist? :  True


In [50]:
# True 일 때 코드를 실행하면 example 등을 보여줌
DEBUG = True

# config.json

config.json = {
    
    "data_dir": "/USER/nlp/data",
    "result_dir": "/USER/nlp/result/",
    "config_dir": "/USER/nlp/exp_config/",
    
    # 모델 종류
    "pretrained_model": "beomi/kcbert-base",  
    
    # 
    "architecture": "BertForSequenceClassification",
    
    # tokenizer 종류
    "tokenizer_class": "BertTokenizer",
    
    # 분류하고자하는 클래스 수 (0, 1, 2, 3, 4, 5)
    "num_classes": 6,
    "max_seq_len": 128,    #
    "train_epochs": 10,
    "adam_epsilon": 1e-8,
    "seed": 42,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "learning_rate": 1e-6,
    "warmup_proportion": 0.1,
    "run": "demo0",
    "patience": 3"data_dir": "/USER/nlp/data",
    "result_dir": "/USER/nlp/result/",
    "config_dir": "/USER/nlp/exp_config/",
    
    "pretrained_model": "beomi/kcbert-base",  
    
    "architecture": "BertForSequenceClassification",
    
    "tokenizer_class": "BertTokenizer",
    
    "num_classes": 6,
    "max_seq_len": 128,
    "train_epochs": 10,
    "adam_epsilon": 1e-8,
    "seed": 42,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "learning_rate": 1e-6,
    "warmup_proportion": 0.1,
    "run": "demo0",
    "patience": 3
}

In [51]:
# config 파일 불러오기
config_path = os.path.join('config.json')

def set_config(config_path):
    if os.path.lexists(config_path):  # config_path가 실제 존재하면 -> true return
        with open(config_path) as f:
            args = AttrDict(json.load(f))
            print("config file loaded.")
            print(args.pretrained_model)
    else:
        assert False, 'config json file cannot be found.. please check the path again.'
    
    return args
    

# 코드 중간중간에 끼워넣어 리셋 가능
args = set_config(config_path)

# 결과 저장 폴더 미리 생성
os.makedirs(args.result_dir, exist_ok=True)
os.makedirs(args.config_dir, exist_ok=True)

config file loaded.
beomi/kcbert-base


# 2. EDA 및 데이터 전처리

In [52]:
# data 경로 설정  -> 데이터 경로가 올바른지 확인
train_path = os.path.join(args.data_dir,'train.csv')

print("train 데이터 경로가 올바른가요? : ", os.path.lexists(train_path))


train 데이터 경로가 올바른가요? :  True


### 2-1. Train 데이터 확인

In [53]:
train_df = pd.read_csv(train_path, encoding = 'UTF-8-SIG')

# train data 구조확인
train_df.head()

Unnamed: 0,title,comment,bias,hate
0,"""'미스터 션샤인' 변요한, 김태리와 같은 양복 입고 학당 방문! 이유는?""",김태리 정말 연기잘해 진짜,none,none
1,"""[SC현장]""""극사실주의 현실♥""""…'가장 보통의 연애' 김래원X공효진, 16년만...",공효진 발연기나이질생각이읍던데 왜계속주연일까,none,hate
2,"""손연재, 리듬체조 학원 선생님 """"하고 싶은 일 해서 행복하다""""""",누구처럼 돈만 밝히는 저급인생은 살아가지마시길~~ 행복은 머니순이 아니니깐 작은거에...,others,hate
3,"""'섹션TV' 김해숙 """"'허스토리' 촬영 후 우울증 얻었다""""""",일본 축구 져라,none,none
4,"""[단독] 임현주 아나운서 “‘노브라 챌린지’ 방송 덕에 낸 용기, 자연스런 논의의...",난 절대로 임현주 욕하는인간이랑은 안논다 @.@,none,none


In [54]:
# train data 개수 확인
len(train_df)

8367

In [55]:
# bias -> 3개 / hate -> 2개
print("bias classes: ", train_df.bias.unique())
print("hate classes: ", train_df.hate.unique())

bias classes:  ['none' 'others' 'gender']
hate classes:  ['none' 'hate']


In [56]:
# crosstab : 범주형 변수를 기준으로 개수 파악
pd.crosstab(train_df.bias, train_df.hate, margins=True)

hate,hate,none,All
bias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gender,1216,83,1299
none,2068,3422,5490
others,1437,141,1578
All,4721,3646,8367


### 2-2. Test 데이터 확인

In [57]:
test_path = os.path.join(args.data_dir,'test.csv')
print("test 데이터 경로가 올바른가요? : ", os.path.lexists(test_path))

test 데이터 경로가 올바른가요? :  True


In [58]:
test_df = pd.read_csv(test_path)
test_df.head()

Unnamed: 0,ID,title,comment
0,0,"류현경♥︎박성훈, 공개연애 4년차 애정전선 이상無..""의지 많이 된다""[종합]",둘다 넘 좋다~행복하세요
1,1,"""현금 유도+1인 1라면?""…'골목식당' 백종원, 초심 잃은 도시락집에 '경악' [종합]",근데 만원이하는 현금결제만 하라고 써놓은집 우리나라에 엄청 많은데
2,2,"입대 D-11' 서은광의 슬픈 멜로디..비투비, 눈물의 첫 체조경기장[콘서트 종합]",누군데 얘네?
3,3,"아이콘택트' 리쌍 길, 3년 전 결혼설 부인한 이유 공개…""결혼,출산 숨겼다""","쑈 하지마라 짜식아!음주 1번은 실수, 2번은 고의, 3번은 인간쓰레기다.슬금슬금 ..."
4,4,"구하라, 안검하수 반박 해프닝...""당당하다""vs""그렇게까지"" 설전 [종합]",안검하수 가지고 있는 분께 희망을 주고 싶은건가요? 수술하면 이렇게 자연스러워진다고...


In [59]:
len(test_df)

511

### 2-3. 데이터 전처리 (Label Encoding)
bias, hate 라벨들의 class를 정수로 변경하여 라벨 인코딩을 하기 위한 딕셔너리입니다.

- bias, hate 컬럼을 합쳐서 하나의 라벨로 만들기 

In [60]:
# 두 라벨의 가능한 모든 조합 만들기
combinations = np.array(np.meshgrid(train_df.bias.unique(), train_df.hate.unique())).T.reshape(-1,2)

if DEBUG==True:
    print(combinations)

[['none' 'none']
 ['none' 'hate']
 ['others' 'none']
 ['others' 'hate']
 ['gender' 'none']
 ['gender' 'hate']]


In [61]:
# bias, hate 컬럼을 합친 것
bias_hate = list(np.array([train_df['bias'].values, train_df['hate'].values]).T.reshape(-1,2))

if DEBUG==True:
    print(bias_hate[:5])


[array(['none', 'none'], dtype=object), array(['none', 'hate'], dtype=object), array(['others', 'hate'], dtype=object), array(['none', 'none'], dtype=object), array(['none', 'none'], dtype=object)]


In [62]:
# 정답(labels)값 형성 - > 0, 1, 2, 3, 4, 5 : 총 6개
labels = []
for i, arr in enumerate(bias_hate):
    for idx, elem in enumerate(combinations):
        if np.array_equal(elem, arr):
            labels.append(idx)

train_df['label'] = labels
train_df.head()

Unnamed: 0,title,comment,bias,hate,label
0,"""'미스터 션샤인' 변요한, 김태리와 같은 양복 입고 학당 방문! 이유는?""",김태리 정말 연기잘해 진짜,none,none,0
1,"""[SC현장]""""극사실주의 현실♥""""…'가장 보통의 연애' 김래원X공효진, 16년만...",공효진 발연기나이질생각이읍던데 왜계속주연일까,none,hate,1
2,"""손연재, 리듬체조 학원 선생님 """"하고 싶은 일 해서 행복하다""""""",누구처럼 돈만 밝히는 저급인생은 살아가지마시길~~ 행복은 머니순이 아니니깐 작은거에...,others,hate,3
3,"""'섹션TV' 김해숙 """"'허스토리' 촬영 후 우울증 얻었다""""""",일본 축구 져라,none,none,0
4,"""[단독] 임현주 아나운서 “‘노브라 챌린지’ 방송 덕에 낸 용기, 자연스런 논의의...",난 절대로 임현주 욕하는인간이랑은 안논다 @.@,none,none,0


In [96]:
# train_data 특수문자 제거 (title , comment)
train_df["title"] = train_df["title"].str.replace(pat=r'[^\w]', repl=r'', regex=True)
train_df["comment"] = train_df["comment"].str.replace(pat=r'[^\w]', repl=r'', regex=True)

# test_data 특수문자 제거 (title , comment)
test_df["title"] = test_df["title"].str.replace(pat=r'[^\w]', repl=r'', regex=True)
test_df["comment"] = test_df["comment"].str.replace(pat=r'[^\w]', repl=r'', regex=True)
# 자연어 처리 패키지에 보면 특수문자를 제거 해주는 메소드 찾아보기

In [97]:
#train_df.head()
test_df.head()

Unnamed: 0,ID,title,comment
0,0,류현경박성훈공개연애4년차애정전선이상無의지많이된다종합,둘다넘좋다행복하세요
1,1,현금유도1인1라면골목식당백종원초심잃은도시락집에경악종합,근데만원이하는현금결제만하라고써놓은집우리나라에엄청많은데
2,2,입대D11서은광의슬픈멜로디비투비눈물의첫체조경기장콘서트종합,누군데얘네
3,3,아이콘택트리쌍길3년전결혼설부인한이유공개결혼출산숨겼다,쑈하지마라짜식아음주1번은실수2번은고의3번은인간쓰레기다슬금슬금기어나올생각말고하던대로그...
4,4,구하라안검하수반박해프닝당당하다vs그렇게까지설전종합,안검하수가지고있는분께희망을주고싶은건가요수술하면이렇게자연스러워진다고눈감았다가뜨는동영상...


## 3. Dataset 로드

### 3-0. Pre-trained tokenizer 탐색

 sentencepiece, opennmt, huggingface

In [82]:
# config.json 에서 지정 이름별로 가져올 라이브러리 지정

TOKENIZER_CLASSES = {
    "BertTokenizer": BertTokenizer,
     "AutoTokenizer": AutoTokenizer,
    "ElectraTokenizer": ElectraTokenizer
}


- Tokenizer 사용 예시

In [83]:
TOKENIZER = TOKENIZER_CLASSES[args.tokenizer_class].from_pretrained(args.pretrained_model)
if DEBUG==True:
    print(TOKENIZER)

PreTrainedTokenizer(name_or_path='beomi/kcbert-base', vocab_size=30000, model_max_len=300, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [84]:
if DEBUG == True:
    example = train_df['title'][0]
    print(TOKENIZER(example))

{'input_ids': [2, 26176, 4533, 5250, 4118, 4318, 4040, 4047, 4656, 4048, 4038, 4196, 8036, 4237, 4232, 13669, 4087, 4081, 21550, 25127, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [85]:
if DEBUG==True:
    print(TOKENIZER.encode(example),"\n")
    
    # 토큰으로 나누기
    print(TOKENIZER.tokenize(example),"\n")
    
    # 토큰 id로 매핑하기
    print(TOKENIZER.convert_tokens_to_ids(TOKENIZER.tokenize(example)))


[2, 26176, 4533, 5250, 4118, 4318, 4040, 4047, 4656, 4048, 4038, 4196, 8036, 4237, 4232, 13669, 4087, 4081, 21550, 25127, 3] 

['미스터', '##션', '##샤', '##인', '##변', '##요', '##한', '##김', '##태', '##리', '##와', '##같은', '##양', '##복', '##입고', '##학', '##당', '##방문', '##이유는'] 

[26176, 4533, 5250, 4118, 4318, 4040, 4047, 4656, 4048, 4038, 4196, 8036, 4237, 4232, 13669, 4087, 4081, 21550, 25127]


### 3-1. Dataset 만드는 함수 정의

In [86]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len, mode = 'train'):

        self.data = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode
        
        if self.mode!='test':
            try: 
                self.labels = df['label'].tolist()
            except:
                assert False, 'CustomDataset Error : \'label\' column does not exist in the dataframe'
     
    def __len__(self):
        return len(self.data)
                

    def __getitem__(self, idx):
        """
        전체 데이터에서 특정 인덱스 (idx)에 해당하는 기사제목과 댓글 내용을 
        토크나이즈한 data('input_ids', 'attention_mask','token_type_ids')의 딕셔너리 형태로 불러옴
        """
        title = self.data.title.iloc[idx]
        comment = self.data.comment.iloc[idx]
        
        tokenized_text = self.tokenizer(title, comment,
                             padding= 'max_length',
                             max_length=self.max_len,
                             truncation=True,
                             return_token_type_ids=True,
                             return_attention_mask=True,
                             return_tensors = "pt")
        
        data = {'input_ids': tokenized_text['input_ids'].clone().detach().long(),
               'attention_mask': tokenized_text['attention_mask'].clone().detach().long(),
               'token_type_ids': tokenized_text['token_type_ids'].clone().detach().long(),
               }
        
        if self.mode != 'test':
            label = self.data.label.iloc[idx]
            return data, label
        else:
            return data
        

    
train_dataset = CustomDataset(train_df, TOKENIZER, args.max_seq_len, mode ='train')
print("train dataset loaded.")

train dataset loaded.


In [87]:
if DEBUG ==True :
    print("dataset sample : ")
    print(train_dataset[0])

dataset sample : 
({'input_ids': tensor([[    2, 26176,  4533,  5250,  4118,  4318,  4040,  4047,  4656,  4048,
          4038,  4196,  8036,  4237,  4232, 13669,  4087,  4081, 21550, 25127,
             3, 26217,  4038, 16223, 19625, 26745, 10073,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,    

In [88]:
print("dataset sample : ")
print(train_dataset[0])

dataset sample : 
({'input_ids': tensor([[    2, 26176,  4533,  5250,  4118,  4318,  4040,  4047,  4656,  4048,
          4038,  4196,  8036,  4237,  4232, 13669,  4087,  4081, 21550, 25127,
             3, 26217,  4038, 16223, 19625, 26745, 10073,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,    

In [None]:
# encoded_plus = tokenizer.encode_plus(
#                     sentence,                      # Sentence to encode.
#                     add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                     max_length = 128,           # Pad & truncate all sentences.
#                     pad_to_max_length = True,
#                     return_attention_mask = True,   # Construct attention masks.
#                     return_tensors = 'pt',     # Return pytorch tensors.
#                )

### 3-2. Train, Validation set 나누기

In [89]:
from sklearn.model_selection import train_test_split
                                                         
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=args.seed)

train_dataset = CustomDataset(train_data, TOKENIZER, args.max_seq_len, 'train')
val_dataset = CustomDataset(val_data, TOKENIZER, args.max_seq_len, 'validation')

print("Train dataset: ", len(train_dataset))
print("Validation dataset: ", len(val_dataset))

Train dataset:  7530
Validation dataset:  837


## 4. 분류 모델 학습을 위한 세팅

### 4-1. BertForSequenceClassification 설정


(https://huggingface.co/docs/transformers/v4.16.2/en/main_classes/configuration#transformers.PretrainedConfig.from_pretrained)

[PretrainedConfig](https://huggingface.co/transformers/v3.0.2/main_classes/configuration.html)


In [90]:
from transformers import logging
# 아래 코드를 쓰지않으면 출력값이 너무 길게 나옴 
logging.set_verbosity_error()

# config.json 에 입력된 architecture 에 따라 베이스 모델 설정
BASE_MODELS = {
    "BertForSequenceClassification": BertForSequenceClassification
}


myModel = BASE_MODELS[args.architecture].from_pretrained(args.pretrained_model, 
                                                         num_labels = args.num_classes, 
                                                         output_attentions = False, # Whether the model returns attentions weights.
                                                         output_hidden_states = True # Whether the model returns all hidden-states.
                                                        )
if DEBUG==True:
    # 모델 구조 확인
    print(myModel)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [91]:
# git 설치하고, github파일 다운로드 
# apt-get git
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

### 4-2. 모델 설정


BertForSequenceClassifier (line 1232부터 참고) [source code](https://github.com/huggingface/transformers/blob/a39dfe4fb122c11be98a563fb8ca43b322e01036/src/transformers/modeling_bert.py#L1284-L1287)



In [92]:
class myClassifier(nn.Module):
    def __init__(self, model, hidden_size = 768, num_classes=args.num_classes, dr_rate=None, params=None):
        super(myClassifier, self).__init__()
        self.model = model
        self.softmax = nn.Softmax(dim=1)

    def forward(self, token_ids, attention_mask, segment_ids):      
        outputs = self.model(input_ids = token_ids, 
                             token_type_ids = segment_ids.long(), 
                             attention_mask = attention_mask.float().to(token_ids.device))
         
        logits = outputs.logits
        output = self.softmax(logits)
        return output
        
model = myClassifier(myModel, dr_rate=0.1)

# if DEBUG ==True :
#     print(model)

### 4-3. 모델 구성 확인

In [78]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

model.bert.embeddings.word_embeddings.weight            (30000, 768)
model.bert.embeddings.position_embeddings.weight          (300, 768)
model.bert.embeddings.token_type_embeddings.weight          (2, 768)
model.bert.embeddings.LayerNorm.weight                        (768,)
model.bert.embeddings.LayerNorm.bias                          (768,)

==== First Transformer ====

model.bert.encoder.layer.0.attention.self.query.weight    (768, 768)
model.bert.encoder.layer.0.attention.self.query.bias          (768,)
model.bert.encoder.layer.0.attention.self.key.weight      (768, 768)
model.bert.encoder.layer.0.attention.self.key.bias            (768,)
model.bert.encoder.layer.0.attention.self.value.weight    (768, 768)
model.bert.encoder.layer.0.attention.self.value.bias          (768,)
model.bert.encoder.layer.0.attention.output.dense.weight   (768, 768)
model.bert.encoder.layer.0.attention.output.dense.bias        

## 5. 학습 진행

### 5-0. Early Stopper 함수 정의

In [93]:
class LossEarlyStopper():
    """Early stopper

        patience (int): loss가 줄어들지 않아도 학습할 epoch 수
        patience_counter (int): loss 가 줄어들지 않을 때 마다 1씩 증가
        min_loss (float): 최소 loss
        stop (bool): True 일 때 학습 중단

    """

    def __init__(self, patience: int)-> None:
        """ 초기화

        Args:
            patience (int): loss가 줄어들지 않아도 학습할 epoch 수
            weight_path (str): weight 저장경로
            verbose (bool): 로그 출력 여부, True 일 때 로그 출력
        """
        self.patience = patience
        self.patience_counter = 0
        self.min_loss = np.Inf
        self.stop = False

    def check_early_stopping(self, loss: float)-> None:
        msg = ''
        # 첫 에폭
        if self.min_loss == np.Inf:
            self.min_loss = loss
           
        # loss가 줄지 않는다면 -> patience_counter 1 증가
        elif loss > self.min_loss:
            self.patience_counter += 1
            msg = f"Early stopping counter {self.patience_counter}/{self.patience}"

            # patience 만큼 loss가 줄지 않았다면 학습을 중단합니다.
            if self.patience_counter == self.patience:
                self.stop = True
  
        # loss가 줄어듬 -> min_loss 갱신, patience_counter 초기화
        elif loss <= self.min_loss:
            self.patience_counter = 0
            self.save_model = True
            msg = f"Validation loss decreased {self.min_loss} -> {loss}"
            self.min_loss = loss

        print(msg)

### 5-1. Epoch 별 학습 및 검증

- Adam optimizer의 epsilon 파라미터 eps = 1e-8 는 "계산 중 0으로 나눔을 방지 하기 위한 아주 작은 숫자 " 입니다. ([출처](https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/))
- `warmup_ratio` : 
  - 학습이 진행되면서 학습률을 그 상황에 맞게 가변적으로 적당하게 변경되게 하기 위해 Scheduler를 사용합니다.
  - 처음 학습률(Learning rate)를 warm up하기 위한 비율을 설정하는 warmup_ratio을 설정합니다.

In [None]:
import wandb
wandb login
wandb

In [94]:
# args = set_config(config_path)

logging.set_verbosity_warning()

# 재현을 위해 모든 곳의 시드 고정
seed_val = args.seed
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def train(model, train_data, val_data, args, mode = 'train'):
    
    # args.run은 실험 이름 (어디까지나 팀원들간의 버전 관리 및 공유 편의를 위한 것으로, 자유롭게 수정 가능합니다.)
    print("RUN : ", args.run)
    shutil.copyfile("config.json", os.path.join(args.config_dir, f"config_{args.run}.json"))

    early_stopper = LossEarlyStopper(patience=args.patience)
    
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=args.train_batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=args.train_batch_size)

    DEBUG=False
    
    if DEBUG == True:
        # 데이터로더가 성공적으로 로드 되었는지 확인
        for idx, data in enumerate(train_dataloader):
            if idx==0:
                print("batch size : ", len(data[0]['input_ids']))
                print("The first batch looks like ..\n", data[0])
    
    
    criterion = nn.CrossEntropyLoss()
    
    total_steps = len(train_dataloader) * args.train_epochs

    optimizer = Adam(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(total_steps * args.warmup_proportion), num_training_steps=total_steps)

    
    if use_cuda:
        model = model.to(DEVICE)
        criterion = criterion.to(DEVICE)
        
    model.train()
    optimizer.zero_grad()

    tr_loss = 0.0
    val_loss = 0
    best_score = 0
      

    for epoch_num in range(args.train_epochs):

            total_acc_train = 0
            total_loss_train = 0
            
            assert mode in ['train', 'val'], 'your mode should be either \'train\' or \'val\''
            
            if mode =='train':
                for train_input, train_label in tqdm(train_dataloader):
                    
                    mask = train_input['attention_mask'].to(DEVICE)
                    input_id = train_input['input_ids'].squeeze(1).to(DEVICE)
                    segment_ids = train_input['token_type_ids'].squeeze(1).to(DEVICE)
                    train_label = train_label.long().to(DEVICE)                

                    output = model(input_id, mask, segment_ids)
                    
                    batch_loss = criterion(output.view(-1,6), train_label.view(-1))
                    total_loss_train += batch_loss.item()

                    acc = (output.argmax(dim=1) == train_label).sum().item()
                    total_acc_train += acc

                    model.zero_grad()
                    batch_loss.backward()
                    optimizer.step()
                    

            total_acc_val = 0
            total_loss_val = 0
            
            # validation을 위해 이걸 넣으면 이 evaluation 프로세스 중엔 dropout 레이어가 다르가 동작한다.
            model.eval()
            
            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    mask = val_input['attention_mask'].to(DEVICE)
                    input_id = val_input['input_ids'].squeeze(1).to(DEVICE)
                    segment_ids = val_input['token_type_ids'].squeeze(1).to(DEVICE)
                    val_label = val_label.long().to(DEVICE)

                    output = model(input_id, mask, segment_ids)

                    batch_loss = criterion(output.view(-1,6), val_label.view(-1))
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            
            train_loss = total_loss_train / len(train_data)
            train_accuracy = total_acc_train / len(train_data)
            val_loss = total_loss_val / len(val_data)
            val_accuracy = total_acc_val / len(val_data)
            
            # 한 Epoch 학습 후 학습/검증에 대해 loss와 평가지표 (여기서는 accuracy로 임의로 설정) 출력
            print(
                f'Epoch: {epoch_num + 1} \
                | Train Loss: {train_loss: .3f} \
                | Train Accuracy: {train_accuracy: .3f} \
                | Val Loss: {val_loss: .3f} \
                | Val Accuracy: {val_accuracy: .3f}')
          
            # early_stopping check
            early_stopper.check_early_stopping(loss=val_loss)

            if early_stopper.stop:
                print('Early stopped, Best score : ', best_score)
                break

            if val_accuracy > best_score:
            # 모델이 개선됨 -> 검증 점수와 weight 갱신
                best_score = val_accuracy
                
                # 학습된 모델을 저장할 디렉토리 및 모델 이름 지정
                SAVED_MODEL =  os.path.join(args.result_dir, f'best_{args.run}.pt')
            
                check_point = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }
                torch.save(check_point, SAVED_MODEL)  
              


            


train(model, train_dataset, val_dataset, args, mode = 'train')

RUN :  demo0


100% 236/236 [02:29<00:00,  1.58it/s]


Epoch: 1                 | Train Loss:  0.056                 | Train Accuracy:  0.292                 | Val Loss:  0.057                 | Val Accuracy:  0.327



100% 236/236 [02:41<00:00,  1.46it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 2                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:43<00:00,  1.44it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 3                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:43<00:00,  1.44it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 4                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:43<00:00,  1.44it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 5                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:44<00:00,  1.44it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 6                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:44<00:00,  1.44it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 7                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:44<00:00,  1.43it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 8                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:44<00:00,  1.43it/s]
  0% 0/236 [00:00<?, ?it/s]

Epoch: 9                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


100% 236/236 [02:44<00:00,  1.43it/s]


Epoch: 10                 | Train Loss:  0.056                 | Train Accuracy:  0.302                 | Val Loss:  0.057                 | Val Accuracy:  0.327
Validation loss decreased 0.05710026425984883 -> 0.05710026425984883


## 6. Test dataset으로 추론 (Prediction)

In [99]:
from torch.utils.data import DataLoader

# 테스트 데이터셋 불러오기
test_data = CustomDataset(test_df, tokenizer = TOKENIZER, max_len= args.max_seq_len, mode='test')

def test(model, SAVED_MODEL, test_data, args, mode = 'test'):


    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=args.eval_batch_size)


    if use_cuda:

        model = model.to(DEVICE)
        model.load_state_dict(torch.load(SAVED_MODEL)['model'])


    model.eval()

    pred = []

    with torch.no_grad():
        for test_input in test_dataloader:

            mask = test_input['attention_mask'].to(DEVICE)
            input_id = test_input['input_ids'].squeeze(1).to(DEVICE)
            segment_ids = test_input['token_type_ids'].squeeze(1).to(DEVICE)

            output = model(input_id, mask, segment_ids)

            output = output.argmax(dim=1).cpu().tolist()

            for label in output:
                pred.append(label)
                
    return pred

SAVED_MODEL =  os.path.join(args.result_dir, f'best_{args.run}.pt')

pred = test(model, SAVED_MODEL, test_data, args)

In [100]:
print("prediction completed for ", len(pred), "comments")


prediction completed for  511 comments


### 

In [101]:
# 0-5 사이의 라벨 값 별로 bias, hate로 디코딩 하기 위한 딕셔너리
bias_dict = {0: 'none', 1: 'none', 2: 'others', 3:'others', 4:'gender', 5:'gender'}
hate_dict = {0: 'none', 1: 'hate', 2: 'none', 3:'hate', 4:'none', 5:'hate'}

# 인코딩 값으로 나온 타겟 변수를 디코딩
pred_bias = ['' for i in range(len(pred))]
pred_hate = ['' for i in range(len(pred))]

for idx, label in enumerate(pred):
    pred_bias[idx]=(str(bias_dict[label]))
    pred_hate[idx]=(str(hate_dict[label]))
print('decode Completed!')



decode Completed!


In [103]:
submit = pd.read_csv(os.path.join(args.data_dir,'sample_submission.csv'))

submit['bias'] = pred_bias
submit['hate'] = pred_hate
submit

Unnamed: 0,ID,bias,hate
0,0,none,none
1,1,none,hate
2,2,none,none
3,3,none,none
4,4,none,hate
...,...,...,...
506,506,none,none
507,507,none,hate
508,508,none,hate
509,509,none,hate


In [104]:
submit.to_csv(os.path.join(args.result_dir, f"submission_{args.run}.csv"), index=False)