## 모델구성 과정의 실험
- 최종모델을 만들어나가는 과정에서 사용을 고려했던 코드 목록
- 해당 코드들은 최종예측에는 사용되지 않았으나 실험에 사용되었음

### 전처리 ver1
- 초기 전처리 과정에서 data_cleansing_text 함수 사용
- 최종예측에서는 추가적인 전처리 작업 및 맞춤법 교정을 제외함

In [None]:
from hanspell import spell_checker
from hanspell.constants import CheckResult

In [None]:
def data_cleansing_text(df,col_lst):
    """Clean the text columns named in ``col_lst`` and return ``df``.

    Every listed column is overwritten on the frame that was passed in.
    Steps, in order: fill nulls with "", strip, drop ``/`` or ``$`` between
    two lowercase letters, drop ``^``, turn selected punctuation into spaces,
    convert area units (㎡ / m2 / M2) to "제곱미터", delete digits and hyphens,
    collapse runs of spaces, delete stand-alone Hangul jamo, and finally run
    the hanspell spell checker.

    Args:
        df: DataFrame holding the text columns.
        col_lst: iterable of column names to clean.

    Returns:
        The same ``df`` object with the cleaned columns.
    """
    import re  # local import so the function does not rely on a later cell

    for col in col_lst:
        # Fill missing values so every entry is a string.
        df[col] = df[col].fillna("")

        # Strip leading/trailing whitespace.
        df[col] = df[col].apply(lambda x : x.strip())

        # Lower-casing is deliberately left disabled.
        #df[col] = df[col].apply(lambda x : x.lower())

        # Remove "/" or "$" that sits between two lowercase letters.
        df[col] = df[col].str.replace(pat="(?<=[a-z])[/$](?=[a-z])",repl = "",regex=True)

        # Remove "^". regex=False makes this a literal replacement; as a
        # regex, "^" is only a start-of-string anchor and removes nothing.
        df[col] = df[col].str.replace("^", "", regex=False)

        # Replace selected punctuation with a space. Inside the class, "+-,"
        # is a range covering "+" and ","; "-" itself is removed together
        # with the digits below. Parentheses are kept on purpose.
        df[col] = df[col].apply(lambda x : re.sub(r'[!@#$%&*<>?\|+-,./]', ' ', x))

        # Convert area units to Korean text. This must run BEFORE the digit
        # removal below, otherwise the "2" of "m2"/"M2" is already gone and
        # the patterns can never match.
        df[col] = df[col].apply(lambda x : re.sub('㎡',"제곱미터",x))
        df[col] = df[col].apply(lambda x : re.sub('m2',"제곱미터",x))
        df[col] = df[col].apply(lambda x : re.sub('M2',"제곱미터",x))

        # Remove digits and hyphens.
        df[col] = df[col].apply(lambda x : re.sub(r'[0-9-]','',x))

        # Collapse runs of spaces into one.
        df[col] = df[col].apply(lambda x : re.sub(' +', ' ', x))

        # Remove stand-alone Hangul consonants/vowels (jamo).
        df[col] = df[col].apply(lambda x : re.sub('([ㄱ-ㅎㅏ-ㅣ]+)',"",x))

        # Spell-check with hanspell and keep the corrected text.
        df[col] = df[col].apply(lambda x : spell_checker.check(x).checked)
    return df

### 중복처리
- 텍스트와 label이 모두 중복되는 데이터 30만개이상 발견
- 텍스트만 중복되고 label 같지 않은 데이터 발견
- 해당 데이터들을 제외하고 모델링하는 것이 성능에 긍정적이지 않기에 전처리하지 않음

In [None]:
# Load the preprocessed training CSV from the mounted Google Drive folder
import pandas as pd
train_preprocessed = pd.read_csv("/content/MyDrive/MyDrive/test/train_preprocessed_ver2_no_lower_no_ques.csv")

In [None]:
# Trim surrounding whitespace so identical documents compare equal.
train_preprocessed["document"] = train_preprocessed["document"].map(lambda doc: doc.strip())

# Pass 1: rows that repeat both document and label -> keep the first one.
train_preprocessed = (
    train_preprocessed
    .drop_duplicates(subset=["document", "label"])
    .reset_index(drop=True)
)

# Pass 2: documents that still occur more than once (i.e. with different
# labels) -> drop every occurrence, then sort by document.
train_preprocessed = (
    train_preprocessed
    .drop_duplicates(subset=["document"], keep=False)
    .sort_values("document")
    .reset_index(drop=True)
)

### 역번역(Back Translation)을 통한 데이터 증강(Data Augmentation)
- 예측할 label(표준산업분류코드)이 매우 불균형(imbalance)한 것을 EDA를 통해 확인함
- 한글 -> 영어 -> 한글로 번역 과정을 통해 희소한 label을 증강하려고 함
- 성능에 긍정적인 영향을 주지 못해 역번역을 통한 데이터 증강 중지
- label 개수가 230개 미만인 Q1 데이터에 대해서 역번역 시도

In [None]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
import pandas as pd
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from tqdm import tnrange
from urllib.request import urlopen
import re
import requests
import urllib.request
from tqdm import tqdm

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import sys
import numpy as np
import pandas as pd
import random
from tqdm.notebook import tqdm

In [None]:
def chrome_setting():
  """Create and return a headless Chrome WebDriver.

  The browser is started with the --headless, --no-sandbox and
  --disable-dev-shm-usage flags, using the 'chromedriver' executable.
  """
  options = webdriver.ChromeOptions()
  for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(flag)
  return webdriver.Chrome('chromedriver', options=options)

In [None]:
# Module-level WebDriver; kor_to_trans / trans_to_kor read it as a global.
driver=chrome_setting()

In [None]:
# Mount Google Drive at /content/MyDrive
from google.colab import drive
drive.mount('/content/MyDrive')

In [None]:
# Load the preprocessed train/test CSV files from the mounted Drive folder
import pandas as pd
train_preprocessed = pd.read_csv("/content/MyDrive/MyDrive/test/train_preprocessed_ver2_no_lower_no_ques.csv")
test_preprocessed = pd.read_csv("/content/MyDrive/MyDrive/test/test_preprocessed_ver2_no_lower_no_ques.csv")

In [None]:
# Collect the rows to back-translate: every row whose label occurs
# fewer than 230 times in the training data.
augment_label = (
    train_preprocessed["label"]
    .value_counts()
    .loc[lambda counts: counts < 230]
    .index
)
mask = train_preprocessed["label"].isin(augment_label)
augment_preprocessed = train_preprocessed.loc[mask].reset_index(drop=True)
augment_preprocessed.head()

In [2]:
# Translate Korean text into another language through the Papago web page

def kor_to_trans(text_data, trans_lang,start_index,final_index):
    """Translate ``text_data[start_index:final_index]`` from Korean.

    Each text is sent to Papago through the module-level Selenium ``driver``
    and the content of the ``txtTarget`` element is read back.

    Args:
        text_data: index-addressable collection of Korean strings
            (accessed as ``text_data[i]``).
        trans_lang: Papago target-language code, e.g. ``'en'``.
        start_index: first index to translate (inclusive).
        final_index: last index to translate (exclusive).

    Returns:
        list[str]: one translation per index; ``''`` where the lookup failed.
    """
    from urllib.parse import quote  # local import; used to escape the query text

    trans_list = []
    target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))
    # Initialised up front: the progress print below runs before the first
    # translation when start_index is a non-zero multiple of 99.
    backtrans = ''

    for i in tqdm(range(start_index,final_index)):

        # Every 99th item: pause a little longer and show the latest result.
        if i != 0 and i % 99 == 0:
            time.sleep(2)
            print('{}th : '.format(i), backtrans)

        try:
            # quote() keeps characters such as '&', '#' or '+' inside the
            # text from being parsed as URL syntax.
            driver.get('https://papago.naver.com/?sk=ko&tk='+trans_lang+'&st='+quote(text_data[i]))
            time.sleep(1.5)
            element=WebDriverWait(driver, 10).until(target_present)
            time.sleep(0.1)
            backtrans = element.text

            # Result box still empty: wait for the element once more and re-read.
            if backtrans in ('', ' '):
                element=WebDriverWait(driver, 10).until(target_present)
                backtrans = element.text
            trans_list.append(backtrans)

        except Exception:
            # Best effort: keep list positions aligned with the input.
            # (Exception, not a bare except, so Ctrl-C still stops the loop.)
            trans_list.append('')

    return trans_list

In [3]:
# Translate text in another language back into Korean through Papago

def trans_to_kor(transed_list, transed_lang,start_index,final_index):
    """Translate ``transed_list[start_index:final_index]`` into Korean.

    Args:
        transed_list: index-addressable collection of strings written in
            ``transed_lang`` (accessed as ``transed_list[i]``).
        transed_lang: Papago language code of the input text, e.g. ``'en'``.
        start_index: first index to translate (inclusive).
        final_index: last index to translate (exclusive).

    Returns:
        list[str]: one Korean translation per index; ``''`` on failure.
    """
    from urllib.parse import quote  # local import; used to escape the query text

    trans_list = []
    target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))
    # Initialised up front: the progress print below runs before the first
    # translation when start_index is a non-zero multiple of 99.
    backtrans = ''

    for i in tqdm(range(start_index,final_index)):

        # Every 99th item: pause a little longer and show the latest result.
        if i != 0 and i % 99 == 0:
            time.sleep(1.5)
            print('{}th : '.format(i), backtrans)

        try:
            # Source language is transed_lang and the target is Korean. The
            # previous URL (sk=en, tk=transed_lang) requested en -> en when
            # called with 'en', so nothing was translated back.
            driver.get('https://papago.naver.com/?sk='+transed_lang+'&tk=ko&st='+quote(transed_list[i]))
            time.sleep(2)
            element=WebDriverWait(driver, 10).until(target_present)
            time.sleep(0.2)
            backtrans = element.text

            # Result box still empty: wait for the element once more and re-read.
            if backtrans in ('', ' '):
                element=WebDriverWait(driver, 10).until(target_present)
                backtrans = element.text
            trans_list.append(backtrans)

        except Exception:
            # Best effort: keep list positions aligned with the input.
            trans_list.append('')
    return trans_list

In [None]:
# Korean -> English translation
trans_list = kor_to_trans(augment_preprocessed['document'], 'en',0,len(augment_preprocessed))
# Store the English translations (the previous code assigned the whole
# DataFrame to the column instead of the translated list).
augment_preprocessed["eng"] = trans_list

# English -> Korean back-translation; the result is left in `trans_list`.
trans_list = trans_to_kor(augment_preprocessed['eng'], 'en',0,len(augment_preprocessed))

### FocalLoss 함수
- label이 불균형한 데이터에 더 효과적인 FocalLoss 함수를 손실함수로 적용
- CrossEntropyLoss 대신 FocalLoss 사용 시, 예측성능이 낮아지기에 CrossEntropyLoss 채택

In [None]:
class FocalLoss(nn.Module):
    """Multi-class focal loss: ``alpha * (1 - pt) ** gamma * CE``.

    ``pt = exp(-CE)`` is the probability the model assigns to the true class,
    so confidently-correct samples are down-weighted. The weighting is applied
    per sample; the previous version applied it to the batch-mean
    cross-entropy, which made ``reduce=False`` return a scalar too.

    Args:
        alpha: constant scale applied to every sample's loss.
        gamma: focusing exponent; 0 gives ``alpha * cross-entropy``.
        logits: stored for interface compatibility; ``forward`` does not read it.
        reduce: if True return the mean over samples, else per-sample losses.
    """

    def __init__(self, alpha = 0.25, gamma=2, logits=False, reduce=True):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (class scores) and ``targets`` (class indices)."""
        # reduction='none' keeps one cross-entropy value per sample.
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-ce_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

        if self.reduce:
            return torch.mean(F_loss)
        return F_loss

In [None]:
# Load the BERT classifier and move it to the target device
model = BERTClassifier(bertmodel,  dr_rate=0.2).to(device)

# Configure the optimizer and learning-rate schedule.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get weight_decay 0.0;
# all other parameters get weight_decay 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# FocalLoss is the loss under test here; the CrossEntropyLoss alternative is kept below.
loss_fn = FocalLoss()
#loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps and the share of them used for warm-up.
t_total = len(train_dataloader_1) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper for measuring accuracy
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: tensor of true class indices, one per sample.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    return n_correct / predicted.size()[0]
    
#train_dataloader

### KLUE-BERT
- 한국어 pre-trained 모델이며 벤치마크 데이터셋을 가진 KLUE-BERT 계열 모델(아래 코드에서는 klue/roberta-* 체크포인트 사용)로 예측 진행
- small/base/large 등 다양한 크기의 모델로 실험 진행
- 예측성능이 KoBERT 모델보다 높지 않기에 KoBERT를 최종모델로 채택

In [None]:
import tensorflow as tf
import os

# Resolve the Colab TPU from the COLAB_TPU_ADDR environment variable
# (raises KeyError if the variable is not set).
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])

# Connect to the TPU cluster and initialise the TPU system.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

# Distribution strategy built on the resolved TPU.
strategy = tf.distribute.TPUStrategy(resolver)

pip install transformers

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import urllib.request
from sklearn import preprocessing
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import transformers

In [None]:
# Load the preprocessed train/test CSV files (paths relative to the working directory)
import pandas as pd
train_preprocessed = pd.read_csv("test/train_preprocessed_ver2_no_lower_no_ques.csv")
test_preprocessed = pd.read_csv("test/test_preprocessed_ver2_no_lower_no_ques.csv")

In [None]:
# Drop labels that occur only once (they cannot be stratified in the split).
except_label = (
    train_preprocessed["label"]
    .value_counts()
    .loc[lambda counts: counts < 2]
    .index
)
mask = train_preprocessed["label"].isin(except_label)
train_preprocessed = train_preprocessed.loc[~mask].reset_index(drop=True)

In [None]:
# Encode the string labels as consecutive integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Fit and transform in one call; `le` is kept to decode predictions later.
LABEL = le.fit_transform(train_preprocessed["label"])
train_preprocessed["encoded_cat"] = LABEL

In [None]:
# Load the KLUE tokenizer (klue/bert-base checkpoint)
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

In [None]:
# Convert raw texts and labels into fixed-length model inputs
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    """Encode ``examples`` into padded id/mask/segment arrays.

    Args:
        examples: sized iterable of input texts.
        labels: iterable of integer labels, aligned with ``examples``.
        max_seq_len: length every encoded sequence is padded/truncated to.
        tokenizer: tokenizer providing ``encode`` and ``pad_token_id``.

    Returns:
        ``((input_ids, attention_masks, token_type_ids), data_labels)`` where
        the three input arrays have one row of ``max_seq_len`` values per
        example and ``data_labels`` is an int32 array.
    """
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        # Explicit padding/truncation: every sequence comes back with exactly
        # max_seq_len ids instead of relying on the tokenizer's implicit
        # default and the deprecated pad_to_max_length flag.
        input_id = tokenizer.encode(example, max_length=max_seq_len, padding="max_length", truncation=True)
        # The mask is 1 for real tokens and 0 for the trailing pad tokens.
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        # Single-segment input: all segment ids are 0.
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [None]:
# Split into training and validation sets (80/20, stratified on the encoded label)
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(train_preprocessed['document'], train_preprocessed['encoded_cat'],
                                                  test_size = 0.2, 
                                                  random_state = 777,
                                                  stratify = train_preprocessed['encoded_cat'],
                                                  shuffle = True)

# Convert both splits to model inputs; train_X/val_X are re-bound from
# text Series to (input_ids, attention_masks, token_type_ids) tuples.
max_seq_len = 64

train_X, train_y = convert_examples_to_features(train_X, train_y, max_seq_len=max_seq_len, tokenizer=tokenizer)
val_X, val_y = convert_examples_to_features(val_X, val_y, max_seq_len=max_seq_len, tokenizer=tokenizer)

In [None]:
import transformers
from transformers import BertTokenizer,AdamWeightDecay,TFRobertaModel,TFBertModel
# Checkpoint size under test; the large/small alternatives are kept commented out.
# NOTE(review): `model` is re-bound to Klue_RobertaClassifier in a later cell.
#model = TFRobertaModel.from_pretrained("klue/roberta-large", from_pt=True)
#model = TFRobertaModel.from_pretrained("klue/roberta-small", from_pt=True)
model = TFRobertaModel.from_pretrained("klue/roberta-base", from_pt=True)

In [None]:
# Define the three int32 input layers (ids, attention mask, token types),
# each of length max_seq_len, and pass them through the loaded model.
# NOTE(review): `outputs` is not referenced again in the visible cells.
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])

In [None]:
# Classification model
class Klue_RobertaClassifier(tf.keras.Model):
    """klue/roberta-base encoder followed by one dense classification layer.

    Args:
        num_class: number of output units of the classification layer.
    """

    def __init__(self, num_class):
        super().__init__()
        # Alternative checkpoints tried: "klue/roberta-large", "klue/roberta-small".
        self.bert = TFRobertaModel.from_pretrained("klue/roberta-base", from_pt=True)
        kernel_init = tf.keras.initializers.TruncatedNormal(
            self.bert.config.initializer_range, seed=42
        )
        self.classifier = tf.keras.layers.Dense(
            num_class, kernel_initializer=kernel_init, name='classifier'
        )

    def call(self, inputs):
        """Return the class scores for ``inputs = (input_ids, attention_mask, token_type_ids)``."""
        input_ids, attention_mask, token_type_ids = inputs
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The second element of the encoder output feeds the classifier.
        return self.classifier(encoded[1])

In [None]:
# Derive the number of output units from the fitted LabelEncoder instead of
# hard-coding it, so it always matches the labels left after filtering.
NUM_CLASSES = len(le.classes_)
model = Klue_RobertaClassifier(num_class=NUM_CLASSES)

In [None]:
# AdamWeightDecay with learning rate 1e-5 and weight-decay rate 1e-4.
optimizer = AdamWeightDecay(1e-5,weight_decay_rate=1e-4)
# from_logits=True: the classifier's Dense layer has no activation.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics = [metric])

In [None]:
# EarlyStopping to guard against overfitting.
# NOTE(review): patience=2 while the fit call below runs epochs=2, so this
# looks like it can never trigger — confirm.
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)

In [None]:
# Save only the weights with the best val_accuracy into the Drive folder.
# NOTE(review): the file name says "roberta_large" while the active checkpoint
# in Klue_RobertaClassifier is "klue/roberta-base" — confirm the name.
model_path = F"/content/MyDrive/MyDrive/test/"
checkpoint_path = os.path.join(model_path,'weight_klue_roberta_large_add.h5')
cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

In [None]:
# Train for 2 epochs with batch size 64, validating on (val_X, val_y)
model.fit(train_X, train_y, epochs=2, batch_size=64, validation_data = (val_X,val_y), callbacks=[earlystop_callback, cp_callback])

In [None]:
# Run prediction. The test set has no labels, so a dummy 0 label is used
# only to satisfy convert_examples_to_features.
test_preprocessed["label"] = 0
test_X, test_y = convert_examples_to_features(test_preprocessed['document'], test_preprocessed['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)
preds = model.predict(test_X)

# Decode the predictions: take the highest-scoring class per row (this step
# was missing, leaving `pred_label` undefined), then map the encoded ids
# back to the original labels.
pred_label = np.argmax(preds, axis=1)
PRED_LABEL = le.inverse_transform(pred_label)
test_preprocessed["label"] = PRED_LABEL

### DistilKoBERT + 딥러닝 모델
- DistilKoBERT 모델에 딥러닝 모델을 얹어서 실험
- 예측성능이 KoBERT 모델보다 높지 않기에 KoBERT를 최종모델로 채택

In [None]:
my_path = '/content/drive/MyDrive/additional_package'

!pip install --target=$my_path transformers # transformers 대신에 원하는 패키지 이름을 넣으시면 됩니다

import sys
sys.path.append('/content/drive/MyDrive/additional_package')

import os
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm # Progress Bar
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import transformers
from transformers import DistilBertTokenizer, TFDistilBertModel, DistilBertConfig
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import warnings
from transformers import logging as hf_logging
hf_logging.set_verbosity_error() # Hidding Huggingface Warnings
warnings.filterwarnings("ignore")

# Enable memory growth on every visible GPU and report how many physical
# and logical GPUs are available.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

In [None]:
# Drop every row whose label occurs fewer than 230 times, then display the frame.
# NOTE(review): this runs before the CSV load cell that follows and repeats the
# filter applied after it — it looks like a leftover cell; confirm.
except_label = train_preprocessed["label"].value_counts()[train_preprocessed["label"].value_counts() < 230].index
mask = train_preprocessed['label'].isin(except_label)
train_preprocessed = train_preprocessed[~mask].reset_index(drop=True)
train_preprocessed

In [None]:
# Load the preprocessed train/test CSV files (paths relative to the working directory)
import pandas as pd
train_preprocessed = pd.read_csv("test/train_preprocessed_ver2_no_lower_no_ques.csv")
test_preprocessed = pd.read_csv("test/test_preprocessed_ver2_no_lower_no_ques.csv")

In [None]:
# Remove every row whose label occurs fewer than 230 times.
except_label = (
    train_preprocessed["label"]
    .value_counts()
    .loc[lambda counts: counts < 230]
    .index
)
mask = train_preprocessed["label"].isin(except_label)
train_preprocessed = train_preprocessed.loc[~mask].reset_index(drop=True)

In [None]:
# Split the training data 8:2 into a train set and a validation set:
# sample 80% of the rows (seed 1) and use the remaining rows for validation.
train_df = train_preprocessed.sample(frac=0.8, random_state =1)
val_df = train_preprocessed.drop(train_df.index)

In [None]:
# Encode labels: fit the encoder on the training labels only, then apply
# the same mapping to the validation labels.
le = LabelEncoder()
train_df['encoded_label'] = le.fit_transform(train_df['label'])
val_df['encoded_label'] = le.transform(val_df['label'])

In [None]:
# Build a position -> original label mapping from the distinct
# (label, encoded_label) pairs of the validation set, ordered by encoded_label.
# NOTE(review): keys are row positions after reset_index; they equal the encoded
# ids only if every encoded label occurs in val_df — confirm.
label_dict = (val_df[['label','encoded_label']].drop_duplicates()
              .sort_values(by='encoded_label')
              .reset_index(drop=True)['label']
              .to_dict())

# Print the mapping for inspection.
for index, key in label_dict.items():
    print(index, key)

In [None]:
from kobert_transformers.tokenization_kobert import KoBertTokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/distilkobert') # same for monologg/distilkobert

# Load the tokenizer (replaces the one loaded just above).
# NOTE(review): MODEL_NAME is not defined in the visible cells — confirm where it is set.
tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME) # Loading the tokenizer

In [None]:
# Raw text inputs and encoded integer targets for the train/validation splits.
x_train = train_df['document']
y_train = train_df['encoded_label']
x_val = val_df['document']
y_val = val_df['encoded_label']

In [None]:
# Find the largest token count over the training and validation texts.
# (The training texts were previously listed twice, which only doubled the work.)
max_val = 0
for sent in (x_train.tolist() + x_val.tolist()):
    try:
        sent_tok_len = len(tokenizer.tokenize(sent))
    except Exception:
        # Skip entries the tokenizer cannot process; Exception rather than a
        # bare except so that Ctrl-C still interrupts the scan.
        continue
    max_val = max(max_val, sent_tok_len)

print(f"The maximum amount of tokens in the dataset is {max_val}")

In [None]:
# Fixed sequence length used by tokenize() and by the model's input layers.
MAX_LENGTH = 27
from kobert_transformers.tokenization_kobert import KoBertTokenizer
# Reload the tokenizer with the length/padding settings.
# NOTE(review): MODEL_NAME is not defined in the visible cells — confirm.
tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME,                                              
                                            add_special_tokens=True,
                                            max_length=MAX_LENGTH, 
                                            pad_to_max_length=True) # same for monologg/distilkobert

def tokenize(sentences, tokenizer):
    """Encode ``sentences`` into fixed-length id and attention-mask arrays.

    Every sentence is padded/truncated to the module-level ``MAX_LENGTH``.

    Args:
        sentences: iterable of input texts.
        tokenizer: tokenizer providing ``encode_plus``.

    Returns:
        ``(input_ids, input_masks)``: two int32 arrays with one row per sentence.
    """
    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        # Token-type ids were collected before but never returned, so they
        # are no longer requested. padding="max_length" replaces the
        # deprecated pad_to_max_length=True.
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LENGTH,
                                       padding="max_length",
                                       truncation=True,
                                       return_attention_mask=True,
                                       return_token_type_ids=False)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')

In [None]:
# Tokenize the train, test and validation texts.
X_train = tokenize(x_train, tokenizer)
# The test data loaded in this section is `test_preprocessed`; the previous
# `test_df` is not defined in the visible cells.
x_test = test_preprocessed['document']
X_test = tokenize(x_test, tokenizer)
X_val = tokenize(x_val, tokenizer)

In [None]:
# Show how one training sample is tokenized: its label, its raw text, and
# each token id next to the token it stands for.
index_num = 10

print(train_df.iloc[index_num]['label'], '\n')

print(x_train.iloc[index_num], '\n')
temp_tokens = tokenizer.tokenize(x_train.iloc[index_num])
temp_ids = tokenizer.encode(x_train.iloc[index_num])

print('ID\'s', 'Input Tokens', sep='\t')
for i in range(len(temp_ids)):
    # First id is shown as [CLS], last id as [SEP]; ids in between map to
    # temp_tokens shifted by one.
    print(
        temp_ids[i],
        '[CLS]' if i == 0 else '[SEP]' if i == len(temp_ids)-1 else temp_tokens[i-1],
        sep='\t',
    )

In [None]:
# NOTE(review): loads the "distilbert-base-uncased" checkpoint; `model` is re-bound
# to the Keras model built in a later cell, so this load looks unused — confirm.
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

In [None]:
# Load the DistilBERT encoder from MODEL_NAME, configured to also output
# hidden states and attentions.
# NOTE(review): MODEL_NAME is not defined in the visible cells — confirm.
config = DistilBertConfig.from_pretrained(MODEL_NAME, output_hidden_states=True, output_attentions=True)
DistilBERT = TFDistilBertModel.from_pretrained(MODEL_NAME, config=config,from_pt=True)

In [None]:
# Inputs: token ids and attention mask, each of length MAX_LENGTH.
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH,), name='masked_token', dtype='int32') 

# DistilBERT token embeddings -> BiLSTM -> global max pooling -> dense head.
embedding_layer = DistilBERT(input_ids = input_ids_in, attention_mask = input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(1024, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
# Output size comes from the fitted LabelEncoder rather than a hard-coded
# 169, so it always matches the labels that survived the filtering.
X = tf.keras.layers.Dense(len(le.classes_), activation='softmax')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

# Optional: freeze the first layers.
# for layer in model.layers[:3]:
#     layer.trainable = False

model.summary()

In [None]:
# Save the weights after every epoch as <output_dir>/weights.<epoch, 2 digits>.hdf5.
# NOTE(review): output_dir is not defined in the visible cells — confirm.
model_checkpoint = ModelCheckpoint(filepath=output_dir+'/weights.{epoch:02d}.hdf5',
                                  save_weights_only=True)

early_stopping = EarlyStopping(patience=3, # Stop after 3 epochs of no improvement
                               monitor='val_loss', # Look at validation_loss
                               min_delta=0, # After 0 change
                               mode='min', # Stop when quantity has stopped decreasing
                               restore_best_weights=False, # Don't Restore the best weights
                               verbose=1) 

reduce_lr = ReduceLROnPlateau(monitor='val_loss', # Look at validation loss
                              min_lr=0.000001, # Lower bound of learning rate
                              patience=1, # Reduce after 1 with little change
                              mode='min', # Stop when quantity has stopped decreasing
                              factor=0.1, # Reduce by a factor of 1/10
                              min_delta=0.01, # Minimum change needed
                              verbose=1)

In [None]:
# Integer targets + softmax output -> sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 3 epochs (batch size 16) with checkpointing, early stopping and
# learning-rate reduction; the returned History is used for plotting below.
history = model.fit(X_train, 
                    y_train, 
                    epochs = 3,
                    batch_size=16, 
                    validation_data=(X_val, y_val), 
                    callbacks=[model_checkpoint, early_stopping, reduce_lr])

In [None]:
def plot_history(history):
    """Plot training/validation accuracy and loss side by side.

    Also prints the 1-based epoch with the lowest validation loss and the
    one with the highest validation accuracy.

    Args:
        history: object whose ``history`` dict holds the 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series.
    """
    record = history.history
    acc, val_acc = record['accuracy'], record['val_accuracy']
    loss, val_loss = record['loss'], record['val_loss']
    epochs = range(1, len(acc) + 1)

    # One tuple per subplot: position, training series, validation series,
    # the two legend labels and the title.
    panels = (
        (1, acc, val_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (2, loss, val_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_series, val_series, train_label, val_label, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_series, 'b', label=train_label)
        plt.plot(epochs, val_series, 'r', label=val_label)
        plt.title(title)
        plt.legend()

    print("Lowest Validation Loss: epoch {}".format(np.argmin(val_loss)+1))
    print("Highest Validation Accuracy: epoch {}".format(np.argmax(val_acc)+1))

plot_history(history)

In [None]:
def get_min_val_loss_epoch(history):
    """Return the 1-based epoch with the lowest 'val_loss', zero-padded to two digits.

    The padding matches the ``{epoch:02d}`` pattern used in the checkpoint
    file names ("03", "10"); the previous ``"0" + str(n)`` gave "010" for
    epoch 10.
    """
    return "{:02d}".format(int(np.argmin(history.history['val_loss'])) + 1)

def get_max_val_acc_epoch(history):
    """Return the 1-based epoch with the highest 'val_accuracy', zero-padded to two digits."""
    return "{:02d}".format(int(np.argmax(history.history['val_accuracy'])) + 1)

In [None]:
# Reload the checkpoint saved at the epoch with the highest validation accuracy.
epoch_num = get_max_val_acc_epoch(history)
model.load_weights(output_dir+"/weights."+epoch_num+".hdf5") # Load in model weights

In [None]:
y_test_probs = model.predict(X_test)

# Turn each row of class probabilities into an integer class prediction.
y_hat = [np.argmax(prob) for prob in y_test_probs]

In [None]:
# Report accuracy and a classification summary for the predictions.
# NOTE(review): neither y_test nor print_cf1 is defined in the visible cells — confirm.
print("Accuracy:", accuracy_score(y_test, y_hat))
print_cf1(y_test, y_hat)

### FastText
- BERT가 아닌 임베딩 모델인 FastText를 사용한 예측 진행
- FastText와 딥러닝 모델을 함께 이용한 예측도 진행
- 예측성능이 KoBERT 모델보다 높지 않기에 KoBERT을 최종모델로 채택

In [None]:
!pip install fasttext
import fasttext

In [None]:
# Load the preprocessed train/test CSV files from the mounted Drive folder
import pandas as pd
train_preprocessed = pd.read_csv("/content/MyDrive/MyDrive/test/train_preprocessed_ver2_no_lower_no_ques.csv")
test_preprocessed = pd.read_csv("/content/MyDrive/MyDrive/test/test_preprocessed_ver2_no_lower_no_ques.csv")

In [None]:
# Save the frame as tab-separated text for fastText, then read it back.
# NOTE(review): fastText supervised training presumably needs "__label__"-prefixed
# labels on each line; confirm the exported columns (and header row) match.
train_preprocessed.to_csv('df2.txt', sep = '\t', index = False)
labeling = pd.read_csv("/content/df2.txt", sep = '\t')

In [None]:
# Train a supervised fastText classifier on the exported file
# (word n-grams up to 3, 15 epochs, learning rate 0.35).
model = fasttext.train_supervised("/content/df2.txt", wordNgrams=3, epoch=15, lr=0.35,verbose=0)

In [None]:
# Predict the label(s) of every test document, keeping only predictions with
# probability >= 0.2. The test frame loaded in this section is
# `test_preprocessed`; the previous `test_df` is not defined in the visible cells.
predictions = []
for line in test_preprocessed['document']:
    pred_label = model.predict(line, threshold=0.2)[0]
    predictions.append(pred_label)

In [None]:
# Reduce each prediction to its first label with every "_" and "label"
# removed; a document with no prediction above the threshold gets "".
# The explicit length check replaces a bare except that hid every error.
anslist = [
    labels[0].replace('_','').replace('label','') if len(labels) > 0 else ""
    for labels in predictions
]

#### 자모 단위 임베딩

In [None]:
from jamo import h2j, j2hcj
# Quick check of the jamo conversion on a short sample string.
sample_text = "가나다한글"
j2hcj(h2j(sample_text))

In [None]:
import fasttext
def word2jamo(sen):
    """Return ``sen`` passed through ``h2j`` and then ``j2hcj`` (jamo decomposition)."""
    decomposed = h2j(sen)
    return j2hcj(decomposed)

In [None]:
# Decompose every training document into jamo and write one document per
# line as the unsupervised fastText corpus. header=False keeps the column
# name "jamo" from being written as the first line of the corpus.
train_preprocessed['jamo'] = train_preprocessed['document'].apply(word2jamo)
train_preprocessed['jamo'].to_csv('fasttext_embedding_ver0.1_corpus.txt', sep = '\t', index = False, header = False)

In [None]:
# Train skip-gram fastText embeddings on the jamo corpus (50 epochs,
# learning rate 0.05) and save the model to disk.
model = fasttext.train_unsupervised("fasttext_embedding_ver0.1_corpus.txt", wordNgrams=3,model='skipgram', epoch=50, lr=0.05,verbose=0)
model.save_model("fasttext_ver0.1.bin")

In [None]:
# Sentence vector for every training document (jamo form).
vec_list_train = [model.get_sentence_vector(text) for text in train_preprocessed['jamo']]

# The 'jamo' column was only created for the training frame, so build it
# for the test frame here before embedding it.
test_preprocessed['jamo'] = test_preprocessed['document'].apply(word2jamo)
vec_list_test = [model.get_sentence_vector(text) for text in test_preprocessed['jamo']]

In [None]:
# Attach the sentence vectors, then drop labels with too few samples.
# NOTE(review): the filter keeps labels with MORE than 230 documents, so a label
# with exactly 230 is dropped here, unlike the "< 230" filter used in other
# sections — confirm which threshold is intended.
train_preprocessed['emb_vec'] = vec_list_train
test_preprocessed['emb_vec'] = vec_list_test

countdf = train_preprocessed.groupby('label').count().sort_values(by='document')
over230labels = countdf[countdf['document']>230].index.to_list()
train_preprocessed = train_preprocessed[train_preprocessed['label'].isin(over230labels)]

In [None]:
# Stack the per-row vectors and labels into arrays and cache them as .npy files.
X_train = np.array(train_preprocessed['emb_vec'].to_list())
y_train = np.array(train_preprocessed['label'].to_list())
X_test = np.array(test_preprocessed['emb_vec'].to_list())

np.save('X_train_ver0.1.npy', X_train)
np.save('X_test_ver0.1.npy', X_test)
np.save('y_train_ver0.1.npy', y_train)

In [None]:
# Start of the deep-learning models: reload the cached training arrays.
X_train = np.load('X_train_ver0.1.npy',allow_pickle=True)
y_train = np.load('y_train_ver0.1.npy',allow_pickle=True)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Encode the labels as integers, then hold out 20% for validation.
# Note: y_train is re-bound from the raw label array to the encoded training split.
encoder=LabelEncoder()
encoder.fit(y_train)
y_train_digit = encoder.transform(y_train)
x_train, x_val, y_train, y_val = train_test_split(X_train,y_train_digit, 
                                                    test_size=0.2,  
                                                    random_state=1004)

In [None]:
#from tensorflow.keras.utils import plot_model

# model1: 100-dimensional input -> two 1024-unit ReLU layers with 20% dropout
# -> 168-way softmax.
# NOTE(review): 168 is hard-coded; presumably the number of labels left after
# filtering — confirm it equals len(encoder.classes_).
model1 = Sequential(name = 'model1')
model1.add(Dense(100, activation='relu', input_dim=(100)))
model1.add(Dense(1024, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(1024, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(168, activation='softmax'))
model1.summary()
#plot_model(model, show_shapes=True)

In [None]:
# Integer targets + softmax output -> sparse categorical cross-entropy.
model1.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
from tensorflow import keras
# Stop once val_loss has not improved for 4 epochs and restore the best weights.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=4,restore_best_weights=True)
# `callbacks` is documented as a list of callbacks, so wrap the single one.
history = model1.fit(x_train,y_train,
                    epochs=100,
                    batch_size=100,
                    validation_data=(x_val,y_val),
                    callbacks=[early_stop])

In [None]:
# model2: same layout as model1 but with 512-unit hidden layers and 25% dropout.
# The model name was copy-pasted as 'model1'; it is now 'model2'.
model2 = Sequential(name = 'model2')
model2.add(Dense(100, activation='relu', input_dim=(100)))
model2.add(Dense(512, activation='relu'))
model2.add(Dropout(0.25))
model2.add(Dense(512, activation='relu'))
model2.add(Dropout(0.25))
model2.add(Dense(168, activation='softmax'))
model2.summary()

model2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved for 4 epochs and restore the best weights.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=4,restore_best_weights=True)
# `callbacks` is documented as a list of callbacks, so wrap the single one.
history_model2 = model2.fit(x_train,y_train,
                    epochs=100,
                    batch_size=100,
                    validation_data=(x_val,y_val),
                    callbacks=[early_stop])

In [None]:
# model3: three 256-unit hidden layers with 25% dropout each.
model3 = Sequential(name = 'model3')
model3.add(Dense(100, activation='relu', input_dim=(100)))
model3.add(Dense(256, activation='relu'))
model3.add(Dropout(0.25))
model3.add(Dense(256, activation='relu'))
model3.add(Dropout(0.25))
model3.add(Dense(256, activation='relu'))
model3.add(Dropout(0.25))
model3.add(Dense(168, activation='softmax'))
model3.summary()

model3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved for 4 epochs and restore the best weights.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=4,restore_best_weights=True)
# `callbacks` is documented as a list of callbacks, so wrap the single one.
history_model3 = model3.fit(x_train,y_train,
                    epochs=100,
                    batch_size=100,
                    validation_data=(x_val,y_val),
                    callbacks=[early_stop])

In [None]:
# Compile and train model5 with the same settings as the models above.
# NOTE(review): model5 is not defined in the visible cells (its definition, and
# any model4, appear to be missing) — confirm before running this cell.
model5.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=4,restore_best_weights=True)
history_model5 = model5.fit(x_train,y_train,
                    epochs=100,
                    batch_size=100,
                    validation_data=(x_val,y_val),
                    callbacks=early_stop)