> https://www.analyticsvidhya.com/blog/2021/12/multiclass-classification-using-transformers/

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()

In [None]:
# Step 2: restart the runtime
import os
# Send signal 9 to the current process so it terminates immediately
# (presumably Colab then starts a fresh runtime — confirm).
os.kill(os.getpid(), 9)

In [None]:
# Step 3: configure matplotlib to use a Korean font
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# Work around the minus-sign rendering issue.
mpl.rcParams['axes.unicode_minus'] = False

# Make the Korean font's family the default for all plots.
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)

# `_rebuild` is a private matplotlib helper that newer releases no longer
# provide; fall back to the public API that registers the font file directly.
if hasattr(fm, '_rebuild'):
    fm._rebuild()
else:
    fm.fontManager.addfont(path)

---

In [None]:
# RNN experiment results: one row per training variant.
# pandas is imported here because this cell appears before the notebook's
# later `import pandas as pd`, so `pd` would otherwise be undefined.
import pandas as pd

new = pd.DataFrame({
    'Model': ['Baseline', 'Word2Vec', 'OverSampling', 'OverSampling+Word2Vec'],
    'train_acc': [0.9007, 0.8677, 0.9659, 0.8783],
    'train_loss': [0.3453, 0.4126, 0.1127, 0.3591],
    'val_acc': [0.7683, 0.6500, 0.9400, 0.8400],
    'val_loss': [1.1689, 1.6168, 0.2860, 0.6284],
    'test_acc': [0.7263, 0.6091, 0.9272, 0.8402],
    'test_loss': [1.3240, 1.8558, 0.2818, 0.6815],
})
new

Unnamed: 0,Model,train_acc,train_loss,val_acc,val_loss,test_acc,test_loss
0,Baseline,0.9007,0.3453,0.7683,1.1689,0.7263,1.324
1,Word2Vec,0.8677,0.4126,0.65,1.6168,0.6091,1.8558
2,OverSampling,0.9659,0.1127,0.94,0.286,0.9272,0.2818
3,OverSampling+Word2Vec,0.8783,0.3591,0.84,0.6284,0.8402,0.6815


In [None]:
# BERT experiment results: one tuple per model variant, in column order.
_bert_columns = ['Model', 'train_acc', 'train_loss', 'test_acc', 'test_loss']
_bert_rows = [
    ('bert-base', 0.0818, 3.0445, 0.0822, 3.0445),
    ('bert-uncased', 0.0167, 3.0445, 0.0017, 3.0445),
    ('bert-base-multilingual-cased', 0.1857, 2.9173, 0.1789, 3.1299),
    ('functional', 0.8869, 0.4181, 0.7586, 1.0214),
    ('function+OverSampling', 0.9511, 0.1989, 0.9370, 0.2668),
]
new = pd.DataFrame(_bert_rows, columns=_bert_columns)
new

Unnamed: 0,Model,train_acc,train_loss,test_acc,test_loss
0,bert-base,0.0818,3.0445,0.0822,3.0445
1,bert-uncased,0.0167,3.0445,0.0017,3.0445
2,bert-base-multilingual-cased,0.1857,2.9173,0.1789,3.1299
3,functional,0.8869,0.4181,0.7586,1.0214
4,function+OverSampling,0.9511,0.1989,0.937,0.2668


In [None]:
# Transformer experiment results: one tuple per variant, in column order.
_transformer_rows = [
    ('bert-case', 0.0800, 0.0069),
    ('Okt', 0.0827, 0.0091),
    ('Okt+Word2Vec', 0.0827, 0.0087),
    ('Stopwords', 0.1318, 0.0111),
    ('단순 복제', 0.1887, 0.0074),
    ('OverSampling', 0.1921, 0.0053),
]
new = pd.DataFrame(_transformer_rows, columns=['Model', 'train_acc', 'train_loss'])
new

### **데이터셋 가져오기**

In [None]:
import pandas as pd

# Load the Q&A dataset from Drive; the CSV's first column becomes the index.
DATA_PATH = '/content/drive/MyDrive/AIFFELTHON/AIFFELTHON_BBC/no_nan_qna_set.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
# Alternative source, kept for reference:
# df = pd.read_table('/content/drive/MyDrive/AIFFELTHON/AIFFELTHON_BBC/moonee_qna_set.txt', sep=',', index_col=0)
# df.head()

In [None]:
# Collect every inquiry text into a plain Python list.
questions = df['inquiry'].tolist()
# questions

In [None]:
# Print how many inquiries were collected.
print(len(questions))

5836


In [None]:
# Count the inquiries that contain '인입'. No separate test for '재인입' is
# needed: any text containing '재인입' necessarily contains '인입' as well.
cnt = sum('인입' in sentence for sentence in questions)

print(cnt)

2064


In [None]:
# Count the inquiries that contain the substring '문의'.
cnt = sum(1 for sentence in questions if '문의' in sentence)

print(cnt)

837


### **오버샘플링 적용**
> 해당 데이터는 `class imbalance`의 문제가 심함
> - 제일 많은 `내비게이션` 데이터는 1058개, 제일 적은 `충전기확인` 데이터는 4개뿐임
> - **RandomOverSampling**을 이용해 데이터를 복제해 최대 개수인 1058개에 맞추자.

In [None]:
# Number of rows per sub_type label (before oversampling).
df['sub_type'].value_counts()

내비게이션      1058
타이어         675
시동          672
경고등         642
라이트         583
차량외부        389
차량내부        368
주행관련        348
하이패스        199
후방카메라       190
단말기         180
주유/충전카드     158
브레이크        142
블랙박스         90
주차장          48
위생문제         46
차량점검         13
사고조사         11
비치품          10
ADAS         10
충전기확인         4
Name: sub_type, dtype: int64

In [None]:
# Horizontal bar chart of the per-label row counts, sorted ascending.
df['sub_type'].value_counts(ascending=True).plot.barh(figsize=(15, 8), fontsize=15, color='lightblue')

In [None]:
# Separate the target label column from the remaining feature columns.
y = df['sub_type']
X = df.drop(columns='sub_type')

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

# Create the sampler with a fixed seed so the resampled dataset is identical
# on every run (same seed value the train/test split uses).
method = RandomOverSampler(random_state=930)

# Apply random oversampling to the features and labels.
# NOTE(review): resampling happens before the train/test split, so copies of
# the same inquiry can land in both sets and inflate the test metrics —
# consider splitting first and oversampling only the training part.
X, y = method.fit_resample(X, y)

X = pd.DataFrame(X, columns=X.columns)
y = pd.Series(y)

In [None]:
df = pd.concat([X, y], axis=1) # join the DataFrame (X) and the Series (y) column-wise

In [None]:
# Number of rows per sub_type label after oversampling.
df['sub_type'].value_counts()

브레이크       1058
타이어        1058
ADAS       1058
충전기확인      1058
주차장        1058
비치품        1058
위생문제       1058
차량점검       1058
차량외부       1058
단말기        1058
후방카메라      1058
경고등        1058
하이패스       1058
주유/충전카드    1058
내비게이션      1058
블랙박스       1058
차량내부       1058
주행관련       1058
라이트        1058
시동         1058
사고조사       1058
Name: sub_type, dtype: int64

In [None]:
# Horizontal bar chart of the per-label row counts after oversampling, sorted ascending.
df['sub_type'].value_counts(ascending=True).plot.barh(figsize=(15, 8), fontsize=15, color='lightblue')

In [None]:
# df[df['sub_type'] == '충전기확인']

### **train, test set 분리**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, shuffled with a fixed seed and
# stratified on the label so both parts keep the same class proportions.
_stratify_labels = df['sub_type'].values
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=930,
    stratify=_stratify_labels,
)

In [None]:
# Renumber both splits from 0, discarding the old index values.
df_train, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_test)
)

In [None]:
# Number of distinct labels present in the train and test splits.
df_train['sub_type'].nunique(), df_test['sub_type'].nunique()

(21, 21)

### **sub_type을 categorical data로**

In [None]:
# 정수 인코딩
sub_type_mapping = {
    '내비게이션': 0,
    '타이어': 1,
    '라이트': 2,
    '시동': 3,
    '경고등': 4,
    '차량외부': 5,
    '차량내부': 6,
    '주행관련': 7,
    '사고조사': 8,
    '단말기': 9,
    '주유/충전카드': 10,
    '후방카메라': 11,
    '하이패스': 12,
    '차량점검': 13,
    '브레이크': 14,
    '블랙박스': 15,
    '위생문제': 16,
    '주차장': 17,
    'ADAS': 18,
    '비치품': 19,
    '충전기확인': 20
}

func = lambda x: sub_type_mapping.get(x, x)

In [None]:
# Replace the text labels with their integer codes in both splits.
for _frame in (df_train, df_test):
    _frame['sub_type'] = _frame['sub_type'].map(func)

In [None]:
# Convert the integer targets to one-hot (categorical) form.
from tensorflow.keras.utils import to_categorical

# Fix the one-hot width to the number of known labels so both arrays have the
# same number of columns even if a split lacks the highest-numbered class.
num_classes = len(sub_type_mapping)
y_train = to_categorical(df_train['sub_type'], num_classes=num_classes)
y_test = to_categorical(df_test['sub_type'], num_classes=num_classes)

### **트랜스포머**

In [None]:
# Shell escape: install the transformers package into the runtime.
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

> - `AutoTokenizer`: 다양한 타입의 tokenizer 포함
> - `TFBertModel`: 텐서플로우에서 사용하는 pre-trained된 Bert
>> 한국어를 사용하기 위해 100개 이상의 언어로 사전학습된 **`bert-base-multilingual-cased`** BERT를 이용하자.

In [None]:
# import transformers
# from transformers import AutoTokenizer, TFBertModel

# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# bert = TFBertModel.from_pretrained('bert-base-cased')

In [None]:
import transformers
from transformers import AutoTokenizer, TFBertModel

# Load the tokenizer and the TensorFlow BERT model from the same checkpoint.
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert = TFBertModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Show how the tokenizer splits an English sentence.
tokenizer.tokenize('I have a new GPU!')

['I', 'have', 'a', 'new', 'GPU', '!']

In [None]:
# Show how the tokenizer splits a Korean sentence.
tokenizer.tokenize('내비게이션이 잘 안 됩니다!')

['내', '##비', '##게', '##이션', '##이', '잘', '안', '[UNK]', '!']

### **Input Data Modeling**
> Input 텍스트 데이터를 tokenizer를 사용해 Bert의 input 형식으로 맞추자.

In [None]:
def _encode(texts):
    """Tokenize *texts* into ``input_ids`` and ``attention_mask`` TF tensors.

    Every sequence is truncated or padded to exactly 70 tokens. Padding to
    ``'max_length'`` (instead of to the longest sequence in the call) keeps
    the width fixed, so the train and test tensors always match each other
    and a model that expects 70-token inputs.
    """
    return tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=70,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )


x_train = _encode(df_train['inquiry'].tolist())
x_test = _encode(df_test['inquiry'].tolist())

In [None]:
# Display the encoded training inputs.
x_train

{'input_ids': <tf.Tensor: shape=(17774, 70), dtype=int32, numpy=
array([[   101,   9730,  44321, ...,      0,      0,      0],
       [   101,   9359, 118900, ...,      0,      0,      0],
       [   101,   9560, 119045, ...,      0,      0,      0],
       ...,
       [   101,   9730,  44321, ...,      0,      0,      0],
       [   101,   9952,  10739, ...,      0,      0,      0],
       [   101,   9428,  35866, ...,      0,      0,      0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(17774, 70), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [None]:
# Convenience names for the two encoded training tensors.
input_ids = x_train['input_ids']
attention_mask = x_train['attention_mask']

### **Model Building**
> functional API 사용

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [None]:
# Fixed length of the model's token inputs.
max_len = 70

# Two integer inputs whose names match the keys of the dicts passed to the model.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# Element 0 of the BERT output (presumably the per-token hidden states — confirm against the TFBertModel docs).
embeddings = bert(input_ids,attention_mask = input_mask)[0] 
# Global max pooling, then a small dense head with dropout.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
# 21-way softmax output.
y = Dense(21, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# NOTE(review): index 2 is presumably the BERT layer, marked trainable here — confirm the index.
model.layers[2].trainable = True

In [None]:
optimizer = Adam(
    learning_rate=5e-05,  # learning rate for the BERT model, taken from the Hugging Face website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics.
# The model's final Dense layer already applies softmax, so the loss receives
# probabilities, not logits; from_logits=True would apply softmax a second time.
loss = CategoricalCrossentropy(from_logits=False)
# This is plain categorical accuracy; the reported name 'balanced_accuracy'
# is kept so the keys in the training history do not change.
metric = CategoricalAccuracy('balanced_accuracy')

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric])

> 2개의 input arrays, **input_ids, attention_mask**
> - input_ids: 우리의 input 토큰에 대해 vocab dic의 key 값을 출력
> - attention_mask: 각 위치가 실제 토큰(1)인지 패딩(0)인지 표시해줌
> - token_type_ids: 0번째 문장인지, 1번째 문장인지에 대해 출력 (여기서는 `return_token_type_ids=False`로 설정해 생성하지 않음)

In [None]:
# Train for 10 epochs with batches of 36; the test split doubles as validation data.
_input_keys = ('input_ids', 'attention_mask')
_train_inputs = {key: x_train[key] for key in _input_keys}
_val_inputs = {key: x_test[key] for key in _input_keys}

train_history = model.fit(
    x=_train_inputs,
    y=y_train,
    validation_data=(_val_inputs, y_test),
    epochs=10,
    batch_size=36,
)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Run the model on the whole test split and show the output for the first example.
_test_inputs = {key: x_test[key] for key in ('input_ids', 'attention_mask')}
predicted_raw = model.predict(_test_inputs)
predicted_raw[0]

array([2.6578953e-06, 3.4617221e-06, 5.6444274e-05, 6.1977044e-04,
       2.2349651e-07, 5.8148987e-07, 2.5998659e-06, 1.3595930e-04,
       8.3686127e-06, 1.9341365e-05, 1.7646222e-07, 2.7111396e-06,
       2.4794231e-06, 2.1048725e-05, 1.1075466e-08, 4.9025025e-06,
       5.3780799e-04, 9.9842715e-01, 1.4894789e-04, 1.2578395e-09,
       5.3160411e-06], dtype=float32)

In [None]:
import numpy as np

# Ground-truth integer labels, and the predicted class for each test row
# (index of the largest value in that row of the model output).
y_true = df_test['sub_type']
y_predicted = predicted_raw.argmax(axis=1)

In [None]:
from sklearn.metrics import classification_report

# Print the per-class classification report for the test predictions.
print(classification_report(y_true, y_predicted))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88       212
           1       0.92      0.81      0.86       212
           2       0.79      0.84      0.82       211
           3       0.90      0.90      0.90       212
           4       0.83      0.75      0.79       212
           5       0.93      0.91      0.92       211
           6       0.88      0.90      0.89       212
           7       0.90      0.92      0.91       212
           8       1.00      1.00      1.00       212
           9       0.90      0.91      0.90       211
          10       0.98      0.99      0.98       212
          11       0.95      0.92      0.94       212
          12       0.95      0.99      0.97       211
          13       1.00      1.00      1.00       211
          14       0.95      0.99      0.97       211
          15       0.98      1.00      0.99       212
          16       0.95      1.00      0.97       212
          17       0.97    

In [None]:
# Inverse lookup: integer code -> label text.
sub_type_mapping_new = {code: label for label, code in sub_type_mapping.items()}
sub_type_mapping_new

{0: '내비게이션',
 1: '타이어',
 2: '라이트',
 3: '시동',
 4: '경고등',
 5: '차량외부',
 6: '차량내부',
 7: '주행관련',
 8: '사고조사',
 9: '단말기',
 10: '주유/충전카드',
 11: '후방카메라',
 12: '하이패스',
 13: '차량점검',
 14: '브레이크',
 15: '블랙박스',
 16: '위생문제',
 17: '주차장',
 18: 'ADAS',
 19: '비치품',
 20: '충전기확인'}

In [None]:
# sub_type_mapping_new = {v: k in k, v in sub_type_mapping.items()}

In [None]:
# Interactive demo: classify each typed sentence until a single '.' is entered.
while True:
    texts = input('input the text: ')
    if texts == '.':
        break

    # Encode the sentence to a fixed length of 70 tokens.
    x_val = tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=70,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )
    encoded = {name: x_val[name] for name in ('input_ids', 'attention_mask')}

    # Scale the model outputs by 100 so they print as percentages.
    validation = model.predict(encoded) * 100

    # Class indices ordered from highest to lowest score; print label and score.
    score = (-validation).argsort()
    for class_idx in score[0]:
        print(sub_type_mapping_new.get(class_idx), '  ', validation[0][class_idx])

# for key, value in zip(sub_type_mapping.get(score[0]), validation[0]):
#     print(key, value)

input the text: 네비게이션이 잘 안 돼요.
내비게이션    96.68482
타이어    0.71299684
차량내부    0.6356766
후방카메라    0.5538289
블랙박스    0.38444978
하이패스    0.3435386
주행관련    0.20147695
단말기    0.20079741
위생문제    0.07876538
브레이크    0.06536599
충전기확인    0.042415857
차량외부    0.023407243
시동    0.015978178
경고등    0.012790054
비치품    0.012448683
주차장    0.008770052
사고조사    0.007034761
차량점검    0.0067003164
라이트    0.004806188
ADAS    0.0030468141
주유/충전카드    0.00089371734
input the text: 쏘카존이 너무 더러워요.
주차장    60.93767
주행관련    10.883636
시동    10.487886
라이트    8.0882435
단말기    4.5494127
위생문제    2.4949334
내비게이션    0.7682961
타이어    0.76462895
차량내부    0.23430623
하이패스    0.2018984
후방카메라    0.12793112
블랙박스    0.099898025
차량점검    0.09353551
브레이크    0.07156643
ADAS    0.059283998
차량외부    0.03958868
경고등    0.03547209
충전기확인    0.030640915
사고조사    0.02674087
주유/충전카드    0.0042753653
비치품    0.00014933466
input the text: .
