> https://www.analyticsvidhya.com/blog/2021/12/multiclass-classification-using-transformers/

In [None]:
from google.colab import drive

# Mount Google Drive so files under MyDrive are readable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the Q&A dataset on the mounted Drive.
QNA_CSV_PATH = '/content/drive/MyDrive/AIFFELTHON/AIFFELTHON_BBC/no_nan_qna_set.csv'

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(QNA_CSV_PATH, index_col=0)

# Alternative source, kept for reference:
# df = pd.read_table('/content/drive/MyDrive/AIFFELTHON/AIFFELTHON_BBC/moonee_qna_set.txt', sep=',', index_col=0)
# df.head()

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 shuffled hold-out split with a fixed seed. Stratifying on `sub_type`
# keeps the class proportions the same in both parts.
stratify_labels = df['sub_type'].values
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=930,
    stratify=stratify_labels,
)

In [None]:
# Give both splits a fresh 0..n-1 index, discarding the shuffled original one.
df_train, df_test = (part.reset_index(drop=True) for part in (df_train, df_test))

In [None]:
# Number of distinct sub-types in each split, as (train, test).
tuple(part['sub_type'].nunique() for part in (df_train, df_test))

(21, 21)

In [None]:
# Sub-type names in label-id order: a name's position in this tuple is the
# integer id used as its class label, so the order must not change.
sub_type_names = (
    '내비게이션',
    '타이어',
    '라이트',
    '시동',
    '경고등',
    '차량외부',
    '차량내부',
    '주행관련',
    '사고조사',
    '단말기',
    '주유/충전카드',
    '후방카메라',
    '하이패스',
    '차량점검',
    '브레이크',
    '블랙박스',
    '위생문제',
    '주차장',
    'ADAS',
    '비치품',
    '충전기확인',
)

# name -> integer class id
sub_type_mapping = {name: label_id for label_id, name in enumerate(sub_type_names)}


def func(x):
    """Return the class id for sub-type name `x`.

    A value with no entry in `sub_type_mapping` is returned unchanged.
    """
    return sub_type_mapping.get(x, x)

In [None]:
# Replace the sub-type names with their integer ids in both splits.
# `func` returns unmapped values unchanged, so re-running this cell on
# already-converted columns leaves them as they are.
for frame in (df_train, df_test):
    frame['sub_type'] = frame['sub_type'].map(func)

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids.
# The width is pinned to the number of known sub-types. Left to inference,
# to_categorical uses (max id + 1) of whatever it is given, so a split that
# lacks the highest id would yield a narrower matrix than the other split
# and than the model's output layer.
num_classes = len(sub_type_mapping)
y_train = to_categorical(df_train['sub_type'], num_classes=num_classes)
y_test = to_categorical(df_test['sub_type'], num_classes=num_classes)

### **Transformers**

In [None]:
# ! pip install transformers

In [None]:
# import transformers
# from transformers import AutoTokenizer, TFBertModel

# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# bert = TFBertModel.from_pretrained('bert-base-cased')

In [None]:
import transformers
from transformers import AutoTokenizer, TFBertModel

# Tokenizer and encoder are loaded from the same checkpoint name so the
# vocabulary and the weights always belong together.
PRETRAINED_CHECKPOINT = 'bert-base-multilingual-cased'

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
bert = TFBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Sanity check: show how the loaded tokenizer splits an English sentence.
tokenizer.tokenize('I have a new GPU!')

['I', 'have', 'a', 'new', 'GPU', '!']

In [None]:
# Same check on a Korean inquiry, to see how it is broken into word pieces
# and whether any part falls outside the vocabulary ([UNK]).
tokenizer.tokenize('내비게이션이 잘 안 됩니다!')

['내', '##비', '##게', '##이션', '##이', '잘', '안', '[UNK]', '!']

In [None]:
# Fixed sequence length; must equal `max_len` used for the model's Input layers.
MAX_SEQ_LEN = 70


def encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Args:
        texts: list of raw inquiry strings.
        max_length: length every sequence is truncated or padded to.

    Returns:
        The tokenizer output holding `input_ids` and `attention_mask`
        (token type ids are not requested).
    """
    return tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # Pad to `max_length`, not to the longest text in this call
        # (`padding=True`). Otherwise a split whose longest text is shorter
        # than `max_length` gets narrower tensors than the model expects.
        # The interactive prediction cell pads the same way.
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )


x_train = encode_texts(df_train['inquiry'].tolist())
x_test = encode_texts(df_test['inquiry'].tolist())

In [None]:
# Inspect the encoded training set: a mapping with 'input_ids' and
# 'attention_mask' tensors.
x_train

{'input_ids': <tf.Tensor: shape=(4668, 70), dtype=int32, numpy=
array([[   101,   9011,  29455, ...,      0,      0,      0],
       [   101,   9689,  42815, ...,      0,      0,      0],
       [   101,   8996,  29455, ...,      0,      0,      0],
       ...,
       [   101,   9378, 118855, ...,      0,      0,      0],
       [   101,   9665, 119215, ...,      0,      0,      0],
       [   101,  10003,  42337, ...,      0,      0,      0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(4668, 70), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [None]:
# Convenience aliases for the two encoded training tensors.
# NOTE: `input_ids` is rebound to a Keras Input in the model-definition cell.
input_ids, attention_mask = x_train['input_ids'], x_train['attention_mask']

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [None]:
# Build the classifier: BERT encoder -> global max-pool -> small dense head
# -> softmax over the sub-type classes.
# `max_len` must equal the padded length of the tokenized inputs passed to
# fit()/predict().
max_len = 70
# NOTE: this rebinds `input_ids` (earlier an alias for x_train['input_ids'])
# to a symbolic Keras Input. The Input names match the keys of the feature
# dicts passed to fit()/predict().
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# Element 0 of the BERT output is presumably the per-token hidden states,
# (batch, max_len, hidden) — TODO confirm against the TFBertModel docs.
embeddings = bert(input_ids,attention_mask = input_mask)[0] 
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
# 21 output units, one per entry in sub_type_mapping. The softmax here means
# the model emits probabilities, not logits.
y = Dense(21, activation='softmax')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# layers[2] is presumably the BERT encoder (after the two Input layers);
# verify with model.summary(). Marking it trainable fine-tunes it with the head.
model.layers[2].trainable = True

In [None]:
# Optimizer for fine-tuning; gradients are clipped to norm 1.0.
optimizer = Adam(
    learning_rate=5e-05,  # this learning rate is for bert model , taken from huggingface website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics.
# The model's last layer already applies softmax, so its outputs are
# probabilities. The loss must therefore NOT treat them as logits
# (`from_logits=True` would contradict the model's output layer).
loss = CategoricalCrossentropy(from_logits=False)
# NOTE: despite its display name, this is plain categorical accuracy, not a
# class-balanced accuracy. The name is kept so existing log/history keys
# stay the same.
metric = CategoricalAccuracy('balanced_accuracy')

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric])

In [None]:
# Feature dicts keyed by the model's Input layer names.
feature_keys = ('input_ids', 'attention_mask')
train_inputs = {key: x_train[key] for key in feature_keys}
val_inputs = {key: x_test[key] for key in feature_keys}

# Train for 10 epochs; the held-out split is evaluated after each epoch.
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_test),
    epochs=10,
    batch_size=36,
)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Class probabilities for every held-out inquiry (one row per inquiry).
test_inputs = {
    'input_ids': x_test['input_ids'],
    'attention_mask': x_test['attention_mask'],
}
predicted_raw = model.predict(test_inputs)

# Show the probability vector of the first held-out inquiry.
predicted_raw[0]

array([7.7808851e-05, 1.0027929e-04, 2.1376344e-03, 2.8714119e-05,
       5.6288773e-05, 1.5432775e-05, 9.9444097e-01, 2.9482835e-04,
       2.9170975e-05, 6.6386085e-05, 2.3210675e-06, 4.1844053e-04,
       3.0256235e-06, 1.9044100e-04, 5.2989424e-05, 3.3378063e-04,
       1.0213920e-04, 1.1260177e-03, 4.0654367e-04, 2.4408899e-05,
       9.2423041e-05], dtype=float32)

In [None]:
import numpy as np

# Predicted class id = position of the largest probability in each row.
y_predicted = predicted_raw.argmax(axis=1)
# Ground-truth integer ids of the held-out split.
y_true = df_test['sub_type']

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_true, y_predicted)
print(report_text)

              precision    recall  f1-score   support

           0       0.90      0.93      0.92       212
           1       0.82      0.78      0.80       135
           2       0.68      0.56      0.62       117
           3       0.88      0.89      0.88       134
           4       0.61      0.79      0.69       128
           5       0.66      0.63      0.64        78
           6       0.64      0.73      0.68        74
           7       0.60      0.59      0.59        70
           8       0.00      0.00      0.00         2
           9       0.41      0.39      0.40        36
          10       0.85      0.88      0.86        32
          11       0.89      0.82      0.85        38
          12       0.95      0.90      0.92        40
          13       0.00      0.00      0.00         2
          14       0.75      0.75      0.75        28
          15       0.94      0.83      0.88        18
          16       0.00      0.00      0.00         9
          17       0.73    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Inverse lookup (integer class id -> sub-type name) for readable predictions.
sub_type_mapping_new = {label_id: name for name, label_id in sub_type_mapping.items()}
sub_type_mapping_new

{0: '내비게이션',
 1: '타이어',
 2: '라이트',
 3: '시동',
 4: '경고등',
 5: '차량외부',
 6: '차량내부',
 7: '주행관련',
 8: '사고조사',
 9: '단말기',
 10: '주유/충전카드',
 11: '후방카메라',
 12: '하이패스',
 13: '차량점검',
 14: '브레이크',
 15: '블랙박스',
 16: '위생문제',
 17: '주차장',
 18: 'ADAS',
 19: '비치품',
 20: '충전기확인'}

In [None]:
# sub_type_mapping_new = {v: k for k, v in sub_type_mapping.items()}

In [None]:
# Ask for one inquiry, then list every sub-type with its predicted
# probability (as a percentage), most likely first.
texts = input('input the text: ')

# Encode the single inquiry at the fixed length the model expects.
x_val = tokenizer(
    text=texts,
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True,
)

val_inputs = {
    'input_ids': x_val['input_ids'],
    'attention_mask': x_val['attention_mask'],
}
# Scale the probabilities to percentages.
validation = model.predict(val_inputs) * 100

# Negating before argsort gives class ids in descending-probability order.
score = (-validation).argsort()
for class_id in score[0]:
    label = sub_type_mapping_new.get(class_id)
    print(label, '  ', validation[0][class_id])

input the text: 내비게이션이 잘 안 돼요.
내비게이션    98.555435
후방카메라    0.49746093
차량내부    0.45252427
위생문제    0.16070208
비치품    0.0998362
시동    0.039750934
충전기확인    0.034468334
브레이크    0.03296155
주유/충전카드    0.030996986
ADAS    0.027660912
타이어    0.017348079
블랙박스    0.016026167
라이트    0.014712976
주행관련    0.009273067
차량외부    0.0030279213
단말기    0.0028419497
주차장    0.0026300112
하이패스    0.00082677935
경고등    0.000777303
차량점검    0.00067206495
사고조사    6.271599e-05
