> https://www.analyticsvidhya.com/blog/2021/12/multiclass-classification-using-transformers/

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/AIFFELTHON/qna_for_git.csv', index_col=0)
# df = pd.read_csv('/content/drive/MyDrive/AIFFELTHON/AIFFELTHON_BBC/no_nan_qna_set.csv', index_col=0)
# df = pd.read_table('/content/drive/MyDrive/AIFFELTHON/AIFFELTHON_BBC/moonee_qna_set.txt', sep=',', index_col=0)
# df.head()

In [3]:
from sklearn.model_selection import train_test_split

# train_test_split
df_train, df_test = train_test_split(df,
                                     test_size=0.2,
                                     shuffle=True,
                                     random_state=930,
                                     stratify=df['sub_type'].values)

In [4]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [5]:
df_train['sub_type'].nunique(), df_test['sub_type'].nunique()

(21, 21)

In [6]:
# 정수 인코딩
sub_type_mapping = {
    'type_1': 0,
    'type_2': 1,
    'type_3': 2,
    'type_4': 3,
    'type_5': 4,
    'type_6': 5,
    'type_7': 6,
    'type_8': 7,
    'type_9': 8,
    'type_10': 9,
    'type_11': 10,
    'type_12': 11,
    'type_13': 12,
    'type_14': 13,
    'type_15': 14,
    'type_16': 15,
    'type_17': 16,
    'type_18': 17,
    'type_19': 18,
    'type_20': 19,
    'type_21': 20
}

func = lambda x: sub_type_mapping.get(x, x)

In [7]:
df_train['sub_type'] = df_train['sub_type'].map(func)
df_test['sub_type'] = df_test['sub_type'].map(func)

In [8]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(df_train['sub_type'])
y_test = to_categorical(df_test['sub_type'])

### **Transformers**

In [13]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 8.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13


In [14]:
# import transformers
# from transformers import AutoTokenizer, TFBertModel

# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# bert = TFBertModel.from_pretrained('bert-base-cased')

In [15]:
import transformers
from transformers import AutoTokenizer, TFBertModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
bert = TFBertModel.from_pretrained('bert-base-multilingual-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [16]:
tokenizer.tokenize('I have a new GPU!')

['I', 'have', 'a', 'new', 'GPU', '!']

In [17]:
tokenizer.tokenize('내비게이션이 잘 안 됩니다!')

['내', '##비', '##게', '##이션', '##이', '잘', '안', '[UNK]', '!']

In [18]:
x_train = tokenizer(
    text = df_train['inquiry'].tolist(),
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

x_test = tokenizer(
    text=df_test['inquiry'].tolist(),
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True
)

In [19]:
x_train

{'input_ids': <tf.Tensor: shape=(4668, 70), dtype=int32, numpy=
array([[   101,   9485,  18778, ...,      0,      0,      0],
       [   101,   9011,  29455, ...,      0,      0,      0],
       [   101,   9952,  10739, ...,      0,      0,      0],
       ...,
       [   101,   8996,  29455, ...,      0,      0,      0],
       [   101,   8888, 118617, ...,      0,      0,      0],
       [   101,   9952,  10739, ...,      0,      0,      0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(4668, 70), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [20]:
input_ids = x_train['input_ids']
attention_mask = x_train['attention_mask']

In [21]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [22]:
max_len = 70
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
embeddings = bert(input_ids,attention_mask = input_mask)[0] 
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
y = Dense(21, activation='softmax')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

In [23]:
optimizer = Adam(
    learning_rate=5e-05, # this learning rate is for bert model , taken from huggingface website 
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics
loss = CategoricalCrossentropy(from_logits=True)
metric = CategoricalAccuracy('balanced_accuracy')

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss, 
    metrics=metric)

In [24]:
train_history = model.fit(
    x = {'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
    y = y_train,
    validation_data = (
    {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']}, y_test
    ),
  epochs=10,
  batch_size=36
)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
predicted_raw = model.predict({'input_ids': x_test['input_ids'],
                               'attention_mask': x_test['attention_mask']})
predicted_raw[0]

array([4.0352042e-03, 9.6409810e-01, 2.6146099e-03, 2.8272078e-03,
       4.9295886e-03, 2.9866523e-04, 4.0709076e-04, 2.6036159e-03,
       1.7634881e-04, 1.1643064e-03, 3.8966158e-04, 1.3789826e-04,
       3.7649308e-05, 1.3310474e-03, 2.7158554e-03, 6.6746543e-03,
       4.0239482e-03, 1.1742614e-03, 1.8362976e-04, 4.2986809e-05,
       1.3375757e-04], dtype=float32)

In [26]:
import numpy as np

y_predicted = np.argmax(predicted_raw, axis=1)
y_true = df_test['sub_type']

In [27]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_predicted))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89       212
           1       0.74      0.87      0.80       135
           2       0.69      0.69      0.69       117
           3       0.88      0.94      0.91       134
           4       0.75      0.66      0.71       128
           5       0.79      0.73      0.76        78
           6       0.68      0.73      0.71        74
           7       0.60      0.59      0.59        70
           8       1.00      1.00      1.00         2
           9       0.61      0.53      0.57        36
          10       0.90      0.88      0.89        32
          11       0.91      0.82      0.86        38
          12       0.93      0.93      0.93        40
          13       0.00      0.00      0.00         2
          14       0.73      0.68      0.70        28
          15       0.94      0.89      0.91        18
          16       0.29      0.22      0.25         9
          17       0.58    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
sub_type_mapping_new = dict(zip(sub_type_mapping.values(), sub_type_mapping.keys()))
sub_type_mapping_new

{0: 'type_1',
 1: 'type_2',
 2: 'type_3',
 3: 'type_4',
 4: 'type_5',
 5: 'type_6',
 6: 'type_7',
 7: 'type_8',
 8: 'type_9',
 9: 'type_10',
 10: 'type_11',
 11: 'type_12',
 12: 'type_13',
 13: 'type_14',
 14: 'type_15',
 15: 'type_16',
 16: 'type_17',
 17: 'type_18',
 18: 'type_19',
 19: 'type_20',
 20: 'type_21'}

In [29]:
# sub_type_mapping_new = {v: k in k, v in sub_type_mapping.items()}

In [30]:
texts = input(str('input the text: '))

x_val = tokenizer(
    text=texts,
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose = True)

validation = model.predict({'input_ids': x_val['input_ids'],
                            'attention_mask': x_val['attention_mask']})*100

score = (-validation).argsort()
for i in score[0]:
    print(sub_type_mapping_new.get(i), '  ', validation[0][i])

# for key, value in zip(sub_type_mapping.get(score[0]), validation[0]):
#     print(key, value)

input the text: 내비게이션이 잘 안돼요
type_1    97.50724
type_7    0.68976057
type_16    0.6364491
type_9    0.24037969
type_5    0.18022537
type_3    0.14667031
type_15    0.12009635
type_8    0.07890173
type_21    0.078779064
type_6    0.052897703
type_19    0.049398333
type_18    0.045705687
type_10    0.045067154
type_14    0.040502552
type_4    0.04000027
type_20    0.019102007
type_11    0.00914818
type_13    0.006989299
type_2    0.005156693
type_17    0.00503224
type_12    0.0025036363
