In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
import sys
sys.path.append('C:/Users/User/Desktop/r8')
from data_preprocessing import *
from transformer_build import  *
from sklearn.metrics import f1_score

In [2]:
train_df = pd.read_csv('C:/Users/User/Desktop/r8/r8_dataset/r8-train-stemmed.csv')
valid_df = pd.read_csv('C:/Users/User/Desktop/r8/r8_dataset/r8-dev-stemmed.csv')
test_df = pd.read_csv('C:/Users/User/Desktop/r8/r8_dataset/r8-test-stemmed.csv')
train_df.head()

Unnamed: 0,text,edge,intent
0,champion product approv stock split champion p...,champion product approv stock split champion p...,earn
1,comput termin system cpml complet sale comput ...,comput termin system cpml complet sale comput ...,acq
2,cobanco inc cbco year net shr ct dlr net asset...,cobanco inc cbco year net shr ct dlr net asset...,earn
3,intern inc qtr jan oper shr loss two ct profit...,intern inc qtr jan oper shr loss two ct profit...,earn
4,brown forman inc bfd qtr net shr dlr ct net ml...,brown forman inc bfd qtr net shr dlr ct net ml...,earn


text와 edge는 동일한 것으로 판단되므로 text만 활용하여 모델 학습

In [3]:
# train_df와 valid_df 결합
train_val_df = pd.concat([train_df, valid_df])

# 전처리

In [4]:
train_val_df['text']=train_val_df['text'].apply(clean_text)
test_df['text']=test_df['text'].apply(clean_text)

In [5]:
x_train=list(train_val_df['text'])
y_train=list(train_val_df['intent'])
x_test=list(test_df['text'])
y_test=list(test_df['intent'])

In [6]:
y=y_train+y_test

In [7]:
encoder=LabelEncoder()
encoder.fit(y)
label=encoder.transform(y)

y_train=list(label[:5484])
y_test=list(label[5484:])

In [8]:
sos_x_train=[]
sos_x_test=[]
for sen in x_train:
    sos_x_train.append('<sos> '+sen)
for sen in x_test:
    sos_x_test.append('<sos> '+sen)

all_txt=sos_x_train+sos_x_test


tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_txt)

vocab_size =len(tokenizer.word_index)+1 #1을 더해야 에러가 안터짐 토큰 영향으로 보임

x_train_encoded = tokenizer.texts_to_sequences(sos_x_train)
x_test_encoded = tokenizer.texts_to_sequences(sos_x_test)

max_len = 300

xtext_train = tf.keras.preprocessing.sequence.pad_sequences(x_train_encoded, maxlen=max_len)
xtext_test = tf.keras.preprocessing.sequence.pad_sequences(x_test_encoded, maxlen=max_len)

In [9]:
xtext_train.shape

(5484, 300)

In [10]:
y_train=np.array(y_train)
y_test=np.array(y_test)

In [11]:
x_train=xtext_train[:4937]
x_val=xtext_train[4937:]
y_train1=y_train[:4937]
y_val=y_train[4937:]

In [12]:
x_test=xtext_test
y_test1=y_test

# 모델 빌드

In [13]:
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "best_model_{epoch}.h5", save_best_only=False, save_freq=5
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=2, min_lr=0.0001
    ),
]

In [14]:
embedding_dim = 256  # 각 단어의 임베딩 벡터의 차원   #128 #256 #512 #1024
num_heads = 1  # 어텐션 헤드의 수
dff = 64 # 포지션 와이즈 피드 포워드 신경망의 은닉층의 크기 #32 #64 #128 #256
num_transformer_blocks = 1

inputs = tf.keras.layers.Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embedding_dim)
x = embedding_layer(inputs)

for _ in range(num_transformer_blocks):
    transformer_block = TransformerBlock(embedding_dim, num_heads, dff)
    x = transformer_block(x)

x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(256, activation="relu")(x)
# x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(8, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 300)]             0         
                                                                 
 token_and_position_embeddin  (None, 300, 256)         4528384   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 300, 256)         297280    
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 256)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 256)               0     

In [27]:
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(x_train, y_train1, batch_size=256,callbacks=callbacks, epochs=20, validation_data=(x_val, y_val))

print("테스트 정확도: %.4f" % (model.evaluate(x_test, y_test1)[1]))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
테스트 정확도: 0.9708


In [18]:
from keras.models import load_model
custom_objects = {"TokenAndPositionEmbedding": TokenAndPositionEmbedding, "TransformerBlock": TransformerBlock}
model = load_model('./save model/best_model_Transformer classifier.h5', custom_objects=custom_objects)
model.evaluate(x_test, y_test)



[0.11191032826900482, 0.9721333980560303]

In [19]:
y_pred = model.predict(x_test)
y_pred_labels = np.argmax(y_pred, axis=1)
macro_f1 = f1_score(y_test, y_pred_labels, average='macro')
print("Macro_F1 스코어:", macro_f1)
weighted_f1 = f1_score(y_test, y_pred_labels, average='weighted')
print("weighted_F1 스코어:", weighted_f1)

Macro_F1 스코어: 0.9421381911481157
weighted_F1 스코어: 0.9720590094045531


In [None]:
10에포크 dropout1 활성화 256 64
97.2