In [67]:
import tensorflow as tf

In [68]:
import pandas as pd
import numpy as np
import os
import json
import csv
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras import layers
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM, Dense, concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from keras.layers import TimeDistributed

# 사전 데이터 불러오기
word_to_id = pd.read_csv("word_to_id.csv", encoding='utf-8')
id_to_word = pd.read_csv("id_to_word.csv", encoding='utf-8')

# 정수화된 문장 데이터 불러오기
standard_data = pd.read_csv("standard_padding_data.csv", encoding='utf-8')
dialect_data = pd.read_csv("dialect_padding_data.csv", encoding='utf-8')
from gensim.models import Word2Vec

# 저장한 Word2Vec 모델 불러오기
dialect_word2vec = Word2Vec.load("dialect_word2vec.bin")
standard_word2vec = Word2Vec.load("standard_word2vec.bin")

In [69]:
dialect_word2vec

<gensim.models.word2vec.Word2Vec at 0x24f4283ef08>

In [70]:
standard_word2vec

<gensim.models.word2vec.Word2Vec at 0x24f427d2648>

In [None]:
# word_to_id

In [None]:
# standard_data

In [71]:
print(len(word_to_id))
print(len(id_to_word))
print(len(standard_data))
print(len(dialect_data))

108341
108341
331083
331083


In [72]:
standard_data.shape

(331083, 40)

In [73]:
dialect_data.shape

(331083, 40)

In [74]:
# 데이터 분리
X_train_dialect, X_test_dialect, y_train_standard, y_test_standard = train_test_split(dialect_data ,standard_data, test_size=0.2, random_state=42)

In [75]:
X_train_dialect.shape

(264866, 40)

In [76]:
X_test_dialect.shape

(66217, 40)

In [77]:
y_train_standard.shape

(264866, 40)

In [78]:
y_test_standard.shape

(66217, 40)

In [79]:
type(y_train_standard)

pandas.core.frame.DataFrame

In [87]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

dialect_embedding_matrix = dialect_word2vec.wv.vectors
standard_embedding_matrix = standard_word2vec.wv.vectors

vocab_size = len(word_to_id)

max_length = 40
input_layer = Input(shape=(max_length,), dtype='float32')

# 임베딩 레이어
embedding_layer = Embedding(input_dim=dialect_embedding_matrix.shape[0], output_dim=dialect_embedding_matrix.shape[1], weights=[dialect_embedding_matrix], input_length=max_length, trainable=True)(input_layer)

# CNN 레이어
conv_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_layer)
pooling_layer = MaxPooling1D(pool_size=2)(conv_layer)

# 과적합 방지를 위한 드롭아웃 레이어
dropout_layer = Dropout(0.2)(pooling_layer)

# LSTM 레이어
lstm_layer = LSTM(units=64, return_sequences=True)(dropout_layer)

# 완전 연결 레이어
output_layer = TimeDistributed(Dense(units=vocab_size, activation='softmax'))(lstm_layer)

# 모델 생성wd
model = Model(inputs=input_layer, outputs=output_layer)

# 모델 컴파일 
model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=['accuracy'])

# type(dialect_data)

model.summary()
early_stopping = EarlyStopping()

y_train_standard_array = y_train_standard.values
y_train_standard_padded = pad_sequences(y_train_standard_array, maxlen=max_length, padding='post')

Model: "model_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_35 (InputLayer)       [(None, 40)]              0         
                                                                 
 embedding_47 (Embedding)    (None, 40, 100)           10084400  
                                                                 
 conv1d_33 (Conv1D)          (None, 38, 64)            19264     
                                                                 
 max_pooling1d_33 (MaxPoolin  (None, 19, 64)           0         
 g1D)                                                            
                                                                 
 dropout_18 (Dropout)        (None, 19, 64)            0         
                                                                 
 lstm_32 (LSTM)              (None, 19, 64)            33024     
                                                          

In [88]:
# 모델 학습
hist = model.fit(X_train_dialect , y_train_standard_padded, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])
loss, accuracy = model.evaluate(X_test_dialect, y_test_dialect)
print(f'Accuracy: {accuracy * 100:.2f}%')

Epoch 1/10


ValueError: in user code:

    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\engine\training.py", line 1028, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\engine\training.py", line 1122, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\engine\compile_utils.py", line 605, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\utils\metrics_utils.py", line 77, in decorated
        update_op = update_state_fn(*args, **kwargs)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\metrics\base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\metrics\base_metric.py", line 691, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\metrics\metrics.py", line 3669, in sparse_categorical_accuracy
        matches = metrics_utils.sparse_categorical_matches(y_true, y_pred)
    File "c:\Users\edcrf\anaconda3\envs\new_thing\lib\site-packages\keras\utils\metrics_utils.py", line 970, in sparse_categorical_matches
        matches = tf.cast(tf.equal(y_true, y_pred), backend.floatx())

    ValueError: Dimensions must be equal, but are 40 and 19 for '{{node Equal}} = Equal[T=DT_FLOAT, incompatible_shape_error=true](Cast_25, Cast_26)' with input shapes: [?,40], [?,19].


In [None]:
# from tensorflow.keras.layers import Input, Embedding, Dense, TransformerEncoder, TransformerDecoder

In [5]:
# tensorflow 같은 딥러닝 프레임워크는 주로 numpy 배열을 입력으로 받음
# 때문에 values 속성 사용해 Pandas 데이터프레임을 Numpy 배열로 변환
input_sequences = dialect_data.values
output_sequences = standard_data.values

In [9]:
# EarlyStopping 콜백 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

In [6]:
# 모델 구성
# 임베딩 레이어: 단어를 밀집된 벡터로 변환
# 첫번째 LSTM 레이어: 데이터의 패턴을 학습
# RepactVector 레이어: 출력 시퀀스의 길이만큼 입력을 반복
# 두번째 LSTM 레이어: 첫번째 LSTM 레이어와 마찬가지로 패턴을 학습
# Dense 레이어: 단어 사전의 크기에 따라 출력 크기 설정하고, softmax 함수 사용해 확률 분포 출력
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_to_id)+ 1, 256, input_length=input_sequences.shape[1]),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.RepeatVector(output_sequences.shape[1]),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.Dense(len(word_to_id) + 1, activation='softmax')
])

In [7]:
# 모델 compile
# 모델이 훈련 가능한 상태가 되도록 하는 것
# optimizer는 학습률을 조절해 최적화하는 방법이며 adam은 RMSProp과 Momentum 방식을 결합한 방식
# Loss function은 손실 값을 측정하는 방법이며 여기서는 훈련 데이터의 라벨 값이 정수이므로 sparse_categorical_crossentropy 사용
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [11]:
print(type(input_sequences))
print(type(output_sequences))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [12]:
print(len(input_sequences))
print(len(output_sequences))
print(len(word_to_id))

367954
367954
128707


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# ModelCheckpoint 콜백 정의
checkpoint_callback = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)


In [None]:
# 모델 학습 시 fit() 메서드의 callbacks 인자에 콜백 전달
history = model.fit(input_sequences, np.expand_dims(output_sequences, -1), epochs=100, batch_size=64, validation_split=0.2, callbacks=[checkpoint_callback])