In [12]:
import os
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.layers import Layer, TimeDistributed, RepeatVector, Reshape, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# CSV 파일 경로 (로컬에서 사용하는 경로로 수정)
csv_file_path = './news_summary_more.csv'  # CSV 파일 경로를 로컬에 맞게 수정하세요.

# 데이터 로드
df = pd.read_csv(csv_file_path)

# 'text'와 'headlines' 컬럼만 사용
text_data = df['text'].astype(str).values  # 'text' 컬럼
summary_data = df['headlines'].astype(str).values  # 'headlines' 컬럼

# 데이터셋 샘플링 (데이터 크기를 줄여 Colab 리소스 절약)
sample_size = int(len(text_data) * 0.1)  # 데이터의 10%만 샘플링
text_data = text_data[:sample_size]
summary_data = summary_data[:sample_size]

# 데이터셋 분리 (80% train, 20% validation)
text_train, text_val, summary_train, summary_val = train_test_split(
    text_data, summary_data, test_size=0.2, random_state=42
)

# 토크나이저 생성 및 훈련
MAX_TEXT_LEN = 50  # 줄여서 리소스 절약
MAX_SUMMARY_LEN = 15  # 줄여서 리소스 절약
VOCAB_SIZE = 15000  # 어휘 크기 증가 (기존 10000에서 15000으로)

# 텍스트 토크나이저
text_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<UNK>')  # OOV 토큰을 추가하여 범위 외의 단어 처리
text_tokenizer.fit_on_texts(text_train)

# 요약 토크나이저
summary_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<UNK>')
summary_tokenizer.fit_on_texts(summary_train)

# 토큰화 및 시퀀스 변환
text_train_seq = text_tokenizer.texts_to_sequences(text_train)
text_val_seq = text_tokenizer.texts_to_sequences(text_val)

# 인덱스를 VOCAB_SIZE 이하로 필터링
text_train_seq = [[token if token < VOCAB_SIZE else text_tokenizer.word_index['<UNK>'] for token in seq] for seq in text_train_seq]
text_val_seq = [[token if token < VOCAB_SIZE else text_tokenizer.word_index['<UNK>'] for token in seq] for seq in text_val_seq]

# 시퀀스 패딩
text_train_seq = pad_sequences(text_train_seq, maxlen=MAX_TEXT_LEN, padding='post')
text_val_seq = pad_sequences(text_val_seq, maxlen=MAX_TEXT_LEN, padding='post')

summary_train_seq = summary_tokenizer.texts_to_sequences(summary_train)
summary_val_seq = summary_tokenizer.texts_to_sequences(summary_val)

# 인덱스를 VOCAB_SIZE 이하로 필터링
summary_train_seq = [[token if token < VOCAB_SIZE else summary_tokenizer.word_index['<UNK>'] for token in seq] for seq in summary_train_seq]
summary_val_seq = [[token if token < VOCAB_SIZE else summary_tokenizer.word_index['<UNK>'] for token in seq] for seq in summary_val_seq]

# 시퀀스 패딩
summary_train_seq = pad_sequences(summary_train_seq, maxlen=MAX_SUMMARY_LEN, padding='post')
summary_val_seq = pad_sequences(summary_val_seq, maxlen=MAX_SUMMARY_LEN, padding='post')

# 요약 시퀀스에서 시작 토큰 및 종료 토큰 추가
START_TOKEN = '<start>'
END_TOKEN = '<end>'

# 시작 및 종료 토큰 인덱스 추가
start_token_idx = len(summary_tokenizer.word_index) + 1
end_token_idx = len(summary_tokenizer.word_index) + 2

summary_tokenizer.word_index[START_TOKEN] = start_token_idx
summary_tokenizer.word_index[END_TOKEN] = end_token_idx
summary_tokenizer.index_word[start_token_idx] = START_TOKEN
summary_tokenizer.index_word[end_token_idx] = END_TOKEN

# 디코더 입력 데이터 및 타겟 데이터 생성
decoder_input_train = np.zeros((len(summary_train), MAX_SUMMARY_LEN))
decoder_target_train = np.zeros((len(summary_train), MAX_SUMMARY_LEN))

for i, seq in enumerate(summary_train_seq):
    decoder_input_train[i, 0] = start_token_idx
    decoder_input_train[i, 1:] = seq[:-1]  # Shift to create input
    decoder_target_train[i, :] = seq

# 검증 데이터에 대해서도 디코더 입력과 타겟 데이터 생성
decoder_input_val = np.zeros((len(summary_val), MAX_SUMMARY_LEN))
decoder_target_val = np.zeros((len(summary_val), MAX_SUMMARY_LEN))

for i, seq in enumerate(summary_val_seq):
    decoder_input_val[i, 0] = start_token_idx
    decoder_input_val[i, 1:] = seq[:-1]
    decoder_target_val[i, :] = seq

# Bahdanau Attention 클래스 정의
class BahdanauAttention(Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        # query: Decoder hidden state (batch_size, hidden_size)
        # values: Encoder outputs (batch_size, max_length, hidden_size)
        
        # Expand query to (batch_size, 1, hidden_size)
        query_with_time_axis = tf.expand_dims(query, 1)

        # Score 계산
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        # Attention weights 계산
        attention_weights = tf.nn.softmax(score, axis=1)

        # context vector 계산
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

# 모델 정의 (Attention 추가)
EMBEDDING_DIM = 64  # 줄여서 리소스 절약
HIDDEN_UNITS = 128  # 줄여서 리소스 절약

# 인코더 정의
encoder_inputs = Input(shape=(MAX_TEXT_LEN,))
encoder_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(HIDDEN_UNITS, return_state=True, return_sequences=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# 디코더 정의
decoder_inputs = Input(shape=(MAX_SUMMARY_LEN,))
decoder_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(HIDDEN_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention 적용
attention = BahdanauAttention(HIDDEN_UNITS)
context_vector, _ = attention(state_h, encoder_outputs)

# context_vector의 차원을 디코더 출력의 타임스텝 수와 맞추기 위해 RepeatVector 사용
context_vector = RepeatVector(MAX_SUMMARY_LEN)(context_vector)

# context_vector의 차원을 디코더 출력과 일치시키기 위해 Reshape 사용
context_vector = Reshape((MAX_SUMMARY_LEN, HIDDEN_UNITS))(context_vector)

# 데이터 유형을 맞추기 위해 Lambda 레이어를 사용하여 float32로 변환
context_vector = Lambda(lambda x: tf.cast(x, dtype=tf.float32))(context_vector)
decoder_outputs = Lambda(lambda x: tf.cast(x, dtype=tf.float32))(decoder_outputs)

# 디코더 출력과 Attention 결과 결합
decoder_concat_input = Concatenate(axis=-1)([context_vector, decoder_outputs])

# Dense Layer
decoder_dense = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# 모델 컴파일 및 학습
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

# 모델 학습
model.fit(
    [text_train_seq, decoder_input_train], 
    np.expand_dims(decoder_target_train, -1), 
    epochs=30,  # 에포크 수를 줄여서 리소스 절약
    batch_size=16,  # 배치 크기를 줄여서 리소스 절약
    validation_data=([text_val_seq, decoder_input_val], np.expand_dims(decoder_target_val, -1))
)

# 모델 및 토크나이저 저장
model.save("news_summary_model_with_attention.keras")
with open('text_tokenizer_v3.pkl', 'wb') as f:
    pickle.dump(text_tokenizer, f)
with open('summary_tokenizer_V3.pkl', 'wb') as f:
    pickle.dump(summary_tokenizer, f)

print("모델과 토크나이저 저장 완료!")




Epoch 1/30




[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 200ms/step - accuracy: 0.3119 - loss: 6.4703 - val_accuracy: 0.3285 - val_loss: 5.6641
Epoch 2/30
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 199ms/step - accuracy: 0.3274 - loss: 5.4231 - val_accuracy: 0.3309 - val_loss: 5.5971
Epoch 3/30
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 199ms/step - accuracy: 0.3312 - loss: 5.2053 - val_accuracy: 0.3337 - val_loss: 5.5172
Epoch 4/30
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 196ms/step - accuracy: 0.3413 - loss: 4.8981 - val_accuracy: 0.3370 - val_loss: 5.4653
Epoch 5/30
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 196ms/step - accuracy: 0.3470 - loss: 4.6319 - val_accuracy: 0.3406 - val_loss: 5.4275
Epoch 6/30
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 197ms/step - accuracy: 0.3584 - loss: 4.3132 - val_accuracy: 0.3447 - val_loss: 5.3811
Epoch 7/30
[1m492/4