In [1]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score

In [2]:
# CSV 파일 로드
file_path = 'news_summary_more.csv'  # 파일 경로를 실제 위치로 변경

data = pd.read_csv(file_path)

# 필요한 컬럼만 선택 (text와 headlines)
data = data[['text', 'headlines']]

# 결측값 제거
data = data.dropna()

# 데이터 정제 함수 정의
def clean_text(text):
    # HTML 태그 제거
    text = re.sub(r'<[^>]+>', '', text)
    # 괄호로 묶인 텍스트 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 알파벳과 숫자 외의 문자 제거
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # 소문자 변환
    text = text.lower()
    return text

# 텍스트와 요약 정제
data['text'] = data['text'].apply(clean_text)
data['headlines'] = data['headlines'].apply(clean_text)

# 학습 및 테스트 데이터 분할
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
# 토크나이저 정의 및 훈련
text_tokenizer = Tokenizer(num_words=15000) # 상위 15000개 단어만 사용
text_tokenizer.fit_on_texts(train_data['text'])
summary_tokenizer = Tokenizer(num_words=5000) # 상위 5000개 단어만 사용
summary_tokenizer.fit_on_texts(train_data['headlines'])

# 정수 인덱스 시퀀스로 변환
train_text_seq = text_tokenizer.texts_to_sequences(train_data['text'])
train_summary_seq = summary_tokenizer.texts_to_sequences(train_data['headlines'])
test_text_seq = text_tokenizer.texts_to_sequences(test_data['text'])
test_summary_seq = summary_tokenizer.texts_to_sequences(test_data['headlines'])

# 패딩 추가
max_text_len = 100
max_summary_len = 15
train_text_seq = pad_sequences(train_text_seq, maxlen=max_text_len, padding='post')
train_summary_seq = pad_sequences(train_summary_seq, maxlen=max_summary_len, padding='post')
test_text_seq = pad_sequences(test_text_seq, maxlen=max_text_len, padding='post')
test_summary_seq = pad_sequences(test_summary_seq, maxlen=max_summary_len, padding='post')

# 인코더-디코더 모델 설계 (어텐션 메커니즘 추가)
embedding_dim = 128
hidden_size = 128
src_vocab = len(text_tokenizer.word_index) + 1
tar_vocab = len(summary_tokenizer.word_index) + 1

# 인코더 설계
encoder_inputs = Input(shape=(max_text_len,))
enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# 디코더 설계
decoder_inputs = Input(shape=(max_summary_len,))
dec_emb_layer = Embedding(tar_vocab, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# 어텐션 메커니즘 추가
attention = Attention()([decoder_outputs, encoder_outputs])
concatenate = Concatenate(axis=-1)([decoder_outputs, attention])
decoder_dense = Dense(tar_vocab, activation='softmax')
decoder_outputs = decoder_dense(concatenate)

In [7]:
# 모델 정의
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# 모델 요약 출력
model.summary()

In [8]:
# 모델 학습
es = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
history = model.fit(
    [train_text_seq, train_summary_seq],
    train_summary_seq.reshape(train_summary_seq.shape[0], train_summary_seq.shape[1], 1),
    epochs=50,
    callbacks=[es],
    batch_size=128, # batch_size 줄이기
    validation_data=(
        [test_text_seq, test_summary_seq],
        test_summary_seq.reshape(test_summary_seq.shape[0], test_summary_seq.shape[1], 1)
    )
)

Epoch 1/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 427ms/step - loss: 4.8415 - val_loss: 3.6214
Epoch 2/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 433ms/step - loss: 3.5790 - val_loss: 3.1994
Epoch 3/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 428ms/step - loss: 3.1530 - val_loss: 2.8694
Epoch 4/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 430ms/step - loss: 2.8430 - val_loss: 2.6228
Epoch 5/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 430ms/step - loss: 2.6101 - val_loss: 2.3968
Epoch 6/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 428ms/step - loss: 2.3870 - val_loss: 2.1770
Epoch 7/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 430ms/step - loss: 2.1634 - val_loss: 1.9625
Epoch 8/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 429ms/step - loss: 1.9648 - val_loss: 1.7815
Epoch 9/

In [9]:
# 모델을 HDF5 형식으로 저장
model.save('news_summary_model.keras')

In [11]:
import pickle
from google.colab import files # google.colab에서 'files' 객체를 import합니다.

# 토크나이저 저장
with open('text_tokenizer.pkl', 'wb') as f:
    pickle.dump(text_tokenizer, f)

with open('summary_tokenizer.pkl', 'wb') as f:
    pickle.dump(summary_tokenizer, f)

# Colab에서 다운로드 가능하도록 설정
files.download('text_tokenizer.pkl')
files.download('summary_tokenizer.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>