In [8]:
import os
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# CSV 파일 경로 (로컬에서 사용하는 경로로 수정)
csv_file_path = './news_summary_more.csv'  # CSV 파일 경로를 로컬에 맞게 수정하세요.

# 데이터 로드
df = pd.read_csv(csv_file_path)

# 'text'와 'headlines' 컬럼만 사용
text_data = df['text'].astype(str).values  # 'text' 컬럼
summary_data = df['headlines'].astype(str).values  # 'headlines' 컬럼

# 데이터셋 샘플링 (데이터 크기를 줄여 Colab 리소스 절약)
sample_size = int(len(text_data) * 0.1)  # 데이터의 10%만 샘플링
text_data = text_data[:sample_size]
summary_data = summary_data[:sample_size]

# 데이터셋 분리 (80% train, 20% validation)
text_train, text_val, summary_train, summary_val = train_test_split(
    text_data, summary_data, test_size=0.2, random_state=42
)

# 토크나이저 생성 및 훈련
MAX_TEXT_LEN = 50  # 줄여서 리소스 절약
MAX_SUMMARY_LEN = 15  # 줄여서 리소스 절약
VOCAB_SIZE = 15000  # 어휘 크기 증가 (기존 10000에서 15000으로)

# 텍스트 토크나이저
text_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<UNK>')  # OOV 토큰을 추가하여 범위 외의 단어 처리
text_tokenizer.fit_on_texts(text_train)

# 요약 토크나이저
summary_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<UNK>')
summary_tokenizer.fit_on_texts(summary_train)

# 토큰화 및 시퀀스 변환
text_train_seq = text_tokenizer.texts_to_sequences(text_train)
text_val_seq = text_tokenizer.texts_to_sequences(text_val)

# 인덱스를 VOCAB_SIZE 이하로 필터링
text_train_seq = [[token if token < VOCAB_SIZE else text_tokenizer.word_index['<UNK>'] for token in seq] for seq in text_train_seq]
text_val_seq = [[token if token < VOCAB_SIZE else text_tokenizer.word_index['<UNK>'] for token in seq] for seq in text_val_seq]

# 시퀀스 패딩
text_train_seq = pad_sequences(text_train_seq, maxlen=MAX_TEXT_LEN, padding='post')
text_val_seq = pad_sequences(text_val_seq, maxlen=MAX_TEXT_LEN, padding='post')

summary_train_seq = summary_tokenizer.texts_to_sequences(summary_train)
summary_val_seq = summary_tokenizer.texts_to_sequences(summary_val)

# 인덱스를 VOCAB_SIZE 이하로 필터링
summary_train_seq = [[token if token < VOCAB_SIZE else summary_tokenizer.word_index['<UNK>'] for token in seq] for seq in summary_train_seq]
summary_val_seq = [[token if token < VOCAB_SIZE else summary_tokenizer.word_index['<UNK>'] for token in seq] for seq in summary_val_seq]

# 시퀀스 패딩
summary_train_seq = pad_sequences(summary_train_seq, maxlen=MAX_SUMMARY_LEN, padding='post')
summary_val_seq = pad_sequences(summary_val_seq, maxlen=MAX_SUMMARY_LEN, padding='post')

# 요약 시퀀스에서 시작 토큰 및 종료 토큰 추가
START_TOKEN = '<start>'
END_TOKEN = '<end>'

# 시작 및 종료 토큰 인덱스 추가
start_token_idx = len(summary_tokenizer.word_index) + 1
end_token_idx = len(summary_tokenizer.word_index) + 2

summary_tokenizer.word_index[START_TOKEN] = start_token_idx
summary_tokenizer.word_index[END_TOKEN] = end_token_idx
summary_tokenizer.index_word[start_token_idx] = START_TOKEN
summary_tokenizer.index_word[end_token_idx] = END_TOKEN

# 디코더 입력 데이터 및 타겟 데이터 생성
decoder_input_train = np.zeros((len(summary_train), MAX_SUMMARY_LEN))
decoder_target_train = np.zeros((len(summary_train), MAX_SUMMARY_LEN))

for i, seq in enumerate(summary_train_seq):
    decoder_input_train[i, 0] = start_token_idx
    decoder_input_train[i, 1:] = seq[:-1]  # Shift to create input
    decoder_target_train[i, :] = seq

# 검증 데이터에 대해서도 디코더 입력과 타겟 데이터 생성
decoder_input_val = np.zeros((len(summary_val), MAX_SUMMARY_LEN))
decoder_target_val = np.zeros((len(summary_val), MAX_SUMMARY_LEN))

for i, seq in enumerate(summary_val_seq):
    decoder_input_val[i, 0] = start_token_idx
    decoder_input_val[i, 1:] = seq[:-1]
    decoder_target_val[i, :] = seq

# 모델 정의
EMBEDDING_DIM = 64  # 줄여서 리소스 절약
HIDDEN_UNITS = 128  # 줄여서 리소스 절약

# 인코더 정의
encoder_inputs = Input(shape=(MAX_TEXT_LEN,))
encoder_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(HIDDEN_UNITS, return_state=True, return_sequences=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# 디코더 정의
decoder_inputs = Input(shape=(MAX_SUMMARY_LEN,))
decoder_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(HIDDEN_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# 모델 컴파일 및 학습
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

# 모델 학습
model.fit(
    [text_train_seq, decoder_input_train], 
    np.expand_dims(decoder_target_train, -1), 
    epochs=50,  
    batch_size=16,  # 배치 크기를 줄여서 리소스 절약
    validation_data=([text_val_seq, decoder_input_val], np.expand_dims(decoder_target_val, -1))
)

# 모델 및 토크나이저 저장
model.save("news_summary_model_v2.keras")
with open('text_tokenizer_v2.pkl', 'wb') as f:
    pickle.dump(text_tokenizer, f)
with open('summary_tokenizer_v2.pkl', 'wb') as f:
    pickle.dump(summary_tokenizer, f)

print("모델과 토크나이저 저장 완료!")


Epoch 1/50




[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 100ms/step - accuracy: 0.3027 - loss: 8.0789 - val_accuracy: 0.3293 - val_loss: 7.4661
Epoch 2/50
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 98ms/step - accuracy: 0.3274 - loss: 7.1378 - val_accuracy: 0.3304 - val_loss: 7.3885
Epoch 3/50
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 98ms/step - accuracy: 0.3344 - loss: 6.8570 - val_accuracy: 0.3316 - val_loss: 7.3582
Epoch 4/50
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 98ms/step - accuracy: 0.3374 - loss: 6.6401 - val_accuracy: 0.3336 - val_loss: 7.3534
Epoch 5/50
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 99ms/step - accuracy: 0.3394 - loss: 6.4373 - val_accuracy: 0.3350 - val_loss: 7.3663
Epoch 6/50
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 97ms/step - accuracy: 0.3521 - loss: 6.1971 - val_accuracy: 0.3358 - val_loss: 7.3759
Epoch 7/50
[1m492/492[0m

In [10]:
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 저장된 모델 및 토크나이저 로드
model = load_model("news_summary_model_v2.keras")

with open('text_tokenizer_v2.pkl', 'rb') as f:
    text_tokenizer = pickle.load(f)

with open('summary_tokenizer_v2.pkl', 'rb') as f:
    summary_tokenizer = pickle.load(f)

# 테스트할 예시 텍스트
example_text = "Apple has announced the release of its new iPhone 15, which features an upgraded camera system, faster processor, and a new design with titanium edges. The phone will be available for pre-order starting next week, with shipping expected by the end of the month."


# 입력 텍스트를 시퀀스로 변환하고 패딩 적용
MAX_TEXT_LEN = 50  # 학습 시 사용했던 MAX_TEXT_LEN과 동일해야 함
input_sequence = text_tokenizer.texts_to_sequences([example_text])
input_sequence = pad_sequences(input_sequence, maxlen=MAX_TEXT_LEN, padding='post')

# 디코더 입력 시퀀스 초기화 (START_TOKEN으로 시작)
MAX_SUMMARY_LEN = 15  # 학습 시 사용했던 MAX_SUMMARY_LEN과 동일해야 함
start_token_idx = summary_tokenizer.word_index['<start>']
decoder_input_seq = np.zeros((1, MAX_SUMMARY_LEN))
decoder_input_seq[0, 0] = start_token_idx

# 요약 생성 (단어 단위로 반복 예측)
predicted_summary = ""
for i in range(1, MAX_SUMMARY_LEN):
    # 예측 수행
    predictions = model.predict([input_sequence, decoder_input_seq], verbose=0)
    predicted_id = np.argmax(predictions[0, i - 1, :])

    # 예측된 토큰이 END_TOKEN이면 요약 종료
    if predicted_id == summary_tokenizer.word_index['<end>']:
        break

    # 예측된 토큰을 디코더 입력 시퀀스에 추가
    decoder_input_seq[0, i] = predicted_id

    # 예측된 단어를 요약에 추가
    if predicted_id != 0:
        predicted_summary += summary_tokenizer.index_word[predicted_id] + ' '

print("Predicted Summary:", predicted_summary.strip())




Predicted Summary: rbi allows 'tokenisation' for data troops over over video threat


In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# T5 모델과 토크나이저 로드
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 테스트할 텍스트
example_text = """
The Indian stock market witnessed a significant drop today amid growing concerns over the global economic downturn.
Apple has announced the release of its new iPhone 15, which features an upgraded camera system, faster processor, 
and a new design with titanium edges. The phone will be available for pre-order starting next week, 
with shipping expected by the end of the month.
"""

input_text = "summarize: " + example_text
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

# 요약 생성
summary_ids = model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Predicted Summary:", summary)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Predicted Summary: the Indian stock market saw a significant drop today amid growing concerns over the global economic downturn. the new iphone 15 features an upgraded camera system, faster processor, and a new design with titanium edges.
