# Text Summarization using TensorFlow and Attention

In [ ]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow_datasets as tfds

# Seed TensorFlow's and NumPy's global random generators with the same fixed value.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

## Load and Explore Dataset

In [ ]:
# Fetch CNN/DailyMail through TensorFlow Datasets together with its metadata.
# `dataset` is indexed by split name; `info` is the accompanying metadata object.
dataset, info = tfds.load('cnn_dailymail', with_info=True, as_supervised=True)

# Bind each split to its own name.
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

print(info)

## Data Preprocessing and Tokenization

In [ ]:
# Sequence-length and vocabulary limits shared by tokenization and the model.
MAX_ARTICLE_LENGTH = 400
MAX_SUMMARY_LENGTH = 100
VOCAB_SIZE = 20000

article_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")

# The Tokenizer's default `filters` strip '<' and '>', which would reduce the
# '<START>' / '<END>' markers wrapped around each summary to the plain words
# 'start' / 'end', making them indistinguishable from those words in the text.
# Use the default filter set minus the angle brackets so the markers survive
# as dedicated tokens (lower-cased by the tokenizer: '<start>' and '<end>').
SUMMARY_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
summary_tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="<OOV>",
    filters=SUMMARY_FILTERS,
)

In [ ]:
# Collect a bounded sample of the training split and fit both tokenizers on it.
max_samples = 50000
articles, summaries = [], []

# `take` caps the iteration at `max_samples` examples.
for article_tensor, summary_tensor in train_dataset.take(max_samples):
    article_text = article_tensor.numpy().decode()
    summary_text = summary_tensor.numpy().decode()
    articles.append(article_text)
    # Wrap every summary in the start/end marker strings.
    summaries.append(f'<START> {summary_text} <END>')

# Build each tokenizer's vocabulary from its own corpus.
article_tokenizer.fit_on_texts(articles)
summary_tokenizer.fit_on_texts(summaries)

## Build and Compile the Model

In [ ]:
def build_model(vocab_size, embedding_dim=128, lstm_units=256):
    """Assemble the encoder-decoder summarization network with attention.

    The encoder embeds the article ids and runs them through a bidirectional
    LSTM. Its forward/backward final states are concatenated and used as the
    initial state of a decoder LSTM that is twice as wide. The decoder outputs
    attend over the encoder outputs via a Keras ``Attention`` layer, and the
    concatenation of both feeds a softmax over the vocabulary.

    Args:
        vocab_size: Size of both embedding tables and of the output softmax.
        embedding_dim: Width of the encoder and decoder embeddings.
        lstm_units: Units per encoder direction; the decoder uses twice this.

    Returns:
        An uncompiled ``Model`` taking ``[article_ids, summary_ids]`` with
        shapes ``(batch, MAX_ARTICLE_LENGTH)`` and
        ``(batch, MAX_SUMMARY_LENGTH - 1)`` and producing per-step
        probabilities of shape ``(batch, MAX_SUMMARY_LENGTH - 1, vocab_size)``.
    """
    # ----- Encoder -----
    article_ids = Input(shape=(MAX_ARTICLE_LENGTH,))
    article_embedder = Embedding(vocab_size, embedding_dim)
    article_vectors = article_embedder(article_ids)
    encoder = Bidirectional(
        LSTM(lstm_units, return_sequences=True, return_state=True)
    )
    # The wrapper returns the output sequence followed by four state tensors.
    encoder_seq, *encoder_states = encoder(article_vectors)
    fwd_h, fwd_c, bwd_h, bwd_c = encoder_states

    # Merge both directions so the states match the decoder width.
    init_h = Concatenate()([fwd_h, bwd_h])
    init_c = Concatenate()([fwd_c, bwd_c])

    # ----- Decoder -----
    summary_ids = Input(shape=(MAX_SUMMARY_LENGTH - 1,))
    summary_embedder = Embedding(vocab_size, embedding_dim)
    summary_vectors = summary_embedder(summary_ids)
    decoder = LSTM(lstm_units * 2, return_sequences=True, return_state=True)
    # Only the output sequence is needed; the decoder's final states are dropped.
    decoder_seq = decoder(summary_vectors, initial_state=[init_h, init_c])[0]

    # ----- Attention (query = decoder outputs, value = encoder outputs) -----
    context = Attention()([decoder_seq, encoder_seq])
    decoder_with_context = Concatenate()([decoder_seq, context])

    # ----- Per-step vocabulary distribution -----
    token_probs = Dense(vocab_size, activation='softmax')(decoder_with_context)
    return Model(inputs=[article_ids, summary_ids], outputs=token_probs)

# Instantiate the network, attach the optimizer/loss/metric, and print its layout.
model = build_model(VOCAB_SIZE)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

## Train the Model

In [ ]:
# Placeholder: the training and validation logic goes here (carried over from the original script).

## Evaluate and Test the Model

In [ ]:
# Placeholder: the evaluation, inference, and training-history plotting logic goes here (carried over from the original script).

## Save Tokenizers

In [ ]:
import pickle
# Serialize each fitted tokenizer to its own pickle file.
for pickle_path, fitted_tokenizer in (
    ('article_tokenizer.pickle', article_tokenizer),
    ('summary_tokenizer.pickle', summary_tokenizer),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_tokenizer, handle)