In [1]:
!pip install datasets
import datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
# Make Google Drive visible to this Colab runtime so files under MyDrive
# can be read by the cells below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
import pickle

# Pickled DatasetDict stored on the mounted Drive.
file_path = '/content/drive/MyDrive/1_Data/ds_clean.pkl'

# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a file you created yourself.
with open(file_path, 'rb') as pkl_file:
    ds_clean = pickle.load(pkl_file)

# Show the splits, features and row counts that were loaded.
print(ds_clean)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})


After a lot of research and reading, I decided to use a TensorFlow-based encoder-decoder architecture to build my document summarization model.

I will follow the instructions for the process below.

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

**Step 1: Tokenize the text:**

A single tokenizer is shared by the documents and the summaries.

In [7]:
# Initialize tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")  # Cap the vocabulary at num_words=5000 (most frequent words); everything else maps to "<OOV>"

In [8]:
# Build one shared vocabulary from the training split only: documents first,
# then summaries, concatenated into a single list of texts.
train_split = ds_clean['train']
all_texts = train_split['document'] + train_split['summary']
tokenizer.fit_on_texts(all_texts)

In [9]:
# Convert every text of the train and validation splits into a list of
# integer word indices using the fitted tokenizer.
train_split = ds_clean['train']
val_split = ds_clean['validation']

X_train_docs = tokenizer.texts_to_sequences(train_split['document'])
y_train_summaries = tokenizer.texts_to_sequences(train_split['summary'])

X_val_docs = tokenizer.texts_to_sequences(val_split['document'])
y_val_summaries = tokenizer.texts_to_sequences(val_split['summary'])

The documents and summaries have now been converted to integer sequences, as per the instructions.

We can check the first tokenized document and summary.

In [10]:
# Show the first training example after tokenization (document, then summary).
first_examples = (
    ("Tokenized document:", X_train_docs[0]),
    ("Tokenized summary:", y_train_summaries[0]),
)
for label, sequence in first_examples:
    print(label, sequence)

Tokenized document: [158, 1, 1099, 181, 8, 68, 335, 3294, 181, 2, 80, 224, 4, 2, 341, 49, 9, 46, 1, 900, 2, 2444, 4, 682, 431, 213, 6, 699, 431, 26, 54, 1, 10, 46, 407, 959, 5, 2483, 1439, 10, 2, 57, 3591, 772, 9, 689, 3060, 2, 1, 18, 1, 250, 52, 167, 68, 1922, 46, 4, 2, 100, 471, 1, 10, 94, 2, 1198, 1, 103, 2, 872, 341, 4329, 26, 9, 1, 52, 959, 3, 22, 894, 108, 3, 1, 1, 21, 1, 1444, 1, 1213, 3, 1, 959, 318, 7, 928, 2, 3943, 954, 11, 506, 3, 873, 1, 18, 1, 192, 18, 1, 1, 3747, 22, 1922, 2679, 1, 93, 6, 47, 2, 291, 1, 2, 1, 545, 522, 3, 1, 2939, 739, 2, 759, 1376, 6, 99, 1332, 192, 254, 331, 44, 639, 55, 3, 771, 57, 10, 303, 3315, 2, 177, 4853, 1430, 119, 1, 10, 4691, 89, 341, 5, 1, 8, 2, 57, 1198, 91, 28, 22, 1210, 1862, 420, 3, 1, 1, 524, 1492, 2, 3943, 954, 1, 27, 1620, 88, 41, 534, 1325, 508, 9, 150, 2, 1986, 220, 69, 2, 1198, 318, 1, 959, 7, 377, 131, 20, 41, 60, 1, 7, 249, 4, 2, 872, 130, 271, 2, 3943, 954, 1216, 3, 1, 206, 2, 4199, 132, 689, 984, 2, 954, 1590, 88, 2604, 51, 3930,

**Step 2: Padding and Truncating:**

Since the sequences generated by the tokenizer vary in length, we need to pad (or truncate) them to a fixed length to ensure they all have the same shape for training the model.

In [11]:
# Set max length for padding
max_doc_length = 100
max_summary_length = 100

In [12]:
# Pad the sequences
X_train_docs_padded = pad_sequences(X_train_docs, maxlen=max_doc_length, padding='post', truncating='post')
y_train_summaries_padded = pad_sequences(y_train_summaries, maxlen=max_summary_length, padding='post', truncating='post')
X_val_docs_padded = pad_sequences(X_val_docs, maxlen=max_doc_length, padding='post', truncating='post')
y_val_summaries_padded = pad_sequences(y_val_summaries, maxlen=max_summary_length, padding='post', truncating='post')

In [13]:
# Check the shape
print("Padded X_train shape:", X_train_docs_padded.shape)
print("Padded y_train shape:", y_train_summaries_padded.shape)

Padded X_train shape: (44972, 100)
Padded y_train shape: (44972, 100)


**Step 3: Build the Model:**

In this step, we'll create a simple encoder-decoder model using the Keras Functional API (the model has two inputs, so the Sequential API cannot express it). We'll use LSTM (Long Short-Term Memory) layers for both encoding and decoding.

In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed

In [15]:
# Parameters
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
embedding_dim = 32  # Size of embedding vector
latent_dim = 32  # Latent dimensionality of the encoding space

In [16]:
# Encoder
encoder_inputs = Input(shape=(max_doc_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

In [17]:
# Decoder
decoder_inputs = Input(shape=(max_summary_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

In [18]:
# Model
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [19]:
# Compile
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

**Step 4: Training the Model:**

Now we will train the model. For training we will use teacher forcing, which is common in sequence-to-sequence tasks like summarization: the decoder receives the ground-truth summary as its input and has to predict the next token at every step. To build the targets we shift the summaries one step to the left, so that the target at time step t is the summary token at time step t+1.

In [20]:
import numpy as np

In [21]:
# Shift the target sequence by one timestep for teacher forcing
y_train_summaries_shifted = np.zeros_like(y_train_summaries_padded)
y_train_summaries_shifted[:, :-1] = y_train_summaries_padded[:, 1:]

In [22]:
# Same one-step-left shift (zero-filled last column) for the validation targets.
y_val_summaries_shifted = np.pad(
    y_val_summaries_padded[:, 1:],
    pad_width=((0, 0), (0, 1)),
)

In [23]:
# Train the model
model.fit([X_train_docs_padded, y_train_summaries_padded],
          np.expand_dims(y_train_summaries_shifted, -1),
          epochs=5, batch_size=16, validation_data=([X_val_docs_padded, y_val_summaries_padded], np.expand_dims(y_val_summaries_shifted, -1)))

Epoch 1/5
[1m2811/2811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1174s[0m 414ms/step - loss: 7.1245 - val_loss: 5.9317
Epoch 2/5
[1m2811/2811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1163s[0m 414ms/step - loss: 5.8407 - val_loss: 5.5110
Epoch 3/5
[1m2811/2811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1164s[0m 414ms/step - loss: 5.4382 - val_loss: 5.2808
Epoch 4/5
[1m2811/2811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1164s[0m 414ms/step - loss: 5.2500 - val_loss: 5.1785
Epoch 5/5
[1m2811/2811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1164s[0m 414ms/step - loss: 5.1386 - val_loss: 5.0809


<keras.src.callbacks.history.History at 0x7e805cd6c070>

In [26]:
# Save the model
model.save('summary_model_v1.keras')

**Step 5: Model Evaluation:**

In [2]:
from tensorflow.keras.models import load_model

# Restore the trained model from the .keras archive on the mounted Drive.
saved_model_path = '/content/drive/MyDrive/1_Data/summary_model_v1.keras'
model = load_model(saved_model_path)

ValueError: File not found: filepath=/content/drive/MyDrive/1_Data/summary_model_v1.keras. Please ensure the file is an accessible `.keras` zip file.

Preparing the test data now.

In [28]:
# Tokenize the test data
X_test_docs = tokenizer.texts_to_sequences(ds_clean['test']['document'])
y_test_summaries = tokenizer.texts_to_sequences(ds_clean['test']['summary'])

In [29]:
# Pad the sequences for the test data
X_test_docs_padded = pad_sequences(X_test_docs, maxlen=max_doc_length, padding='post', truncating='post')
y_test_summaries_padded = pad_sequences(y_test_summaries, maxlen=max_summary_length, padding='post', truncating='post')

In [31]:
# Build the evaluation targets the same way as the training targets: shift
# each summary one step to the LEFT and zero-fill the last column, so the
# target at step t is the token at step t+1. The previous np.roll(shift=1)
# moved tokens to the right and wrapped the last column round to the front,
# so the test loss was measured against misaligned targets.
y_test_summaries_shifted = np.zeros_like(y_test_summaries_padded)
y_test_summaries_shifted[:, :-1] = y_test_summaries_padded[:, 1:]

In [1]:
# Score the model on the test split, using the same input/target layout as
# training: [documents, summaries] -> shifted summaries with a trailing axis.
test_inputs = [X_test_docs_padded, y_test_summaries_padded]
test_targets = np.expand_dims(y_test_summaries_shifted, -1)
test_loss = model.evaluate(test_inputs, test_targets)

print(f"Test Loss: {test_loss}")

NameError: name 'model' is not defined