In [1]:
%%capture
!pip install -q datasets

In [2]:
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-10-06 15:01:23.206249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759762883.520770      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759762883.626836      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Fetch the XSum dataset through the Hugging Face `datasets` library and pull out
# its three splits. Each split is a `Dataset` (a table-like object) whose columns
# include "document" and "summary".
# NOTE(review): trust_remote_code=True lets the dataset's own loading script run
# locally — confirm that is acceptable for the environment this runs in.

dataset = load_dataset("xsum", trust_remote_code=True)
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

README.md: 0.00B [00:00, ?B/s]

xsum.py: 0.00B [00:00, ?B/s]

data/XSUM-EMNLP18-Summary-Data-Original.(…):   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [4]:
# Display the training split; its repr lists the column names and the row count.
train_data

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 204045
})

In [5]:
# The Keras Tokenizer turns raw text into sequences of integers that a neural
# network can consume. fit_on_texts() builds the vocabulary: every distinct word
# seen in the training texts is assigned its own integer index, which
# texts_to_sequences() later substitutes for the word. Working with indices
# rather than raw strings is what embedding and LSTM layers require.
# Documents and summaries are fitted separately, so each side gets its own vocabulary.
#
# The texts are read by column (train_data['document']) instead of looping over
# rows: iterating rows builds a full dict (document, summary, id) for every
# example only to extract a single field from it.

doc_tokenizer = Tokenizer()
doc_tokenizer.fit_on_texts(train_data['document'])

summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(train_data['summary'])

In [6]:
# pad_sequences brings every sequence to the same fixed length by padding short
# sequences with 0 and truncating long ones. Fixed-length inputs are needed so
# the sequences can be stacked into batches for the LSTM layers.
#
# Both padding and truncation are applied at the END of the sequence.
# pad_sequences truncates from the front by default (truncating='pre'), which
# would throw away the opening of every over-long document and summary while
# keeping only its tail; truncating='post' keeps the beginning instead.

max_doc_len = 400
max_summary_len = 50

# Column access hands the texts to the tokenizer directly, without building a
# dict for every row of the split.
X_train = pad_sequences(
    doc_tokenizer.texts_to_sequences(train_data['document']),
    maxlen=max_doc_len,
    padding='post',
    truncating='post',
)
y_train = pad_sequences(
    summary_tokenizer.texts_to_sequences(train_data['summary']),
    maxlen=max_summary_len,
    padding='post',
    truncating='post',
)


In [7]:
# Teacher forcing for a seq2seq decoder: at every step the decoder receives the
# previous target token and is trained to emit the next one. The two arrays are
# the same target sequences offset by one position:
#
#   y_train_input  -> every column except the last  (what the decoder is fed)
#   y_train_output -> every column except the first (what the decoder must predict)
#
# NOTE(review): the visible cells add no explicit start/end markers to the
# summaries, so column 0 of y_train_input is presumably the first summary word
# itself and is never a prediction target — confirm this is intended.

y_train_input, y_train_output = y_train[:, :-1], y_train[:, 1:]
