1. Load all the necessary libraries

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from sklearn.model_selection import train_test_split
import requests
import torch.nn as nn
import torch.optim as optim

2. Import the CNN/Daily Mail dataset from https://huggingface.co/datasets/cnn_dailymail

In [None]:
import requests

# Endpoint that lists the dataset's splits; it is not requested by any code visible in this notebook.
API_URL = "https://datasets-server.huggingface.co/splits?dataset=cnn_dailymail"
# Row endpoints, one per split. Each asks for rows 0-99 (offset=0, limit=100) of config 1.0.0.
TRAIN_API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=train&offset=0&limit=100"
VAL_API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=validation&offset=0&limit=100"
TEST_API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=test&offset=0&limit=100"

def fetch_data(api_url, timeout=30):
    """Download a JSON document from ``api_url`` and return it decoded.

    Args:
        api_url: Full URL of the endpoint to query.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(api_url, timeout=timeout)
    # Fail here with the HTTP status instead of later with a confusing
    # KeyError when callers index the error payload with ['rows'].
    response.raise_for_status()
    return response.json()

def _show_split(heading, summaries, texts):
    """Print ``heading`` followed by every (summary, article) pair, numbered from 1."""
    print(heading)
    for number, (summary, text) in enumerate(zip(summaries, texts), start=1):
        print(f"Example {number}:")
        print(f"Summary: {summary}")
        print(f"Text: {text}")
        print()


# --- Train split: download, split into parallel lists, then dump ---
print("Fetching train data...")
train_data = fetch_data(TRAIN_API_URL)
train_summaries = [entry['row']['highlights'] for entry in train_data['rows']]
train_texts = [entry['row']['article'] for entry in train_data['rows']]
print("Train data fetched successfully.")
_show_split("Train Summaries and Texts:", train_summaries, train_texts)

# --- Validation split ---
print("Fetching validation data...")
val_data = fetch_data(VAL_API_URL)
val_summaries = [entry['row']['highlights'] for entry in val_data['rows']]
val_texts = [entry['row']['article'] for entry in val_data['rows']]
print("Validation data fetched successfully.")
_show_split("Validation Summaries and Texts:", val_summaries, val_texts)

# --- Test split ---
print("Fetching test data...")
test_data = fetch_data(TEST_API_URL)
test_summaries = [entry['row']['highlights'] for entry in test_data['rows']]
test_texts = [entry['row']['article'] for entry in test_data['rows']]
print("Test data fetched successfully.")
_show_split("Test Summaries and Texts:", test_summaries, test_texts)




In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import requests

# Resources needed by preprocess_text: tokenizer models and the stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the word tokenizer from 'punkt_tab' rather than
# 'punkt'; requesting both keeps word_tokenize working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Preprocessing functions

def preprocess_text(text):
    """Tokenize ``text`` and return its cleaned, lower-cased tokens.

    Every token has the characters in ``string.punctuation`` removed and is
    lower-cased. Tokens that are English stop words, or that are shorter than
    two characters after cleaning, are dropped.
    """
    tokens = word_tokenize(text)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        cleaned = token.translate(strip_punctuation).lower()
        if cleaned in english_stop_words or len(cleaned) <= 1:
            continue
        kept.append(cleaned)
    return kept

def pad_or_truncate_text(text, max_length):
    """Preprocess ``text`` and force the token list to ``max_length`` entries.

    Longer token lists are cut after ``max_length`` tokens; shorter ones are
    filled up at the end with the literal token ``'<PAD>'``.
    """
    tokens = preprocess_text(text)[:max_length]
    missing = max_length - len(tokens)
    # A non-positive count multiplies to an empty list, so nothing is appended
    # when the text already fills the budget.
    tokens.extend(['<PAD>'] * missing)
    return tokens


def preprocess_target_summaries(summaries, max_length):
    """Return one padded/truncated token list per summary, in input order."""
    return [pad_or_truncate_text(one_summary, max_length) for one_summary in summaries]



def _load_split(api_url):
    """Fetch one split and return (raw payload, highlight strings, article strings)."""
    payload = fetch_data(api_url)
    highlights = [entry['row']['highlights'] for entry in payload['rows']]
    articles = [entry['row']['article'] for entry in payload['rows']]
    return payload, highlights, articles


# Download the three splits again so this cell always starts from raw strings
# (the *_texts names are rebound to token lists further down).
train_data, train_summaries, train_texts = _load_split(TRAIN_API_URL)
val_data, val_summaries, val_texts = _load_split(VAL_API_URL)
test_data, test_summaries, test_texts = _load_split(TEST_API_URL)

# One token budget is shared by articles and summaries.
max_length = 512

# Articles: raw strings -> fixed-length token lists.
train_texts = [pad_or_truncate_text(raw_article, max_length) for raw_article in train_texts]
val_texts = [pad_or_truncate_text(raw_article, max_length) for raw_article in val_texts]
test_texts = [pad_or_truncate_text(raw_article, max_length) for raw_article in test_texts]

# Summaries: raw strings -> fixed-length token lists.
train_targets = preprocess_target_summaries(train_summaries, max_length)
val_targets = preprocess_target_summaries(val_summaries, max_length)
test_targets = preprocess_target_summaries(test_summaries, max_length)

# Distinct tokens seen in the training articles (this count includes '<PAD>' when present).
vocabulary = set().union(*train_texts)
vocab_size = len(vocabulary)
print("Vocabulary size:", vocab_size)

# Dump the preprocessed data of every split.
print("Train Texts:", train_texts)
print("Train Targets:", train_targets)
print("Val Texts:", val_texts)
print("Val Targets:", val_targets)
print("Test Texts:", test_texts)
print("Test Targets:", test_targets)


In [None]:
# Represent text as word embeddings

# -Word embeddings examples are Word2Vec, GloVe, FastText, and represent each word as a dense vector.
# follow https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html.

import torch
import torch.nn as nn
import torch.optim as optim

# Upper bound on the number of vocabulary entries (including <OOV>). This
# rebinds the `vocab_size` that the preprocessing cell measured and printed.
vocab_size = 100000
embedding_dim = 256
hidden_dim = 256
torch.manual_seed(1)


def _preprocess_articles(split_data):
    """Return the fixed-length token list of every article in one fetched split."""
    return [pad_or_truncate_text(example['row']['article'], max_length)
            for example in split_data['rows']]


train_preprocessed_text = _preprocess_articles(train_data)
val_preprocessed_text = _preprocess_articles(val_data)
test_preprocessed_text = _preprocess_articles(test_data)

# Build the vocabulary from article tokens AND summary tokens. The raw
# `*_summaries` are plain strings, so looping over them would count single
# characters instead of words; the tokenized `*_targets` lists are used instead.
all_texts_and_summaries = (
    train_preprocessed_text + val_preprocessed_text + test_preprocessed_text
    + train_targets + val_targets + test_targets
)

# Token -> number of occurrences across all splits.
word_freq = {}
for sentence in all_texts_and_summaries:
    for word in sentence:
        word_freq[word] = word_freq.get(word, 0) + 1

# Most frequent first. sorted() is stable and dicts keep insertion order, so
# ties are resolved by first occurrence and the order is reproducible.
sorted_words = sorted(word_freq, key=word_freq.get, reverse=True)

# Keep only the top vocab_size - 1 words, and add an OOV token.
sorted_words = sorted_words[:vocab_size - 1] + ['<OOV>']

voc = set(sorted_words)
# Number the frequency-ordered list rather than the set: the iteration order of
# a set of strings can differ between interpreter runs (string hash
# randomization), which would silently change every index on kernel restart.
word_to_num = {word: i for i, word in enumerate(sorted_words)}
word_to_num['<SOS>'] = len(word_to_num)

def sentence_to_indices(sentence):
    """Translate each token of ``sentence`` to its vocabulary index.

    Tokens missing from ``word_to_num`` are mapped to the ``'<OOV>'`` index.
    """
    indices = []
    for token in sentence:
        indices.append(word_to_num.get(token, word_to_num['<OOV>']))
    return indices

class wordembedding(nn.Module):
    """Thin wrapper around a single trainable ``nn.Embedding`` lookup table."""

    def __init__(self, vocab_size, embedding_dim):
        """Create a table with ``vocab_size`` rows of ``embedding_dim`` values each."""
        super().__init__()
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

    def forward(self, inputs):
        """Return the embedding row for every index in ``inputs``."""
        looked_up = self.embeddings(inputs)
        return looked_up

# Create the embedding model and its optimizer. The table has one row more
# than `voc` because <SOS> was added to word_to_num after `voc` was built.
word_model = wordembedding(len(voc) + 1, embedding_dim)
optimizer = optim.SGD(word_model.parameters(), lr=0.001)
print(word_model.embeddings.weight.size(0))


def _embed_token_lists(token_lists):
    """Embed every token list with ``word_model``.

    Unknown tokens fall back to the <OOV> index (texts and summaries are
    treated the same way, so an unseen token cannot raise KeyError).

    Returns:
        A list with one ``(len(tokens), embedding_dim)`` tensor per token list.
    """
    oov_index = word_to_num['<OOV>']
    embedded = []
    for tokens in token_lists:
        idxs = torch.tensor([word_to_num.get(token, oov_index) for token in tokens],
                            dtype=torch.long)
        embedded.append(word_model(idxs))
    return embedded


# Summaries
train_summary_embeddings_list = _embed_token_lists(train_targets)
val_summary_embeddings_list = _embed_token_lists(val_targets)
test_summary_embeddings_list = _embed_token_lists(test_targets)

# Every sequence was padded to the same length, so the lists can be stacked.
train_summary_embeddings_tensor = torch.stack(train_summary_embeddings_list)
val_summary_embeddings_tensor = torch.stack(val_summary_embeddings_list)
test_summary_embeddings_tensor = torch.stack(test_summary_embeddings_list)

# Articles
train_text_embeddings_list = _embed_token_lists(train_preprocessed_text)
val_text_embeddings_list = _embed_token_lists(val_preprocessed_text)
test_text_embeddings_list = _embed_token_lists(test_preprocessed_text)

# Embedding vectors are real-valued and must stay floating point: casting them
# to torch.long truncates every component to an integer and destroys the vectors.
train_text_embeddings_tensor = torch.stack(train_text_embeddings_list)
val_text_embeddings_tensor = torch.stack(val_text_embeddings_list)
test_text_embeddings_tensor = torch.stack(test_text_embeddings_list)


# print embeddings
print("Train Summary Embeddings:")
print(train_summary_embeddings_tensor)

print("Validation Summary Embeddings:")
print(val_summary_embeddings_tensor)

print("Test Summary Embeddings:")
print(test_summary_embeddings_tensor)

print("Train Text Embeddings:")
print(train_text_embeddings_tensor)

print("Validation Text Embeddings:")
print(val_text_embeddings_tensor)

print("Test Text Embeddings:")
print(test_text_embeddings_tensor)

# Shorter aliases used by the dataset cell.
train_summary_embeddings = train_summary_embeddings_tensor
val_summary_embeddings = val_summary_embeddings_tensor
test_summary_embeddings = test_summary_embeddings_tensor
train_text_embeddings = train_text_embeddings_tensor
val_text_embeddings = val_text_embeddings_tensor
test_text_embeddings = test_text_embeddings_tensor



In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Number of examples per batch; passed as `batch_size` to every DataLoader built in this cell.
batch_size = 32

def pad_sequence(emb_list, max_length):
    """Pad or truncate every sequence to ``max_length`` steps and stack them.

    Args:
        emb_list: Iterable of tensors whose first dimension is the sequence
            length. Any number of trailing dimensions is accepted (e.g. 1-D
            index sequences or 2-D embedding sequences), but all tensors must
            agree on them so the result can be stacked.
        max_length: Number of steps every returned sequence has.

    Returns:
        A tensor of shape ``(len(emb_list), max_length, *trailing_dims)``.
        Short sequences are zero-padded at the end, long ones are cut.

    Raises:
        RuntimeError: Raised by ``torch.stack`` when ``emb_list`` is empty or
            the trailing dimensions differ.
    """
    padded_tensors = []
    for embeddings in emb_list:
        padding_length = max_length - embeddings.shape[0]
        if padding_length > 0:
            # new_zeros copies dtype and device from the sequence, so integer
            # or GPU inputs are not mixed with a float32 CPU padding block.
            padding = embeddings.new_zeros((padding_length, *embeddings.shape[1:]))
            padded_embeddings = torch.cat([embeddings, padding], dim=0)
        else:
            padded_embeddings = embeddings[:max_length]
        padded_tensors.append(padded_embeddings)
    return torch.stack(padded_tensors)

def pad_collate_fn(batch):
    """Collate ``(source, target)`` pairs into padded batch tensors.

    Args:
        batch: Sequence of ``(src, tgt)`` tensor pairs as produced by the dataset.

    Returns:
        ``(src_padded, src_lengths, tgt_padded, tgt_lengths)``. The padded
        tensors have ``max_length`` steps (module-level setting); the length
        tensors hold the first-dimension size of each sequence before padding.
    """
    src, tgt = zip(*batch)

    # Record the original lengths before any padding or truncation happens.
    src_lengths = torch.tensor([len(x) for x in src])
    tgt_lengths = torch.tensor([len(x) for x in tgt])

    # Pad the individual sequences directly. Stacking them first would require
    # them to already share one length, which defeats the purpose of padding.
    src_padded = pad_sequence(src, max_length)
    tgt_padded = pad_sequence(tgt, max_length)

    return src_padded, src_lengths, tgt_padded, tgt_lengths


# Convert lists of embeddings to padded tensors
train_text_embeddings_padded = pad_sequence(train_text_embeddings_list, max_length)
val_text_embeddings_padded = pad_sequence(val_text_embeddings_list, max_length)
test_text_embeddings_padded = pad_sequence(test_text_embeddings_list, max_length)

train_summary_embeddings_padded = pad_sequence(train_summary_embeddings_list, max_length)
val_summary_embeddings_padded = pad_sequence(val_summary_embeddings_list, max_length)
test_summary_embeddings_padded = pad_sequence(test_summary_embeddings_list, max_length)


train_dataset = TensorDataset(train_text_embeddings, train_summary_embeddings)

val_dataset = TensorDataset(val_text_embeddings, val_summary_embeddings)

test_dataset = TensorDataset(test_text_embeddings, test_summary_embeddings)

train_dataloader = DataLoader(TensorDataset(train_text_embeddings, train_summary_embeddings), batch_size=batch_size, collate_fn=pad_collate_fn)
val_dataloader = DataLoader(TensorDataset(val_text_embeddings, val_summary_embeddings), batch_size=batch_size, collate_fn=pad_collate_fn)
test_dataloader = DataLoader(TensorDataset(test_text_embeddings, test_summary_embeddings), batch_size=batch_size, collate_fn=pad_collate_fn)



In [None]:
########  NN
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib as plt

# Get cpu, gpu or mps device for training.
# Pick the best available backend: CUDA first, then Apple MPS, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")


class Encoder(nn.Module):
    """Single-layer, batch-first LSTM encoder over already-embedded tokens."""

    def __init__(self, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, embedded_input):
        """Encode ``(batch, src_len, embedding_dim)`` input.

        Returns:
            ``encoder_outputs`` of shape ``(batch, src_len, hidden_dim)`` and the
            final ``(hidden_state, cell_state)`` pair, each ``(1, batch, hidden_dim)``.
        """
        encoder_outputs, (hidden_state, cell_state) = self.rnn(embedded_input)
        return encoder_outputs, (hidden_state, cell_state)


class Attention(nn.Module):
    """Additive attention: score_t = v . tanh(W [h ; e_t]) for every encoder step t."""

    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden_tuple, encoder_outputs):
        """Summarize the encoder outputs into one context vector per example.

        Args:
            hidden_tuple: LSTM state ``(hidden, cell)``; ``hidden`` has shape
                ``(num_layers, batch, hidden_dim)`` and its last layer is the query.
            encoder_outputs: ``(batch, src_len, hidden_dim)``.

        Returns:
            Context tensor of shape ``(batch, hidden_dim)``: the attention-weighted
            sum of the encoder outputs.
        """
        hidden, _ = hidden_tuple
        src_len = encoder_outputs.size(1)
        # Repeat the query along the source axis so it can be paired with every step.
        query = hidden[-1].unsqueeze(1).expand(-1, src_len, -1)            # (B, S, H)
        energy = torch.tanh(self.attn(torch.cat((query, encoder_outputs), dim=2)))  # (B, S, H)
        scores = torch.matmul(energy, self.v)                               # (B, S)
        weights = F.softmax(scores, dim=1)                                  # normalize over source steps
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)  # (B, H)
        return context


class Decoder(nn.Module):
    """LSTM decoder that conditions every step on one attention context vector."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(Decoder, self).__init__()
        self.rnn = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, embedded_target, hidden, encoder_outputs):
        """Decode a whole (teacher-forced) target sequence.

        Args:
            embedded_target: ``(batch, tgt_len, embedding_dim)``.
            hidden: Initial LSTM state ``(hidden, cell)``.
            encoder_outputs: ``(batch, src_len, hidden_dim)``.

        Returns:
            Log-probabilities of shape ``(batch, tgt_len, vocab_size)`` and the
            final LSTM state.
        """
        context = self.attention(hidden, encoder_outputs)                   # (B, H)
        # The same context vector is appended to the input of every target step.
        rnn_input = torch.cat((embedded_target, context.unsqueeze(1).repeat(1, embedded_target.size(1), 1)), dim=2)
        decoder_outputs, hidden = self.rnn(rnn_input, hidden)
        output_sequence = F.log_softmax(self.fc(decoder_outputs), dim=2)
        return output_sequence, hidden


class Seq2SeqAttentionWithEmbedding(nn.Module):
    """Encoder-decoder model that embeds source and target index sequences itself."""

    def __init__(self, src_vocab_size, tgt_vocab_size, src_embedding_dim, tgt_embedding_dim, hidden_dim):
        super(Seq2SeqAttentionWithEmbedding, self).__init__()
        self.embedding_source = nn.Embedding(src_vocab_size, src_embedding_dim)
        self.embedding_target = nn.Embedding(tgt_vocab_size, tgt_embedding_dim)
        self.encoder = Encoder(src_embedding_dim, hidden_dim)
        self.decoder = Decoder(tgt_embedding_dim, hidden_dim, tgt_vocab_size)
        # Kept for callers that access `model.attention`; it is the decoder's
        # attention module, not a second (unused) set of parameters.
        self.attention = self.decoder.attention

    def forward(self, src_input, tgt_input):
        """Run the model on index tensors.

        Args:
            src_input: LongTensor ``(batch, src_len)`` of source vocabulary indices.
            tgt_input: LongTensor ``(batch, tgt_len)`` of decoder-input indices.

        Returns:
            Log-probabilities ``(batch, tgt_len, tgt_vocab_size)`` and the final
            decoder LSTM state.
        """
        embedded_source = self.embedding_source(src_input)
        embedded_target = self.embedding_target(tgt_input)

        encoder_outputs, hidden = self.encoder(embedded_source)
        # Delegate to the decoder so attention and shapes are handled in one place.
        return self.decoder(embedded_target, hidden, encoder_outputs)



# Source and target share one vocabulary for now; separate ones can be
# introduced later if needed.
src_vocab_size = tgt_vocab_size = len(word_to_num)
model = Seq2SeqAttentionWithEmbedding(
    src_vocab_size,
    tgt_vocab_size,
    embedding_dim,
    embedding_dim,
    hidden_dim,
)

# Loss function and optimizer (this rebinds the `optimizer` name to the seq2seq model's Adam instance).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
model.to(device)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm as tqdm_module  # Importing tqdm with a different name to avoid conflicts

# Training loop
# An earlier cell binds `plt` to the top-level `matplotlib` package, which has
# no figure()/plot() functions; the plotting API lives in matplotlib.pyplot.
import matplotlib.pyplot as plt

num_epochs = 20

train_accuracy_list = []
val_accuracy_list = []
train_loss_list = []
val_loss_list = []

sos_index = word_to_num['<SOS>']


def _make_index_loader(source_token_lists, target_token_lists):
    """Build a DataLoader of (source indices, target indices) LongTensor pairs.

    The model owns its embedding layers, so it must be fed vocabulary indices,
    not pre-computed embedding vectors. All token lists were padded to the same
    length during preprocessing, so they convert to rectangular tensors.
    """
    src = torch.tensor([sentence_to_indices(tokens) for tokens in source_token_lists], dtype=torch.long)
    tgt = torch.tensor([sentence_to_indices(tokens) for tokens in target_token_lists], dtype=torch.long)
    return DataLoader(TensorDataset(src, tgt), batch_size=batch_size)


def _run_epoch(loader, training, description):
    """Run one pass over ``loader`` and return ``(mean batch loss, token accuracy)``.

    With ``training=True`` the model is updated after every batch; otherwise the
    pass is evaluation-only and no gradients are tracked.

    NOTE(review): <PAD> positions are included in both loss and accuracy, and the
    score tensor is (batch * tgt_len, vocab) which is large for tgt_len=512 -
    consider masking/truncating padding once the pipeline runs.
    """
    model.train(training)
    total_loss, total_batches = 0.0, 0
    correct_predictions, total_tokens = 0, 0

    with torch.set_grad_enabled(training):
        # The progress bar is shown for training passes only.
        for src_input, tgt_input in tqdm_module(loader, desc=description, unit="batch", disable=not training):
            src_input = src_input.to(device)
            tgt_input = tgt_input.to(device)

            # Teacher forcing: the decoder reads <SOS> followed by the target
            # shifted right by one step and has to predict the unshifted target.
            sos_column = torch.full((tgt_input.size(0), 1), sos_index, dtype=torch.long, device=device)
            decoder_input = torch.cat([sos_column, tgt_input[:, :-1]], dim=1)

            if training:
                optimizer.zero_grad()

            output_sequence, _ = model(src_input, decoder_input)

            # Flatten to (N, classes) scores and (N,) class indices for the loss.
            flat_scores = output_sequence.reshape(-1, output_sequence.size(-1))
            flat_targets = tgt_input.reshape(-1)
            loss = criterion(flat_scores, flat_targets)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            total_batches += 1
            correct_predictions += (flat_scores.argmax(dim=1) == flat_targets).sum().item()
            total_tokens += flat_targets.numel()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return total_loss / max(total_batches, 1), correct_predictions / max(total_tokens, 1)


def _plot_history(epochs_done):
    """Plot accuracy and loss curves for the first ``epochs_done`` epochs."""
    epochs = range(1, epochs_done + 1)
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

    acc_ax.plot(epochs, train_accuracy_list[:epochs_done], label="Train Accuracy")
    acc_ax.plot(epochs, val_accuracy_list[:epochs_done], label="Validation Accuracy")
    acc_ax.set_xlabel("Epoch")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend()
    acc_ax.set_title("Training and Validation Accuracy vs. Epoch")

    loss_ax.plot(epochs, train_loss_list[:epochs_done], label="Train Loss")
    loss_ax.plot(epochs, val_loss_list[:epochs_done], label="Validation Loss")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()
    loss_ax.set_title("Training and Validation Loss vs. Epoch")

    fig.tight_layout()
    plt.show()


train_index_loader = _make_index_loader(train_preprocessed_text, train_targets)
val_index_loader = _make_index_loader(val_preprocessed_text, val_targets)

for epoch in range(num_epochs):
    # Training
    avg_loss, train_accuracy = _run_epoch(train_index_loader, True, f"Epoch {epoch + 1}/{num_epochs}")
    train_accuracy_list.append(train_accuracy)
    train_loss_list.append(avg_loss)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Validation
    avg_loss, val_accuracy = _run_epoch(val_index_loader, False, f"Validation {epoch + 1}/{num_epochs}")
    val_accuracy_list.append(val_accuracy)
    val_loss_list.append(avg_loss)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {avg_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Graphs every 5 epochs
    if (epoch + 1) % 5 == 0:
        _plot_history(epoch + 1)

# Final graphs
_plot_history(num_epochs)


Epoch 1/20:   0%|          | 0/4 [00:00<?, ?batch/s]


ValueError: ignored

In [None]:
# Vocabulary slot reserved for the start-of-sequence token.
sos_index = word_to_num['<SOS>']

# Embed that single index; the result has one row.
sos_index_tensor = torch.tensor([sos_index])
sos_embedding = word_model(sos_index_tensor)

print("Shape of <SOS> embedding:")
print(sos_embedding.shape)

# Embed the first preprocessed training article as a whole sequence.
first_sentence_indices = sentence_to_indices(train_preprocessed_text[0])
sentence_embedding = word_model(torch.tensor(first_sentence_indices, dtype=torch.long))

print("Shape of sentence embedding:")
print(sentence_embedding.shape)


In [None]:
# Pull a single batch from the training loader to inspect what it yields.
batch_iterator = iter(train_dataloader)
first_batch = next(batch_iterator)
print(first_batch)
