1. Load all the necessary functions

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from sklearn.model_selection import train_test_split
import requests
import torch.nn as nn
import torch.optim as optim

2. import CNN/Daily Mail dataset, from https://huggingface.co/datasets/cnn_dailymail

In [None]:
# follow the instruction from https://huggingface.co/docs/datasets-server/quick_start
# Ziyu Geng

# This is English version now, sorry for import wrong dataset.

API_URL = "https://datasets-server.huggingface.co/splits?dataset=cnn_dailymail"  # contains train, val, and test
def query():
    response = requests.get(API_URL)
    return response.json()
data = query()
print(data)

# train
API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=train&offset=0&limit=100"  # train
def train_query():
    response = requests.get(API_URL)
    return response.json()
train_data = train_query()
print(train_data)

# val
API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=validation&offset=0&limit=100"  # validation
def val_query():
    response = requests.get(API_URL)
    return response.json()
val_data = val_query()
print(val_data)

# test
API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=test&offset=0&limit=100"  # test
def test_query():
    response = requests.get(API_URL)
    return response.json()
test_data = test_query()
print(test_data)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

#1 tokenize text to individual words or subwords, remove stopwords, punctuation and special characters, and make them all lowercase
vocabulary = set()

def preprocess_text(text):
    # tokenize text into words
    words = word_tokenize(text)

    # remove punctuation and special characters
    table = str.maketrans('', '', string.punctuation)
    words = [word.translate(table) for word in words]

    # convert words to lowercase
    words = [word.lower() for word in words]

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # remove empty strings/single characters
    words = [word for word in words if len(word) > 1]

    global vocabulary
    vocabulary.update(words)

    return words

#2 pad/truncate text to make all samples have the same length

def pad_or_truncate_text(text, max_length):
    # preprocess the text
    preprocessed_text = preprocess_text(text)

    # truncate text if longer than max_length
    if len(preprocessed_text) > max_length:
        preprocessed_text = preprocessed_text[:max_length]

    # pad text if shorter than max_length
    elif len(preprocessed_text) < max_length:
        padding_length = max_length - len(preprocessed_text)
        preprocessed_text.extend(['<PAD>'] * padding_length)

    return preprocessed_text

max_length = 800  # <-----should be roughly how many words/tokens are in each article

#
train_summaries = [example['row']['highlights'] for example in train_data['rows']]
val_summaries = [example['row']['highlights'] for example in val_data['rows']]
test_summaries = [example['row']['highlights'] for example in test_data['rows']]

# Preprocess target summaries for each split
train_targets = [preprocess_text(summary) for summary in train_summaries]
val_targets = [preprocess_text(summary) for summary in val_summaries]
test_targets = [preprocess_text(summary) for summary in test_summaries]

train_texts = []
val_texts = []
test_texts = []

for example in train_data['rows']:
    article_text = example['row']['article']
    preprocessed_text = pad_or_truncate_text(article_text, max_length)
    train_texts.append(preprocessed_text)

for example in test_data['rows']:
    article_text = example['row']['article']
    preprocessed_text = pad_or_truncate_text(article_text, max_length)
    test_texts.append(preprocessed_text)

for example in val_data['rows']:
    article_text = example['row']['article']
    preprocessed_text = pad_or_truncate_text(article_text, max_length)
    val_texts.append(preprocessed_text)

vocabulary = set()
for text in train_texts:
    vocabulary.update(text)
vocab_size = len(vocabulary)

print("Vocabulary size:", vocab_size)

print(train_texts)
print(val_texts)
print(test_texts)





In [None]:
# Represent text as word embeddigs

# -Word emeddings examples are Word2Vec, GloVe, FastText and represent each word as dense vector
# Ziyu Geng, follow https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html.

# sorry guys, I did not find the Word2Vec pytorch material, so I did not use it, if you guys find something error, feel free to correct.

embedding_dim = 200
torch.manual_seed(1)

# list of train texts
train_preprocessed_text = []
for example in train_data['rows']:
  article_text = example['row']['article']
  train_preprocessed_text.append(pad_or_truncate_text(article_text, max_length))

for i, sentence in enumerate(train_preprocessed_text):
    print(f"DEBUG_OUTPUT: Training Example {i + 1}:")
    print(sentence)
    print("=" * 50)

val_preprocessed_text = []
for example in val_data['rows']:
  article_text = example['row']['article']
  val_preprocessed_text.append(pad_or_truncate_text(article_text, max_length))

for i, sentence in enumerate(val_preprocessed_text):
    print(f"DEBUG_OUTPUT: Validation Example {i + 1}:")
    print(sentence)
    print("=" * 50)

# list of test texts
test_preprocessed_text = []
for example in test_data['rows']:
  article_text = example['row']['article']
  test_preprocessed_text.append(pad_or_truncate_text(article_text, max_length))

for i, sentence in enumerate(test_preprocessed_text):
    print(f"DEBUG_OUTPUT: Testing Example {i + 1}:")
    print(sentence)
    print("=" * 50)

# build vocabulary
voc = set()
all_texts = train_preprocessed_text + val_preprocessed_text + test_preprocessed_text
for sentence in all_texts:
    for word in sentence:
        voc.add(word)


# make word to number
word_to_num = {}
i = 0
for word in voc:
    word_to_num[word] = i
    i += 1

class wordembedding(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super(wordembedding, self).__init__()
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)

  def forward(self, inputs):
    return self.embeddings(inputs)

# Create the model and the optimizer
word_model = wordembedding(len(voc), embedding_dim)
optimizer = optim.SGD(word_model.parameters(), lr=0.001)

# loop train_preprocessed_text and generate embeddings
for sentence in train_preprocessed_text:
  sentence_idxs = []
  for word in sentence:
    id = word_to_num[word]  # Get the index of the word
    sentence_idxs.append(id)  # Append the index to the list

  sentence_idxs = torch.tensor(sentence_idxs, dtype=torch.long)  # Convert the list to tensor outside the loop
  word_model.zero_grad()
  embeddings = word_model(sentence_idxs)

  print(embeddings)

for sentence in val_preprocessed_text:
   sentence_idxs = []
   for word in sentence:
    id = word_to_num[word]  # Get the index of the word
    sentence_idxs.append(id)  # Append the index to the list

   sentence_idxs = torch.tensor(sentence_idxs, dtype=torch.long)  # Convert the list to tensor outside the loop
   word_model.zero_grad()
   embeddings = word_model(sentence_idxs)

   print(embeddings)


for sentence in test_preprocessed_text:
   sentence_idxs = []
   for word in sentence:
    id = word_to_num[word]  # Get the index of the word
    sentence_idxs.append(id)  # Append the index to the list

   sentence_idxs = torch.tensor(sentence_idxs, dtype=torch.long)  # Convert the list to tensor outside the loop
   word_model.zero_grad()
   embeddings = word_model(sentence_idxs)

   print(embeddings)

train_embeddings_list = []
val_embeddings_list = []
test_embeddings_list = []

# loop train_preprocessed_text and generate embeddings
for sentence in train_preprocessed_text:
    sentence_idxs = []
    for word in sentence:
        id = word_to_num[word]  # Get the index of the word
        sentence_idxs.append(id)  # Append the index to the list

    sentence_idxs = torch.tensor(sentence_idxs, dtype=torch.long)  # Convert the list to tensor outside the loop
    word_model.zero_grad()
    embeddings = word_model(sentence_idxs)
    train_embeddings_list.append(embeddings.detach())  # Store the embeddings

# loop val_preprocessed_text and generate embeddings
for sentence in val_preprocessed_text:
    sentence_idxs = []
    for word in sentence:
        id = word_to_num[word]  # Get the index of the word
        sentence_idxs.append(id)  # Append the index to the list

    sentence_idxs = torch.tensor(sentence_idxs, dtype=torch.long)  # Convert the list to tensor outside the loop
    word_model.zero_grad()
    embeddings = word_model(sentence_idxs)
    val_embeddings_list.append(embeddings.detach())  # Store the embeddings

# loop test_preprocessed_text and generate embeddings
for sentence in test_preprocessed_text:
    sentence_idxs = []
    for word in sentence:
        id = word_to_num[word]  # Get the index of the word
        sentence_idxs.append(id)  # Append the index to the list

    sentence_idxs = torch.tensor(sentence_idxs, dtype=torch.long)  # Convert the list to tensor outside the loop
    word_model.zero_grad()
    embeddings = word_model(sentence_idxs)
    test_embeddings_list.append(embeddings.detach())  # Store the embeddings

# Convert lists to PyTorch tensors
train_embeddings_tensor = torch.stack(train_embeddings_list)
val_embeddings_tensor = torch.stack(val_embeddings_list)
test_embeddings_tensor = torch.stack(test_embeddings_list)





In [5]:
import matplotlib.pyplot as plt
import tqdm
from tqdm import tqdm

In [None]:
# define NN architecture

# some suggested are RNN, LSTM, or Transformer based models

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

embedding_dim = 10
hidden_dim = 256

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, input_sequence):
        embedded_input = self.embedding(input_sequence)
        encoder_outputs, (hidden_state, cell_state) = self.rnn(embedded_input)
        return encoder_outputs, (hidden_state, cell_state)

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(1)
        h = hidden.repeat(max_len, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(1, 2)
        attn_scores = F.softmax(torch.bmm(h, self.attn(encoder_outputs)), dim=2)
        context = torch.bmm(attn_scores, encoder_outputs).squeeze(1)
        return context

class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_sequence, hidden, encoder_outputs):
        embedded_input = self.embedding(input_sequence)
        context = self.attention(hidden, encoder_outputs)
        rnn_input = torch.cat((embedded_input, context.unsqueeze(1)), dim=2)
        decoder_outputs, hidden = self.rnn(rnn_input, hidden)
        output_sequence = F.log_softmax(self.fc(decoder_outputs.squeeze(1)), dim=1)
        return output_sequence, hidden

class Seq2SeqAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Seq2SeqAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = Encoder(vocab_size, embedding_dim, hidden_dim)
        self.decoder = Decoder(vocab_size, embedding_dim, hidden_dim)

    def forward(self, source_sequence, target_sequence):
        embedded_source = self.embedding(source_sequence)
        embedded_target = self.embedding(target_sequence)

        encoder_outputs, hidden = self.encoder(embedded_source)
        batch_size, target_length = target_sequence.size(0), target_sequence.size(1)
        outputs = torch.zeros(batch_size, target_length, vocab_size).to(target_sequence.device)
        decoder_input = embedded_target[:, 0].unsqueeze(1)
        for t in range(1, target_length):
            output, hidden = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[:, t] = output
            decoder_input = embedded_target[:, t].unsqueeze(1)
        return outputs

# train_embeddings_tensor, val_embeddings_tensor, test_embeddings_tensor

def text_to_indices(text, word_to_num):
    return [word_to_num[word] for word in text]

train_indices = [text_to_indices(text, word_to_num) for text in train_texts]
val_indices = [text_to_indices(text, word_to_num) for text in val_texts]
test_indices = [text_to_indices(text, word_to_num) for text in test_texts]

# Convert the lists of indices to tensors
train_texts_tensor = torch.tensor(train_indices, dtype=torch.long)
val_texts_tensor = torch.tensor(val_indices, dtype=torch.long)
test_texts_tensor = torch.tensor(test_indices, dtype=torch.long)

print("Train Texts Tensor Shape:", train_texts_tensor.shape)
print("Val Texts Tensor Shape:", val_texts_tensor.shape)
print("Test Texts Tensor Shape:", test_texts_tensor.shape)

train_input_sequences = train_texts_tensor[:, :-1]  # Remove the last token from the input
train_target_sequences = train_texts_tensor[:, 1:]   # Shift the target by one time step

val_input_sequences = val_texts_tensor[:, :-1]
val_target_sequences = val_texts_tensor[:, 1:]

test_input_sequences = test_texts_tensor[:, :-1]
test_target_sequences = test_texts_tensor[:, 1:]


final_train_dataset = TensorDataset(train_embeddings_tensor)
final_val_dataset = TensorDataset(val_embeddings_tensor)
final_test_dataset = TensorDataset(test_embeddings_tensor)

print("Final Train Dataset Size:", len(final_train_dataset))
print("Final Val Dataset Size:", len(final_val_dataset))
print("Final Test Dataset Size:", len(final_test_dataset))

batch_size = 32
train_loader = DataLoader(final_train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(final_val_dataset, batch_size=batch_size)
test_loader = DataLoader(final_test_dataset, batch_size=batch_size, shuffle = False)

# Verify Vocabulary Size and Indices
print("Vocabulary Size:", vocab_size)
print("Train Texts Sample:", train_texts[:2])
print("Val Texts Sample:", val_texts[:2])
print("Test Texts Sample:", test_texts[:2])

print("Train Indices Sample:", train_indices[:2])  # Verify the index representation of the texts

# Inspect Input Sequences and Targets
print("Train Texts Tensor Shape:", train_texts_tensor.shape)
print("Val Texts Tensor Shape:", val_texts_tensor.shape)
print("Test Texts Tensor Shape:", test_texts_tensor.shape)

data = next(iter(train_loader))
print("Train Batch Shape:", data[0].shape)

# instantiate model
vocab_size = 9094
model = Seq2SeqAttention(vocab_size, embedding_dim, hidden_dim)

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [8]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

num_epochs = 20
plot_interval = 5

train_loss_list = []
val_loss_list = []
train_accuracy_list = []
val_accuracy_list = []

def calculate_accuracy(output_sequence, target_sequence):
    _, predicted = torch.max(output_sequence, 1)
    correct = (predicted == target_sequence).sum().item()
    total = target_sequence.size(0)
    accuracy = correct / total
    return accuracy

for epoch in tqdm(range(num_epochs), desc="Epochs"):
    model.train()
    total_loss = 0.0
    total_accuracy = 0.0

    # train
    for total_batch in tqdm(train_loader, desc="Training Batches", leave=False):
        optimizer.zero_grad()

        # Unpack the total_batch into source and target
        source_batch, target_batch = total_batch

        source_sequence = source_batch.long()
        target_sequence = target_batch.long()

        max_index_source = torch.max(source_sequence).item()
        print("Max index in source_sequence:", max_index_source)
        max_index_target = torch.max(target_sequence).item()
        print("Max index in target_sequence:", max_index_target)
        print("Vocab size:", vocab_size)

        output_sequence = model(source_sequence, target_sequence)

        max_index = torch.max(source_sequence).item()
        print("Max index:", max_index)


        # flatten
        output_sequence = output_sequence.view(-1, vocab_size)
        target_sequence = target_batch.view(-1)

        loss = criterion(output_sequence, target_sequence)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_accuracy += calculate_accuracy(output_sequence, target_sequence)

    avg_loss = total_loss / len(train_loader)
    avg_accuracy = total_accuracy / len(train_loader)
    train_loss_list.append(avg_loss)
    train_accuracy_list.append(avg_accuracy)

    # val loop
    model.eval()
    with torch.no_grad():
        total_val_loss = 0.0
        total_val_accuracy = 0.0

        for source_batch, target_batch in tqdm(val_loader, desc="Validation Batches", leave=False):

            source_sequence = source_batch.long()
            target_sequence = target_batch.long()

            max_index_source = torch.max(source_sequence).item()
            print("Max index in source_sequence:", max_index_source)
            max_index_target = torch.max(target_sequence).item()
            print("Max index in target_sequence:", max_index_target)
            print("Vocab size:", vocab_size)

            output_sequence = model(source_sequence, target_sequence)

            # flatten
            output_sequence = output_sequence.view(-1, vocab_size)
            target_sequence = target_batch.view(-1)

            val_loss = criterion(output_sequence, target_sequence)
            total_val_loss += val_loss.item()
            total_val_accuracy += calculate_accuracy(output_sequence, target_sequence)

        avg_val_loss = total_val_loss / len(val_loader)
        avg_val_accuracy = total_val_accuracy / len(val_loader)
        val_loss_list.append(avg_val_loss)
        val_accuracy_list.append(avg_val_accuracy)

    # plot every 5 epochs
    if (epoch + 1) % plot_interval == 0 or epoch == num_epochs - 1:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Training Accuracy: {avg_accuracy:.4f}, Val Accuracy: {avg_val_accuracy:.4f}")

# final graphs
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(range(1, num_epochs + 1), train_loss_list, label='Training Loss')
plt.plot(range(1, num_epochs + 1), val_loss_list, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(1, num_epochs + 1), train_accuracy_list, label='Training Accuracy')
plt.plot(range(1, num_epochs + 1), val_accuracy_list, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()







Epochs:   0%|          | 0/20 [00:00<?, ?it/s]
Training Batches:   0%|          | 0/4 [00:00<?, ?it/s][A
Epochs:   0%|          | 0/20 [00:00<?, ?it/s]


ValueError: ignored