In [1]:
import pickle
from sklearn.model_selection import train_test_split
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import random

# Load the preprocessed dialogue pairs
with open('../data/preprocessed_dialogue_pairs.pkl', 'rb') as file:
    preprocessed_dialogue_pairs = pickle.load(file)

In [2]:
train_pairs, val_pairs = train_test_split(preprocessed_dialogue_pairs, test_size=0.2, random_state=42)

# Define the tokenizer function
tokenizer = get_tokenizer('basic_english')

input_sequences = [pair[0] for pair in train_pairs]
target_sequences = [pair[1] for pair in train_pairs]

# Create a generator function to yield tokens
def yield_tokens(tokenized_sequences):
    for sequence in tokenized_sequences:
        yield sequence

# Create vocabulary mappings for the input and target sequences
special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
input_vocab = build_vocab_from_iterator(yield_tokens(input_sequences), specials=special_tokens)
target_vocab = build_vocab_from_iterator(yield_tokens(target_sequences), specials=special_tokens)

# Set the default index for handling unknown tokens
input_vocab.set_default_index(input_vocab['<unk>'])
target_vocab.set_default_index(target_vocab['<unk>'])

In [127]:
from collections import Counter

first_list_lengths = [len(pair[0]) for pair in train_pairs]
second_list_lengths = [len(pair[1]) for pair in train_pairs]

# Count occurrences of each length
first_list_length_counts = Counter(first_list_lengths)
second_list_length_counts = Counter(second_list_lengths)

# Calculate percentages
total_first_list = len(first_list_lengths)
total_second_list = len(second_list_lengths)

first_list_length_percentages = {length: count / total_first_list * 100 for length, count in first_list_length_counts.items()}
second_list_length_percentages = {length: count / total_second_list * 100 for length, count in second_list_length_counts.items()}

# Order by length
ordered_first_list_percentages = sorted(first_list_length_percentages.items(), key=lambda x: x[0])
ordered_second_list_percentages = sorted(second_list_length_percentages.items(), key=lambda x: x[0])

# Calculate cumulative percentages
cumulative_first_list_percentages = []
cumulative_percentage = 0
for length, percentage in ordered_first_list_percentages:
    cumulative_percentage += percentage
    cumulative_first_list_percentages.append((length, cumulative_percentage))

cumulative_second_list_percentages = []
cumulative_percentage = 0
for length, percentage in ordered_second_list_percentages:
    cumulative_percentage += percentage
    cumulative_second_list_percentages.append((length, cumulative_percentage))

# Print the ordered and cumulative percentages
print("Ordered First List Length Percentages:", [(length, round(percentage, 4)) for length, percentage in ordered_first_list_percentages])
print("Cumulative First List Length Percentages:", [(length, round(percentage, 4)) for length, percentage in cumulative_first_list_percentages])

print("Ordered Second List Length Percentages:", [(length, round(percentage, 4)) for length, percentage in ordered_second_list_percentages])
print("Cumulative Second List Length Percentages:", [(length, round(percentage, 4)) for length, percentage in cumulative_second_list_percentages])


Ordered First List Length Percentages: [(0, 0.0925), (1, 7.6061), (2, 6.271), (3, 7.297), (4, 8.8791), (5, 8.1335), (6, 7.4459), (7, 6.3816), (8, 5.3279), (9, 4.4508), (10, 3.8614), (11, 3.4062), (12, 2.9567), (13, 2.7632), (14, 2.369), (15, 2.1614), (16, 1.8444), (17, 1.6995), (18, 1.4688), (19, 1.3012), (20, 1.2415), (21, 1.0897), (22, 0.9972), (23, 0.8816), (24, 0.8342), (25, 0.7118), (26, 0.6983), (27, 0.586), (28, 0.5499), (29, 0.5121), (30, 0.4315), (31, 0.4129), (32, 0.3835), (33, 0.3503), (34, 0.3232), (35, 0.2984), (36, 0.2775), (37, 0.2465), (38, 0.2172), (39, 0.2352), (40, 0.1923), (41, 0.1856), (42, 0.1805), (43, 0.1624), (44, 0.132), (45, 0.1393), (46, 0.1207), (47, 0.1021), (48, 0.0987), (49, 0.1066), (50, 0.0914), (51, 0.0908), (52, 0.0773), (53, 0.0705), (54, 0.0643), (55, 0.0564), (56, 0.0592), (57, 0.0615), (58, 0.0575), (59, 0.0463), (60, 0.0513), (61, 0.0434), (62, 0.0406), (63, 0.04), (64, 0.0367), (65, 0.0299), (66, 0.0338), (67, 0.0271), (68, 0.0327), (69, 0.0276

In [128]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, dropout_p = 0.1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input_seq):
        embedded = self.dropout(self.embedding(input_seq))
        output, hidden = self.gru(embedded)
        return output, hidden


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, encoder_outputs, hidden, decoder_input, max_length, target_tensor = None, teaching_force_ratio = 0.5):
        batch_size = encoder_outputs.size(0)
        decoder_outputs = []
        decoder_hidden = hidden 

        for i in range(max_length):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            #decoder_output = decoder_output.detach()
            decoder_outputs.append(decoder_output)
            if target_tensor is not None and random.random() < teaching_force_ratio:
                # Teacher forcing
                decoder_input = target_tensor[:, i].unsqueeze(1).detach()
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.fc(output)
        return output, hidden

cuda


In [139]:
input_dim = len(input_vocab)
output_dim = len(target_vocab)
emb_dim = 128
hidden_dim = 128
num_layers = 1
MAX_LEN = 100
# Training loop
num_epochs = 1
batch_size = 32  # Adjust the batch size as per your requirements
print(input_dim)
print(output_dim)

48237
48841


In [130]:
# import torch

# # Clear GPU memory cache
# torch.cuda.empty_cache()

# # Iterate over all available GPUs
# for i in range(torch.cuda.device_count()):
#     current_device = torch.device(f'cuda:{i}')
#     print(current_device)
#     # Move to the current GPU
#     with torch.cuda.device(current_device):
#         # Iterate over model parameters and buffers
#         for obj in list(torch.nn.Module().parameters()) + list(torch.nn.Module().buffers()):
#             print(obj)
#             if obj is not None and obj.is_cuda:
#                 obj.data = None  # This will release the memory associated with the tensor

# # Optionally, clear the GPU memory cache again
# torch.cuda.empty_cache()


cuda:0


In [148]:
import gc 
gc.collect()

4853

In [149]:
# Instantiate the encoder and decoder and move them to the appropriate device
encoder = Encoder(input_dim, emb_dim, hidden_dim, num_layers).to(device)
decoder = Decoder(output_dim, emb_dim, hidden_dim, num_layers).to(device)

# Define the optimizer and move the parameters to the appropriate device
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
# criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss(ignore_index=target_vocab['<pad>'])


# Create batches of train pairs
train_batches = [train_pairs[i:i+batch_size] for i in range(0, len(train_pairs), batch_size)]
val_batches = [val_pairs[i:i+batch_size] for i in range(0, len(val_pairs), batch_size)]
print(len(train_pairs))
print(len(train_batches))
print(len(val_pairs))
print(len(val_batches))

177292
5541
44324
1386


In [150]:
for epoch in range(num_epochs):
    encoder.train()
    decoder.train()
    total_loss = 0
    total_mask = 0
    counter = 0
    start_time = time.time()

    for batch in train_batches[:]:
        counter += batch_size
        if counter % 10000 < batch_size:
            end_time = time.time()
            time_diff = end_time - start_time
            average_loss = total_loss / counter
            print(f"Time: {time_diff:.2f}s, Training Visited {counter // 10000 * 10000} lines, Loss: {average_loss:.4f}")

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # input_seqs = [pair[0] for pair in batch]
        # target_seqs = [pair[1] for pair in batch]
        # Add the <eos> token to the end of each input sequence
        input_seqs = [pair[0] + ['<eos>'] for pair in batch]
        # Add the <sos> token to the beginning and the <eos> token to the end of each target sequence
        target_seqs = [['<sos>'] + pair[1] + ['<eos>'] for pair in batch]

        input_max_len = max(len(seq) for seq in input_seqs)
        target_max_len = max(len(seq) for seq in target_seqs)

        input_indices = [[input_vocab[token] for token in seq] + [input_vocab['<pad>']] * (input_max_len - len(seq)) for seq in input_seqs]
        target_indices = [[target_vocab[token] for token in seq] + [target_vocab['<pad>']] * (target_max_len - len(seq)) for seq in target_seqs]

        input_seq = torch.tensor(input_indices).to(device)
        target_seq = torch.tensor(target_indices).to(device)
        # mask = (target_seq != target_vocab['<pad>']).to(device)

        # print(input_seq.shape)
        # print(target_seq.shape)
        # print(mask.sum())
        encoder_outputs, encoder_hidden = encoder(input_seq)
        decoder_input = target_seq[:, 0].unsqueeze(1).to(device)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, decoder_input, min(MAX_LEN, target_max_len), target_tensor = target_seq, teaching_force_ratio = 0.5)
        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_seq[:, :MAX_LEN].reshape(-1)
        )

        # loss = 0
        # for token in range(target_seq.shape[1]):
        #     decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
        #     # step_loss = criterion(decoder_output, target_seq[:, token])
        #     step_loss = criterion(decoder_output[mask[:, token]], target_seq[:, token][mask[:, token]])
        #     loss += step_loss
        #     decoder_input = target_seq[:, token].unsqueeze(1)
        total_loss += loss.item()
        # total_mask += mask.sum()
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

    # average_loss = total_loss / total_mask
    average_loss = total_loss / len(train_pairs)
    # average_loss = total_loss / 10
    end_time = time.time()
    time_diff = end_time - start_time
    print(f"Time: {time_diff:.2f}s, Epoch: {epoch+1}, Training Loss: {average_loss:.4f}")

    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        total_loss = 0
        total_mask = 0
        counter = 0
        for batch in val_batches[:]:
            counter += batch_size
            if counter % 10000 < batch_size:
                end_time = time.time()
                time_diff = end_time - start_time
                average_loss = total_loss / counter
                print(f"Time: {time_diff:.2f}s, Validation Visited {counter // 10000 * 10000} lines, Loss: {average_loss:.4f}")

            # input_seqs = [pair[0] for pair in batch]
            # target_seqs = [pair[1] for pair in batch]
            # Add the <eos> token to the end of each input sequence
            input_seqs = [pair[0] + ['<eos>'] for pair in batch]
            # Add the <sos> token to the beginning and the <eos> token to the end of each target sequence
            target_seqs = [['<sos>'] + pair[1] + ['<eos>'] for pair in batch]

            input_max_len = max(len(seq) for seq in input_seqs)
            target_max_len = max(len(seq) for seq in target_seqs)

            input_indices = [[input_vocab[token] for token in seq] + [input_vocab['<pad>']] * (input_max_len - len(seq)) for seq in input_seqs]
            target_indices = [[target_vocab[token] for token in seq] + [target_vocab['<pad>']] * (target_max_len - len(seq)) for seq in target_seqs]

            input_seq = torch.tensor(input_indices).to(device)
            target_seq = torch.tensor(target_indices).to(device)
            # for simplicity, mask validation in the same way. Ignore all <pad>.
            # mask = (target_seq != target_vocab['<pad>']).to(device)
            encoder_outputs, encoder_hidden = encoder(input_seq)
            decoder_input = torch.tensor([target_vocab["<sos>"]] * input_seq.shape[0]).unsqueeze(1).to(device)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, decoder_input, min(MAX_LEN, target_max_len), target_tensor = None)
            loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                target_seq[:, :MAX_LEN].reshape(-1)
            )

            # loss = 0
            # for token in range(target_seq.shape[1]):
            #     decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
            #     # step_loss = criterion(decoder_output.squeeze(1), target_seq[:, token])
            #     step_loss = criterion(decoder_output[mask[:, token]], target_seq[:, token][mask[:, token]])
            #     loss += step_loss
            #     decoder_input = torch.argmax(decoder_output, dim=1).unsqueeze(1)
            total_loss += loss.item()
            # total_mask += mask.sum()

        # average_loss = total_loss / total_mask
        average_loss = total_loss / len(train_pairs)
        # average_loss = total_loss / 10
        end_time = time.time()
        time_diff = end_time - start_time
        print(f"Time: {time_diff:.2f}s, Epoch: {epoch+1}, Validation Loss: {average_loss:.4f}")

Time: 32.05s, Training Visited 10000 lines, Loss: 0.1909
Time: 65.53s, Training Visited 20000 lines, Loss: 0.1844
Time: 100.11s, Training Visited 30000 lines, Loss: 0.1804
Time: 131.05s, Training Visited 40000 lines, Loss: 0.1777
Time: 163.56s, Training Visited 50000 lines, Loss: 0.1760
Time: 198.80s, Training Visited 60000 lines, Loss: 0.1746
Time: 231.64s, Training Visited 70000 lines, Loss: 0.1736


KeyboardInterrupt: 