In [4]:
import wget, os, gzip, pickle, random, re, sys
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np

IMDB_URL = 'http://dlvu.github.io/data/imdb.{}.pkl.gz'
IMDB_FILE = 'imdb.{}.pkl.gz'

PAD, START, END, UNK = '.pad', '.start', '.end', '.unk'

def load_imdb(final=False, val=5000, seed=0, voc=None, char=False):

    cst = 'char' if char else 'word'

    imdb_url = IMDB_URL.format(cst)
    imdb_file = IMDB_FILE.format(cst)

    if not os.path.exists(imdb_file):
        wget.download(imdb_url)

    with gzip.open(imdb_file) as file:
        sequences, labels, i2w, w2i = pickle.load(file)

    if voc is not None and voc < len(i2w):
        nw_sequences = {}

        i2w = i2w[:voc]
        w2i = {w: i for i, w in enumerate(i2w)}

        mx, unk = voc, w2i['.unk']
        for key, seqs in sequences.items():
            nw_sequences[key] = []
            for seq in seqs:
                seq = [s if s < mx else unk for s in seq]
                nw_sequences[key].append(seq)

        sequences = nw_sequences

    if final:
        return (sequences['train'], labels['train']), (sequences['test'], labels['test']), (i2w, w2i), 2

    # Make a validation split
    random.seed(seed)

    x_train, y_train = [], []
    x_val, y_val = [], []

    val_ind = set( random.sample(range(len(sequences['train'])), k=val) )
    for i, (s, l) in enumerate(zip(sequences['train'], labels['train'])):
        if i in val_ind:
            x_val.append(s)
            y_val.append(l)
        else:
            x_train.append(s)
            y_train.append(l)

    return (x_train, y_train), \
           (x_val, y_val), \
           (i2w, w2i), 2


def gen_sentence(sent, g):

    symb = '_[a-z]*'

    while True:

        match = re.search(symb, sent)
        if match is None:
            return sent

        s = match.span()
        sent = sent[:s[0]] + random.choice(g[sent[s[0]:s[1]]]) + sent[s[1]:]

def gen_dyck(p):
    open = 1
    sent = '('
    while open > 0:
        if random.random() < p:
            sent += '('
            open += 1
        else:
            sent += ')'
            open -= 1

    return sent

def gen_ndfa(p):

    word = random.choice(['abc!', 'uvw!', 'klm!'])

    s = ''
    while True:
        if random.random() < p:
            return 's' + s + 's'
        else:
            s+= word

def load_brackets(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='dyck')

def load_ndfa(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='ndfa')

def load_toy(n=50_000, char=True, seed=0, name='lang'):

    random.seed(0)

    if name == 'lang':
        sent = '_s'

        toy = {
            '_s': ['_s _adv', '_np _vp', '_np _vp _prep _np', '_np _vp ( _prep _np )', '_np _vp _con _s' , '_np _vp ( _con _s )'],
            '_adv': ['briefly', 'quickly', 'impatiently'],
            '_np': ['a _noun', 'the _noun', 'a _adj _noun', 'the _adj _noun'],
            '_prep': ['on', 'with', 'to'],
            '_con' : ['while', 'but'],
            '_noun': ['mouse', 'bunny', 'cat', 'dog', 'man', 'woman', 'person'],
            '_vp': ['walked', 'walks', 'ran', 'runs', 'goes', 'went'],
            '_adj': ['short', 'quick', 'busy', 'nice', 'gorgeous']
        }

        sentences = [ gen_sentence(sent, toy) for _ in range(n)]
        sentences.sort(key=lambda s : len(s))

    elif name == 'dyck':

        sentences = [gen_dyck(7./16.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    elif name == 'ndfa':

        sentences = [gen_ndfa(1./4.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    else:
        raise Exception(name)

    tokens = set()
    for s in sentences:

        if char:
            for c in s:
                tokens.add(c)
        else:
            for w in s.split():
                tokens.add(w)

    i2t = [PAD, START, END, UNK] + list(tokens)
    t2i = {t:i for i, t in enumerate(i2t)}

    sequences = []
    for s in sentences:
        if char:
            tok = list(s)
        else:
            tok = s.split()
        sequences.append([t2i[t] for t in tok])

    return sequences, (i2t, t2i)

In [44]:

#x_train, (i2w, w2i) = load_ndfa(n=150_000)

x_train, (i2w, w2i) = load_brackets(n=150_000)

In [45]:
w2i

{'.pad': 0, '.start': 1, '.end': 2, '.unk': 3, ')': 4, '(': 5}

In [46]:


# Add start and end tokens to each sequence
x_train_preprocessed = [[w2i['.start']] + seq + [w2i['.end']] for seq in x_train]


# Set a maximum number of tokens per batch
max_tokens_per_batch = 50000  # Adjust this value as needed

# Split data into batches
batches = []
current_batch = []
current_batch_tokens = 0

for seq in x_train_preprocessed:
    # Check if adding the sequence exceeds the maximum tokens per batch
    if current_batch_tokens + len(seq) <= max_tokens_per_batch:
        current_batch.append(seq)
        current_batch_tokens += len(seq)
    else:
        batches.append(current_batch)
        current_batch = [seq]
        current_batch_tokens = len(seq)

# Add the last batch
if current_batch:
    batches.append(current_batch)

# Now, 'batches' contains lists of sequences with start and end tokens, respecting the maximum tokens per batch

# Pad sequences within each batch to the same length
padded_batches = [pad_sequence([torch.tensor(s) for s in batch], batch_first=True, padding_value=w2i['.pad']) for batch in batches]

# Convert lists of batches to PyTorch tensors
tensor_batches = [torch.tensor(batch, dtype=torch.long) for batch in padded_batches]

# Now, 'tensor_batches' contains tensors of sequences with start and end tokens, all with the same length

# Example: Print the first batch
tensor_batches


  tensor_batches = [torch.tensor(batch, dtype=torch.long) for batch in padded_batches]


[tensor([[1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         ...,
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2]]),
 tensor([[1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         ...,
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2]]),
 tensor([[1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         ...,
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2]]),
 tensor([[1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         ...,
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2]]),
 tensor([[1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         ...,
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2]]),
 tensor([[1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         ...,
         [1, 5, 4, 2],
         [1, 5, 4, 2],
         [1, 5, 4, 2]]),
 tensor([[1, 5, 4, 2, 0, 0],
         [1, 5, 4, 2, 0, 0],
         [1, 5, 4,

In [47]:

# Remove the first column (start token) from each sequence in tensor_batches to get target_batches
target_batches = [batch[:, 1:] for batch in tensor_batches]

# Append a column of zeros to each sequence in target_batches
target_batches = [torch.cat((batch, torch.zeros(batch.size(0), 1, dtype=torch.long)), dim=1) for batch in target_batches]

# Now, target_batches contains tensors of sequences with the same length as input sequences, but shifted one token to the left

# Example: Print the first target batch
print(target_batches[2])

tensor([[5, 4, 2, 0],
        [5, 4, 2, 0],
        [5, 4, 2, 0],
        ...,
        [5, 4, 2, 0],
        [5, 4, 2, 0],
        [5, 4, 2, 0]])


In [61]:
# Define the LSTM model
class NDFA_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size):
        super(NDFA_LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        output = self.linear(lstm_out)
        return output

# Initialize the model
vocab_size = len(i2w)
emb_size = 32
hidden_size = 16

model = NDFA_LSTM(vocab_size, emb_size, hidden_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 represents the padding index
optimizer = optim.Adam(model.parameters(), lr=0.0005)

#tensor_batches = [torch.tensor(batch, dtype=torch.long) for batch in padded_batches]
#target_batches = [torch.tensor(batch, dtype=torch.long) for batch in target_batches]


num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for input_batch, target_batch in zip(tensor_batches, target_batches):
        optimizer.zero_grad()

        # Forward pass
        output = model(input_batch)

        # Reshape output and target to (batch_size * sequence_length, vocab_size)
        output = output.view(-1, vocab_size)
        target = target_batch.view(-1)

        # Compute the loss
        loss = criterion(output, target)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')





Epoch [1/10], Loss: 1.3987
Epoch [2/10], Loss: 1.2094
Epoch [3/10], Loss: 1.0598
Epoch [4/10], Loss: 0.9559
Epoch [5/10], Loss: 0.8993
Epoch [6/10], Loss: 0.8691
Epoch [7/10], Loss: 0.8503
Epoch [8/10], Loss: 0.8368
Epoch [9/10], Loss: 0.8250
Epoch [10/10], Loss: 0.8148


<class 'torch.nn.parameter.Parameter'> torch.Size([6, 32])
<class 'torch.nn.parameter.Parameter'> torch.Size([64, 32])
<class 'torch.nn.parameter.Parameter'> torch.Size([64, 16])
<class 'torch.nn.parameter.Parameter'> torch.Size([64])
<class 'torch.nn.parameter.Parameter'> torch.Size([64])
<class 'torch.nn.parameter.Parameter'> torch.Size([6, 16])
<class 'torch.nn.parameter.Parameter'> torch.Size([6])


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter

# Assuming tensor_batches and target_batches for training and validation are already prepared

# Convert lists of batches to PyTorch tensors
tensor_batches_train = [torch.tensor(batch, dtype=torch.long) for batch in padded_batches]
target_batches_train = [torch.tensor(batch, dtype=torch.long) for batch in target_batches]

#tensor_batches_val = [torch.tensor(batch, dtype=torch.long) for batch in padded_batches_val]
#target_batches_val = [torch.tensor(batch, dtype=torch.long) for batch in target_batches_val]

# Define the LSTM model
class NDFA_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, num_layers=1, dropout=0.0):
        super(NDFA_LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        output = self.linear(lstm_out)
        return output

# Initialize the model
vocab_size = len(i2w)
emb_size = 32
hidden_size = 16

model = NDFA_LSTM(vocab_size, emb_size, hidden_size)


# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')  # Assuming 0 represents the padding index
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Set up TensorBoard for visualization
writer = SummaryWriter()

# Training loop
num_epochs = 50  # Adjust as needed

for epoch in range(num_epochs):
    # Training
    model.train()
    total_tokens = 0
    total_loss = 0
    total_norm = 0

    for input_batch, target_batch in zip(tensor_batches_train, target_batches_train):
        optimizer.zero_grad()

        # Forward pass
        output = model(input_batch)

        # Reshape output and target to (batch_size * sequence_length, vocab_size)
        output = output.view(-1, vocab_size)
        target = target_batch.view(-1)

        # Compute the loss
        loss = criterion(output, target)

        # Backward pass and optimization
        loss.backward()

        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        # Track total tokens and loss
        total_tokens += target_batch.numel()
        total_loss += loss.item()

        # Track gradient norm
        total_norm += nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # Calculate average training loss for the epoch
    average_train_loss = total_loss / total_tokens

    # Log training loss, gradient norm, and epoch
    writer.add_scalar('Train/Loss', average_train_loss, epoch)
    writer.add_scalar('Train/Gradient_Norm', total_norm, epoch)

    print(f'Training Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_train_loss:.4f}, Average Gradient Norm: {total_norm:.4f}')

# Close the TensorBoard writer
writer.close()

ModuleNotFoundError: No module named 'tensorboard'

In [69]:
import torch
import torch.nn.functional as F
import torch.distributions as dist

def sample(lnprobs, temperature=1.0):
    """
    Sample an element from a categorical distribution
    :param lnprobs: Outcome logits
    :param temperature: Sampling temperature. 1.0 follows the given distribution, 0.0 returns the maximum probability element.
    :return: The index of the sampled element.
    """
    if temperature == 0.0:
        return lnprobs.argmax()
    p = F.softmax(lnprobs / temperature, dim=0)
    cd = dist.Categorical(p)
    return cd.sample()

def generate_sequence(model, seed_sequence, max_length=100, temperature=1.0, num_samples=10):
    model.eval()

    # Convert seed_sequence to tensor and add a singleton batch dimension
    seed_sequence_tensor = torch.tensor([seed_sequence], dtype=torch.long)
    
    num_epochs = 10
    
    for epoch in range(num_epochs):
        print(f"Generated Samples after Epoch {epoch + 1}:")

        for _ in range(num_samples):
            generated_sequence = seed_sequence.copy()

            with torch.no_grad():
                for _ in range(max_length):
                    # Forward pass
                    output = model(seed_sequence_tensor)

                    # Get the last predicted logits
                    last_logits = output[0, -1, :]

                    # Sample the next token using the provided sample function
                    next_token = sample(last_logits, temperature).item()

                    # Append the sampled token to the generated sequence
                    generated_sequence.append(next_token)

                    # Check for the end token
                    if next_token == w2i['.end']:
                        break

                    # Prepare the next input for the model
                    seed_sequence_tensor = torch.tensor([[next_token]], dtype=torch.long)

            # Convert the generated sequence back to tokens
            generated_tokens = [i2w[token] for token in generated_sequence]

            # Print the generated sequence
            print(" ".join(generated_tokens))

# Example usage:
seed_sequence = [w2i['.start'], w2i['('], w2i['('], w2i[')']]
generate_sequence(model, seed_sequence, max_length=40, temperature=0.15, num_samples=10)



Generated Samples after Epoch 1:
.start ( ( ) ) .end
.start ( ( ) ( ) ( ( ( ( ) ( ) ( ) .end
.start ( ( ) ( ) ( ) ( ( ( ) ( ( ( ) ( ) ( ) ) ) ( ( ( ( ( ( ( ) ( ) ) ( ) ( ) ) ( ( ) ( ) (
.start ( ( ) ) ( ) ( ) ( ) .end
.start ( ( ) .end
.start ( ( ) ( ( ) ( ( ( ( ) ( ) ( ) ( ( ) .end
.start ( ( ) ( ) ( ) ( ) ( ( ( ) ( ) .end
.start ( ( ) .end
.start ( ( ) ) ( ) ( ) ( ) ( ) ( ( ) ( ( ) ( ) ( ) ( ) .end
.start ( ( ) ( ) ( ( ) ) .end
Generated Samples after Epoch 2:
.start ( ( ) .end
.start ( ( ) ( ) ( ) .end
.start ( ( ) .end
.start ( ( ) ) ( ( ( ) .end
.start ( ( ) ( ( ) .end
.start ( ( ) ( ) ( ( ( ) ( ( ( ) ( ( ( ) .end
.start ( ( ) .end
.start ( ( ) ( ) ( ( ( ) .end
.start ( ( ) ( ) ( ) ( ) .end
.start ( ( ) .end
Generated Samples after Epoch 3:
.start ( ( ) ( ) ( ) ( ( ( ) ( ) ( ( ) ( ) ( ( ( ( ) ( ) ( ) .end
.start ( ( ) ( ) ( ( ) ( ( ( ) .end
.start ( ( ) ( ( ( ) ( ) .end
.start ( ( ) ( ) ( ( ( ( ( ) ( ) ( ) .end
.start ( ( ) ( ) ( ( ( ) ( ) ( ( ( ( ) ( ( ) ( ( ) ( ( ) ) .end
.start