In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 512 # max number of words going into the model?

epochs = 50

learning_rate = 0.5e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

n_embd = 576  # It's square root should be divisible by n_head
sqrt_d = torch.sqrt(torch.tensor(n_embd)).int().item()
n_head = 4

n_layer = 6
dropout = 0.01

In [2]:
class Feebler(nn.Module):
    ''' 
    T: Number of words going into the model
    C: Embedding dimension
    B: Batch size
    
    input: B, T, C
    output: B, T, sqrt(C)
    '''
    def __init__(self, sqrt_d):
        super().__init__()
        self.weights = nn.Parameter(
            torch.randn(sqrt_d, sqrt_d, block_size)
        )
        self.sqrt_d = sqrt_d

    def forward(self, data):
        # Data is of shape (b, n, d)
        data_reshaped = data.view(batch_size, n_embd, block_size)  # set up data for feebler
        data_reshaped = data.view(batch_size, self.sqrt_d, self.sqrt_d, block_size)  # reshape incoming data
        product = data_reshaped * self.weights  # multiply data with weights
        # perform columnwise sum inside each window
        updated_product = torch.sum(product, dim=2, keepdim=False)  # finally we have converted from dxn to sqrt(d)xn
        return updated_product.view(batch_size, block_size, self.sqrt_d)
    

class Booster(nn.Module):
    ''' 
    input: B, T, sqrt(C)
    output: B, T, C
    '''
    def __init__(self, sqrt_d):
        super(Booster, self).__init__()
        self.weights = nn.Parameter(
            torch.randn(sqrt_d, sqrt_d, block_size)
        )
        self.sqrt_d = sqrt_d

    def forward(self, attention_output):
        # attention_output is of shape (batch, n, sqrt_d)
        # set up data shape for the booster
        attention_output = attention_output.view(batch_size, self.sqrt_d, block_size)
        attention_output_reshaped = attention_output.view(batch_size, 1, -1) # flatten all rows into one row
        attention_output_reshaped = attention_output_reshaped.repeat(1, self.sqrt_d, 1)  # repeat each row sqrt_d times
        attention_output_reshaped = attention_output_reshaped.view(batch_size, self.sqrt_d, self.sqrt_d, block_size)
        # multiply
        revived_output = self.weights * attention_output_reshaped
        revived_output = revived_output.view(-1, block_size)
        return revived_output.view(batch_size, block_size, n_embd)

class QuickHead(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(sqrt_d, head_size, bias=False)
        self.query = nn.Linear(sqrt_d, head_size, bias=False)
        self.value = nn.Linear(sqrt_d, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x is of shape (batch_size, n, sqrt_d)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        v = self.value(x) # (B,T,C)

        collective_k = k.sum(1, keepdim=True)
        # Broadcast explicitly
        collective_k_bc = collective_k.repeat(1, block_size, 1)
        # q multiply k
        qk = q * collective_k_bc
        attention_weights = torch.softmax(qk, dim=1)
        collective_v = v.sum(dim=1, keepdim=True)
        collective_v_bc = collective_v.repeat(1, block_size, 1)
        output = collective_v_bc * attention_weights
        return output
    
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([QuickHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(sqrt_d, sqrt_d) # global variable sqrt_d
        self.dropout = nn.Dropout(dropout)  # global variable dropout

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, sqrt_d):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(sqrt_d, 4 * sqrt_d),
            nn.ReLU(),
            nn.Linear(4 * sqrt_d, sqrt_d),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = sqrt_d // n_head
        self.feebler = Feebler(sqrt_d)
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(sqrt_d)
        self.ln1 = nn.LayerNorm(sqrt_d)
        self.ln2 = nn.LayerNorm(sqrt_d)
        self.booster = Booster(sqrt_d)

    def forward(self, x):
        x = self.feebler(x)
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        x = self.booster(x)
        return x

In [3]:
b = Block(n_embd, n_head)
b(torch.rand(batch_size, block_size, n_embd)).shape

torch.Size([16, 512, 576])

In [4]:
vocab_size = 100000

# super simple quickformer model
class Quickformer(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, 1)
        self.logits_maker = nn.Linear(block_size, 1)
        self.classifier = nn.Sigmoid()

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        x = self.lm_head(x) # (B,T,1)
        x = x.squeeze(2) # (B,T)
        logits = self.logits_maker(x) # (B,1)
        results = self.classifier(logits) # (B, 1)

        if targets is None:
            loss = None
        else:
            loss = F.binary_cross_entropy(results, targets)

        return results, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = Quickformer()
model = model.to(device)

In [5]:
inp = torch.rand(batch_size, block_size).long().to(device)
print(inp.shape)
l, ll = model(inp)
l.shape

torch.Size([16, 512])


torch.Size([16, 1])

# Data handling

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# Load IMDb dataset from Hugging Face
dataset = load_dataset("imdb")

# Use a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define a custom PyTorch Dataset
class IMDbDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=block_size):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        text = item["text"]
        label = item["label"]

        # Tokenize and encode the text
        inputs = self.tokenizer(
            text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()

        return {"input_ids": input_ids, "attention_mask": attention_mask, "label": torch.tensor(label)}

# Create Train dataset (pytorch)
imdb_dataset_train = IMDbDataset(dataset["train"], tokenizer)
# Create Test dataset (pytorch)
imdb_dataset_test = IMDbDataset(dataset["test"], tokenizer)

# Create PyTorch DataLoader for train set
dataloader_train = DataLoader(imdb_dataset_train, batch_size=batch_size, shuffle=True)
# Create PyTorch DataLoader for test set
dataloader_test = DataLoader(imdb_dataset_test, batch_size=batch_size, shuffle=True)

In [7]:
# Inspect the dataset info to see the details
print(dataset["train"].info)  # shows 0 = neg, 1 = pos

print(dataset["train"][0].keys())

DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='parquet', dataset_name='imdb', config_name='plain_text', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=33435948, num_examples=25000, shard_lengths=None, dataset_name='imdb'), 'test': SplitInfo(name='test', num_bytes=32653810, num_examples=25000, shard_lengths=None, dataset_name='imdb'), 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67113044, num_examples=50000, shard_lengths=None, dataset_name='imdb')}, download_checksums={'hf://datasets/imdb@e6281661ce1c48d982bc483cf8a173c1bbeb5d31/plain_text/train-00000-of-00001.parquet': {'num_bytes': 20979968, 'checksum': None}, 'hf://datasets/imdb@e6281661ce1c48d982bc483cf8a173c1bbeb5d31/plain_text/test-00000-of-00001.parquet': {'num_bytes': 20470363, 'checksum': None}, 'hf:

In [8]:
# Testing if my model works well with this dataset
# Example of how to iterate through the dataloader
for batch in dataloader_train:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["label"].unsqueeze(1).float()

    print('input ids:', input_ids.shape)
    print('labels:', labels.shape)

    print('loss:', model(input_ids, labels)[1])
    break

input ids: torch.Size([16, 512])
labels: torch.Size([16, 1])


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# THIS DOES NOT WORK
# # Testing if my model works well with this dataset
# # Example of how to iterate through the dataloader
# count = 0
# for idx, targets in dataloader_train:
#     idx = idx.to(device)
#     targets = targets.to(device)
#     print('idx:', idx.shape)
#     print('targets:', targets.shape)

#     print(model(idx, targets)[1])

#     count += 1

#     if count > 5:
#         break

In [None]:
input_ids = input_ids.to(device)
print(input_ids.shape)

torch.Size([16, 512])


In [None]:
model(input_ids)

(tensor([[0.4877],
         [0.4690],
         [0.4618],
         [0.4999],
         [0.4327],
         [0.3809],
         [0.3770],
         [0.4431],
         [0.4776],
         [0.4195],
         [0.4573],
         [0.4276],
         [0.4817],
         [0.4940],
         [0.3807],
         [0.4110]], grad_fn=<SigmoidBackward0>),
 None)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_model(model, train_loader, test_loader, optimizer, loss_function, num_epochs=10, device='cuda'):
    model.to(device)  # Move the model to the specified device (GPU or CPU)

    for epoch in range(num_epochs):
        # Training
        model.train()
        total_loss = 0
        total_samples = 0

        # Use tqdm for progress bar
        train_loader = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}, Training')

        for batch in train_loader:
            try:
                idx = batch["input_ids"]
                targets = batch["label"].unsqueeze(1).float()
                idx, targets = idx.to(device), targets.to(device)

                optimizer.zero_grad()
                results, loss = model(idx)
                loss = loss_function(results, targets)
                loss.backward()
                optimizer.step()

                total_loss += loss.item() * len(idx)
                total_samples += len(idx)

                # Update tqdm progress bar
                train_loader.set_postfix({'Train Loss': total_loss / total_samples})
            except:
                pass

        average_train_loss = total_loss / total_samples

        # Validation
        model.eval()
        with torch.no_grad():
            total_loss = 0
            total_samples = 0

            # Use tqdm for progress bar
            test_loader = tqdm(test_loader, desc=f'Epoch {epoch + 1}/{num_epochs}, Validation')

            for batch in test_loader:
                try:
                    idx = batch["input_ids"]
                    targets = batch["label"].unsqueeze(1).float()
                    idx, targets = idx.to(device), targets.to(device)

                    results, loss = model(idx)
                    loss = loss_function(results, targets)

                    total_loss += loss.item() * len(idx)
                    total_samples += len(idx)

                    # Update tqdm progress bar
                    test_loader.set_postfix({'Test Loss': total_loss / total_samples})
                except:
                    pass

        average_test_loss = total_loss / total_samples

        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {average_train_loss:.4f}, Test Loss: {average_test_loss:.4f}')

    print('Training complete!')

model = Quickformer()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.BCELoss()

train_model(model, dataloader_train, dataloader_test, optimizer, loss_function, num_epochs=epochs, device=device)


Epoch 1/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.95it/s, Train Loss=0.696]
Epoch 1/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 47.30it/s, Test Loss=0.699]


Epoch 1/50, Train Loss: 0.6959, Test Loss: 0.6991


Epoch 2/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.51it/s, Train Loss=0.66] 
Epoch 2/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.53it/s, Test Loss=0.674]


Epoch 2/50, Train Loss: 0.6604, Test Loss: 0.6738


Epoch 3/50, Training: 100%|██████████| 1563/1563 [01:05<00:00, 23.70it/s, Train Loss=0.506]
Epoch 3/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 47.28it/s, Test Loss=0.732]


Epoch 3/50, Train Loss: 0.5060, Test Loss: 0.7325


Epoch 4/50, Training: 100%|██████████| 1563/1563 [01:05<00:00, 23.98it/s, Train Loss=0.309]
Epoch 4/50, Validation: 100%|██████████| 1563/1563 [00:32<00:00, 47.55it/s, Test Loss=0.927]


Epoch 4/50, Train Loss: 0.3089, Test Loss: 0.9269


Epoch 5/50, Training: 100%|██████████| 1563/1563 [01:05<00:00, 23.98it/s, Train Loss=0.158]
Epoch 5/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.21it/s, Test Loss=1.18]


Epoch 5/50, Train Loss: 0.1578, Test Loss: 1.1839


Epoch 6/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.66it/s, Train Loss=0.0757]
Epoch 6/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.51it/s, Test Loss=1.81]


Epoch 6/50, Train Loss: 0.0757, Test Loss: 1.8111


Epoch 7/50, Training: 100%|██████████| 1563/1563 [01:05<00:00, 23.70it/s, Train Loss=0.0385]
Epoch 7/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.99it/s, Test Loss=2.15]


Epoch 7/50, Train Loss: 0.0385, Test Loss: 2.1470


Epoch 8/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.23it/s, Train Loss=0.0333]
Epoch 8/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.63it/s, Test Loss=2.65]


Epoch 8/50, Train Loss: 0.0333, Test Loss: 2.6487


Epoch 9/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.50it/s, Train Loss=0.0197]
Epoch 9/50, Validation: 100%|██████████| 1563/1563 [00:35<00:00, 43.79it/s, Test Loss=2.84]


Epoch 9/50, Train Loss: 0.0197, Test Loss: 2.8400


Epoch 10/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.29it/s, Train Loss=0.0156]
Epoch 10/50, Validation: 100%|██████████| 1563/1563 [00:32<00:00, 47.62it/s, Test Loss=4.25]


Epoch 10/50, Train Loss: 0.0156, Test Loss: 4.2452


Epoch 11/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.24it/s, Train Loss=0.0182]
Epoch 11/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.07it/s, Test Loss=4.02]


Epoch 11/50, Train Loss: 0.0182, Test Loss: 4.0159


Epoch 12/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.91it/s, Train Loss=0.0182] 
Epoch 12/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.43it/s, Test Loss=6.28]


Epoch 12/50, Train Loss: 0.0182, Test Loss: 6.2814


Epoch 13/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.15it/s, Train Loss=0.0142]
Epoch 13/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.86it/s, Test Loss=3.44]


Epoch 13/50, Train Loss: 0.0142, Test Loss: 3.4420


Epoch 14/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.01it/s, Train Loss=0.0123] 
Epoch 14/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.54it/s, Test Loss=4.92]


Epoch 14/50, Train Loss: 0.0123, Test Loss: 4.9190


Epoch 15/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.33it/s, Train Loss=0.00606]
Epoch 15/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.37it/s, Test Loss=4.39]


Epoch 15/50, Train Loss: 0.0061, Test Loss: 4.3888


Epoch 16/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.94it/s, Train Loss=0.0101] 
Epoch 16/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 44.88it/s, Test Loss=5.14]


Epoch 16/50, Train Loss: 0.0101, Test Loss: 5.1439


Epoch 17/50, Training: 100%|██████████| 1563/1563 [01:09<00:00, 22.56it/s, Train Loss=0.01]  
Epoch 17/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 44.98it/s, Test Loss=5.03]


Epoch 17/50, Train Loss: 0.0100, Test Loss: 5.0272


Epoch 18/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.23it/s, Train Loss=0.0101] 
Epoch 18/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.00it/s, Test Loss=4.71]


Epoch 18/50, Train Loss: 0.0101, Test Loss: 4.7051


Epoch 19/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.80it/s, Train Loss=0.00268]
Epoch 19/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.25it/s, Test Loss=4.49]


Epoch 19/50, Train Loss: 0.0027, Test Loss: 4.4875


Epoch 20/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.95it/s, Train Loss=0.0123] 
Epoch 20/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.08it/s, Test Loss=4.58]


Epoch 20/50, Train Loss: 0.0123, Test Loss: 4.5786


Epoch 21/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.51it/s, Train Loss=0.00835]
Epoch 21/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.61it/s, Test Loss=4.17]


Epoch 21/50, Train Loss: 0.0084, Test Loss: 4.1698


Epoch 22/50, Training: 100%|██████████| 1563/1563 [01:09<00:00, 22.56it/s, Train Loss=0.00472]
Epoch 22/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 47.14it/s, Test Loss=5.76]


Epoch 22/50, Train Loss: 0.0047, Test Loss: 5.7566


Epoch 23/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.63it/s, Train Loss=0.00902]
Epoch 23/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.70it/s, Test Loss=4.84]


Epoch 23/50, Train Loss: 0.0090, Test Loss: 4.8366


Epoch 24/50, Training: 100%|██████████| 1563/1563 [01:05<00:00, 23.68it/s, Train Loss=0.00851]
Epoch 24/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.49it/s, Test Loss=5.19]


Epoch 24/50, Train Loss: 0.0085, Test Loss: 5.1932


Epoch 25/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.54it/s, Train Loss=0.0046] 
Epoch 25/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.72it/s, Test Loss=5.85]


Epoch 25/50, Train Loss: 0.0046, Test Loss: 5.8517


Epoch 26/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.63it/s, Train Loss=0.00857]
Epoch 26/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.94it/s, Test Loss=4.5] 


Epoch 26/50, Train Loss: 0.0086, Test Loss: 4.4989


Epoch 27/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.26it/s, Train Loss=0.000234]
Epoch 27/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.42it/s, Test Loss=4.85]


Epoch 27/50, Train Loss: 0.0002, Test Loss: 4.8526


Epoch 28/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.50it/s, Train Loss=1.25e-5]
Epoch 28/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 47.02it/s, Test Loss=4.98]


Epoch 28/50, Train Loss: 0.0000, Test Loss: 4.9817


Epoch 29/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.30it/s, Train Loss=6.63e-6]
Epoch 29/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.72it/s, Test Loss=5.09]


Epoch 29/50, Train Loss: 0.0000, Test Loss: 5.0899


Epoch 30/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.60it/s, Train Loss=3.3e-6] 
Epoch 30/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.15it/s, Test Loss=5.27]


Epoch 30/50, Train Loss: 0.0000, Test Loss: 5.2729


Epoch 31/50, Training: 100%|██████████| 1563/1563 [01:10<00:00, 22.31it/s, Train Loss=1.69e-6]
Epoch 31/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 47.19it/s, Test Loss=5.53]


Epoch 31/50, Train Loss: 0.0000, Test Loss: 5.5273


Epoch 32/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.52it/s, Train Loss=8.81e-7]
Epoch 32/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.01it/s, Test Loss=5.91]


Epoch 32/50, Train Loss: 0.0000, Test Loss: 5.9073


Epoch 33/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.36it/s, Train Loss=4.06e-7]
Epoch 33/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.10it/s, Test Loss=6.42]


Epoch 33/50, Train Loss: 0.0000, Test Loss: 6.4163


Epoch 34/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.40it/s, Train Loss=1.77e-7]
Epoch 34/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.52it/s, Test Loss=6.67]


Epoch 34/50, Train Loss: 0.0000, Test Loss: 6.6737


Epoch 35/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.40it/s, Train Loss=8.55e-8]
Epoch 35/50, Validation: 100%|██████████| 1563/1563 [00:35<00:00, 44.09it/s, Test Loss=6.8] 


Epoch 35/50, Train Loss: 0.0000, Test Loss: 6.7951


Epoch 36/50, Training: 100%|██████████| 1563/1563 [01:09<00:00, 22.46it/s, Train Loss=4.32e-8]
Epoch 36/50, Validation: 100%|██████████| 1563/1563 [00:35<00:00, 44.62it/s, Test Loss=7.15]


Epoch 36/50, Train Loss: 0.0000, Test Loss: 7.1483


Epoch 37/50, Training: 100%|██████████| 1563/1563 [01:09<00:00, 22.65it/s, Train Loss=1.94e-8]
Epoch 37/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.05it/s, Test Loss=7.45]


Epoch 37/50, Train Loss: 0.0000, Test Loss: 7.4460


Epoch 38/50, Training: 100%|██████████| 1563/1563 [01:05<00:00, 23.69it/s, Train Loss=9.77e-9]
Epoch 38/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.93it/s, Test Loss=7.79]


Epoch 38/50, Train Loss: 0.0000, Test Loss: 7.7878


Epoch 39/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.96it/s, Train Loss=4.65e-9]
Epoch 39/50, Validation: 100%|██████████| 1563/1563 [00:35<00:00, 44.37it/s, Test Loss=7.98]


Epoch 39/50, Train Loss: 0.0000, Test Loss: 7.9777


Epoch 40/50, Training: 100%|██████████| 1563/1563 [01:09<00:00, 22.51it/s, Train Loss=2.3e-9] 
Epoch 40/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.32it/s, Test Loss=8.39]


Epoch 40/50, Train Loss: 0.0000, Test Loss: 8.3861


Epoch 41/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.79it/s, Train Loss=1.21e-9]
Epoch 41/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.97it/s, Test Loss=8.39]


Epoch 41/50, Train Loss: 0.0000, Test Loss: 8.3935


Epoch 42/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.14it/s, Train Loss=4.72e-10]
Epoch 42/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.63it/s, Test Loss=8.65]


Epoch 42/50, Train Loss: 0.0000, Test Loss: 8.6523


Epoch 43/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.15it/s, Train Loss=2.26e-10]
Epoch 43/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 44.66it/s, Test Loss=8.66]


Epoch 43/50, Train Loss: 0.0000, Test Loss: 8.6607


Epoch 44/50, Training: 100%|██████████| 1563/1563 [01:08<00:00, 22.73it/s, Train Loss=1.28e-10]
Epoch 44/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.56it/s, Test Loss=8.9] 


Epoch 44/50, Train Loss: 0.0000, Test Loss: 8.8967


Epoch 45/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.01it/s, Train Loss=9.39e-11]
Epoch 45/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.32it/s, Test Loss=9.23]


Epoch 45/50, Train Loss: 0.0000, Test Loss: 9.2325


Epoch 46/50, Training: 100%|██████████| 1563/1563 [01:09<00:00, 22.58it/s, Train Loss=2.05e-10]
Epoch 46/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.46it/s, Test Loss=10.1]


Epoch 46/50, Train Loss: 0.0000, Test Loss: 10.1056


Epoch 47/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.44it/s, Train Loss=6.63e-11]
Epoch 47/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.14it/s, Test Loss=10.3]


Epoch 47/50, Train Loss: 0.0000, Test Loss: 10.3239


Epoch 48/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.19it/s, Train Loss=1.64e-11]
Epoch 48/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.44it/s, Test Loss=10.1]


Epoch 48/50, Train Loss: 0.0000, Test Loss: 10.1226


Epoch 49/50, Training: 100%|██████████| 1563/1563 [01:06<00:00, 23.43it/s, Train Loss=4.34e-12]
Epoch 49/50, Validation: 100%|██████████| 1563/1563 [00:33<00:00, 46.24it/s, Test Loss=9.99]


Epoch 49/50, Train Loss: 0.0000, Test Loss: 9.9912


Epoch 50/50, Training: 100%|██████████| 1563/1563 [01:07<00:00, 23.18it/s, Train Loss=2.27e-12]
Epoch 50/50, Validation: 100%|██████████| 1563/1563 [00:34<00:00, 45.54it/s, Test Loss=9.94]

Epoch 50/50, Train Loss: 0.0000, Test Loss: 9.9386
Training complete!



