In [None]:
!pip install torch
!pip install numpy
!pip install tqdm

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm import tqdm
import os
import logging
from typing import List, Tuple
import time

# Ensure reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x7e0b50350470>

Colab drive configuration

In [4]:
# Set up logging with a fixed filename
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')
    log_filename = "/content/drive/My Drive/training_log.log"
else:
    log_filename = "training_log.log"
#logging.basicConfig(filename=log_filename, level=logging.DEBUG, format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

Mounted at /content/drive


# Data Preprocessing and Dataset Class

In [5]:
class CharacterDataset(Dataset):
    def __init__(self, data: List[str], vocab: dict, seq_len: int):
        self.data = data
        self.vocab = vocab
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data) - self.seq_len

    def __getitem__(self, idx: int):
        input_seq = self.data[idx:idx + self.seq_len]
        target_seq = self.data[idx + 1:idx + self.seq_len + 1]

        input_ids = [self.vocab.get(char, self.vocab['[UNK]']) for char in input_seq]
        target_ids = [self.vocab.get(char, self.vocab['[UNK]']) for char in target_seq]

        return torch.tensor(input_ids), torch.tensor(target_ids)

def build_vocab(data: str) -> dict:
    unique_chars = sorted(set(data))
    vocab = {char: idx for idx, char in enumerate(unique_chars, 4)}  # Start at 4 to account for special tokens
    vocab['[UNK]'] = 0
    vocab['[PAD]'] = 1
    vocab['[SOS]'] = 2
    vocab['[EOS]'] = 3
    return vocab

def load_data(file_path: str, max_data_size: int = None) -> str:
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = f.read()
            if max_data_size is not None:
                data = data[:max_data_size]  # Limit the amount of data loaded
        return data
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        raise

def prepare_datasets(train_file: str, val_file: str, test_file: str, seq_len: int, max_data_size: int = None):
    train_data = load_data(train_file, max_data_size)
    val_data = load_data(val_file, max_data_size)
    test_data = load_data(test_file, max_data_size)

    vocab = build_vocab(train_data + val_data + test_data)

    train_dataset = CharacterDataset(train_data, vocab, seq_len)
    val_dataset = CharacterDataset(val_data, vocab, seq_len)
    test_dataset = CharacterDataset(test_data, vocab, seq_len)

    return train_dataset, val_dataset, test_dataset, vocab

# Transformer Model Implementation

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class MultiQueryAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiQueryAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)

        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        batch_size, seq_len, _ = q.size()

        q = self.q_proj(q).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_proj(k).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_proj(v).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, v).transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        output = self.out_proj(output)

        return output

class SparseAttention(nn.Module):
    def __init__(self, d_model, num_heads, block_size):
        super(SparseAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.block_size = block_size

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)

        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        batch_size, seq_len, _ = q.size()

        q = self.q_proj(q).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_proj(k).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_proj(v).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        scores = torch.zeros_like(q @ k.transpose(-2, -1))
        for i in range(0, seq_len, self.block_size):
            scores[:, :, i:i+self.block_size] = q[:, :, i:i+self.block_size] @ k[:, :, i:i+self.block_size].transpose(-2, -1)
        scores /= np.sqrt(self.d_k)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, v).transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        output = self.out_proj(output)

        return output

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, layer_norm_after=True, attention_type='default', block_size=32):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.layer_norm_after = layer_norm_after
        self.attention_type = attention_type

        # Define transformer layers
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout) for _ in range(num_encoder_layers)
        ])
        self.decoder_layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout) for _ in range(num_decoder_layers)
        ])

        # Initialize attention mechanisms based on the attention type
        if attention_type == 'multi-query':
            self.attention = MultiQueryAttention(d_model, nhead)
        elif attention_type == 'sparse':
            self.attention = SparseAttention(d_model, nhead, block_size)
        else:
            self.attention = None  # Default attention (standard PyTorch Transformer)

        self.fc_out = nn.Linear(d_model, vocab_size)

        # Layer normalization
        if layer_norm_after:
            self.layer_norm_encoder = nn.LayerNorm(d_model)
            self.layer_norm_decoder = nn.LayerNorm(d_model)
        else:
            self.layer_norm_encoder = None
            self.layer_norm_decoder = None

        # Weight tying
        self.fc_out.weight = self.embedding.weight

    def forward(self, src, tgt):
        src = self.embedding(src) * np.sqrt(self.d_model)
        tgt = self.embedding(tgt) * np.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        # Pass through encoder layers
        for layer in self.encoder_layers:
            src = layer(src)
            if self.layer_norm_after and self.layer_norm_encoder is not None:
                src = self.layer_norm_encoder(src)

        # Pass through decoder layers
        for layer in self.decoder_layers:
            tgt = layer(tgt, src)
            if self.layer_norm_after and self.layer_norm_decoder is not None:
                tgt = self.layer_norm_decoder(tgt)

        # Apply alternative attention if specified
        if self.attention_type == 'multi-query':
            tgt = self.attention(tgt, src, src)
        elif self.attention_type == 'sparse':
            tgt = self.attention(tgt, src, src)

        output = self.fc_out(tgt)
        return output

def collate_fn(batch):
    src, tgt = zip(*batch)  # Unzip the batch into src and tgt
    src = pad_sequence(src, batch_first=True, padding_value=1)  # Pad src sequences
    tgt = pad_sequence(tgt, batch_first=True, padding_value=1)  # Pad tgt sequences
    return src, tgt

# Training and Evaluation Functions

In [7]:
def train_epoch(model, dataloader, criterion, optimizer, device, vocab, epoch, log_interval):
    model.train()
    total_loss = 0
    start_time = time.time()
    num_batches = len(dataloader)

    progress_bar = tqdm(dataloader, desc=f"Training Epoch {epoch}", leave=False)

    for batch_idx, (src, tgt) in enumerate(progress_bar):
        src, tgt = src.to(device), tgt.to(device)  # Move src and tgt to the same device
        optimizer.zero_grad()
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_input = pad_sequence([torch.cat((torch.tensor([vocab['[SOS]']]).to(device), x), dim=0) for x in tgt_input],
                                 batch_first=True, padding_value=vocab['[PAD]'])
        tgt_output = pad_sequence([torch.cat((x, torch.tensor([vocab['[EOS]']]).to(device)), dim=0) for x in tgt_output],
                                 batch_first=True, padding_value=vocab['[PAD]'])
        output = model(src, tgt_input)
        loss = criterion(output.view(-1, output.shape[-1]), tgt_output.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        total_loss += loss.item()

        # Log training progress
        if (batch_idx + 1) % log_interval == 0:
            elapsed = time.time() - start_time
            lr = optimizer.param_groups[0]['lr']
            ms_per_batch = elapsed * 1000 / log_interval
            loss_value = total_loss / (batch_idx + 1)
            bpc = calculate_bpc(loss_value)
            progress_bar.set_postfix(loss=loss_value, bpc=bpc)
            #logging.info(f"| epoch {epoch} | {batch_idx + 1:5d}/{num_batches:5d} batches | lr {lr:.2e} | ms/batch {ms_per_batch:5.2f} | loss {loss_value:5.2f} | bpc {bpc:8.4f}")
            print(f"| epoch {epoch} | {batch_idx + 1:5d}/{num_batches:<5d} batches | lr {lr:.2e} | ms/batch {ms_per_batch:5.2f} | loss {loss_value:5.2f} | bpc {bpc:8.4f}")
            log_message( f"| epoch {epoch} | {batch_idx + 1:5d}/{num_batches:<5d} batches | lr {lr:.2e} | ms/batch {ms_per_batch:5.2f} | loss {loss_value:5.2f} | bpc {bpc:8.4f}", log_filename)
            start_time = time.time()

    return total_loss / num_batches

def evaluate(model, dataloader, criterion, device, vocab):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for src, tgt in progress_bar:
            src, tgt = src.to(device), tgt.to(device)

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            # Ensure SOS, EOS, and PAD tokens are on the correct device
            tgt_input = pad_sequence([torch.cat((torch.tensor([vocab['[SOS]']]).to(device), x), dim=0) for x in tgt_input],
                                     batch_first=True, padding_value=vocab['[PAD]']).to(device)
            tgt_output = pad_sequence([torch.cat((x, torch.tensor([vocab['[EOS]']]).to(device)), dim=0) for x in tgt_output],
                                     batch_first=True, padding_value=vocab['[PAD]']).to(device)

            output = model(src, tgt_input)
            # Make sure the output and target are in the correct shape for the loss function
            loss = criterion(output.view(-1, output.shape[-1]), tgt_output.view(-1))

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

    return total_loss / len(dataloader)

def calculate_bpc(loss):
    # Convert loss (cross entropy) to bits-per-character
    return loss / np.log(2)

def save_model(model, filepath):
    torch.save(model.state_dict(), filepath)
    logging.info(f'Model saved to {filepath}')

def load_model(model, filepath):
    model.load_state_dict(torch.load(filepath))
    model.eval()
    logging.info(f'Model loaded from {filepath}')

# Main Training Loop
Specify data location.

In [8]:
# Check if running on Google Colab
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')
    base_path = '/content/drive/My Drive/LMDatasets/'
else:
    base_path = './LMDatasets/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
def log_message(message, log_file):
  with open(log_file, 'a') as file:
    file.write(message+'\n')
  file.close()

## Hyperparameters

Change attention type to change  type and  activate  layer normalisation or not.

In [10]:
def main():
    # Hyperparameters and file paths
    train_file = os.path.join(base_path, 'nchlt_text.nr.train')
    val_file = os.path.join(base_path, 'nchlt_text.nr.valid')
    test_file = os.path.join(base_path, 'nchlt_text.nr.test')
    seq_len = 128
    batch_size = 128
    log_interval = 200
    epochs = 10
    learning_rate = 1e-2
    dropout = 0.1
    max_data_size = None# Set this to None to load all data or specify a limit
    layer_norm_after= True  # Set to True to apply layer normalization after residual connections
    attention_type = 'default'  # Options: 'default', 'multi-query', 'sparse'
    block_size = 128  # Used for sparse attention

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare data
    train_dataset, val_dataset, test_dataset, vocab = prepare_datasets(train_file, val_file, test_file, seq_len, max_data_size)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=8)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=8)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=8)

    # Initialize model, criterion, and optimizer
    model = TransformerModel(vocab_size=len(vocab), dropout=dropout, layer_norm_after=layer_norm_after, attention_type=attention_type, block_size=block_size).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['[PAD]'])
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader) * epochs)  # Set T_max based on total batches

   # Training loop
    best_val_loss = float('inf')
    patience = 3  # Early stopping patience
    no_improvement = 0
    for epoch in range(1, epochs + 1):
        print("Start Training")
        logging.info("Start Training")
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device, vocab, epoch, log_interval)
        val_loss = evaluate(model, val_loader, criterion, device, vocab)
        train_bpc = calculate_bpc(train_loss)
        val_bpc = calculate_bpc(val_loss)

        # Log training progress
        #print(f'| end of epoch {epoch} | valid loss {val_loss:.4f} | valid ppl {val_bpc:.4f}')
        #logging.info(f'| end of epoch {epoch} | valid loss {val_loss:.4f} | valid ppl {val_bpc:.4f}')
        log_message(f'| end of epoch {epoch} | valid loss {val_loss:.4f} | valid ppl {val_bpc:.4f}', log_filename)
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvement = 0
            save_model(model, f'model_epoch_{epoch}.pt')
        else:
            no_improvement += 1
            if no_improvement >= patience:
                logging.info(f'Early stopping at epoch {epoch}')
                break

        # Learning rate scheduler
        scheduler.step()

    # Evaluate on test set
    test_loss = evaluate(model, test_loader, criterion, device, vocab)
    test_bpc = calculate_bpc(test_loss)

    print(f'| End of training | test loss {test_loss:.4f} | test ppl {test_bpc:.4f}')
    logging.info(f'| End of training | test loss {test_loss:.4f} | test ppl {test_bpc:.4f}')

In [11]:
if __name__ == "__main__":
    #log_message('Start', log_filename)
    main()



Start Training


Training Epoch 1:  17%|█▋        | 200/1171 [05:27<26:56,  1.67s/it, bpc=617, loss=428]

| epoch 1 |   200/1171  batches | lr 1.00e-02 | ms/batch 1638.01 | loss 427.96 | bpc 617.4182


Training Epoch 1:  34%|███▍      | 400/1171 [10:59<21:22,  1.66s/it, bpc=311, loss=216]

| epoch 1 |   400/1171  batches | lr 1.00e-02 | ms/batch 1659.74 | loss 215.83 | bpc 311.3815


Training Epoch 1:  51%|█████     | 600/1171 [16:31<15:49,  1.66s/it, bpc=209, loss=145]

| epoch 1 |   600/1171  batches | lr 1.00e-02 | ms/batch 1660.42 | loss 144.93 | bpc 209.0895


Training Epoch 1:  68%|██████▊   | 800/1171 [22:03<10:14,  1.66s/it, bpc=158, loss=109]

| epoch 1 |   800/1171  batches | lr 1.00e-02 | ms/batch 1660.27 | loss 109.48 | bpc 157.9433


Training Epoch 1:  85%|████████▌ | 1000/1171 [27:35<04:44,  1.66s/it, bpc=127, loss=88.2]

| epoch 1 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1659.98 | loss 88.21 | bpc 127.2546




Start Training


Training Epoch 2:  17%|█▋        | 200/1171 [05:32<26:52,  1.66s/it, bpc=4.5, loss=3.12]

| epoch 2 |   200/1171  batches | lr 1.00e-02 | ms/batch 1663.64 | loss  3.12 | bpc   4.4996


Training Epoch 2:  34%|███▍      | 400/1171 [11:05<21:29,  1.67s/it, bpc=16.6, loss=11.5]

| epoch 2 |   400/1171  batches | lr 1.00e-02 | ms/batch 1661.41 | loss 11.48 | bpc  16.5687


Training Epoch 2:  51%|█████     | 600/1171 [16:36<15:48,  1.66s/it, bpc=14.7, loss=10.2]

| epoch 2 |   600/1171  batches | lr 1.00e-02 | ms/batch 1659.48 | loss 10.21 | bpc  14.7267


Training Epoch 2:  68%|██████▊   | 800/1171 [22:09<10:16,  1.66s/it, bpc=12.2, loss=8.44]

| epoch 2 |   800/1171  batches | lr 1.00e-02 | ms/batch 1661.24 | loss  8.44 | bpc  12.1822


Training Epoch 2:  85%|████████▌ | 1000/1171 [27:40<04:44,  1.66s/it, bpc=10.6, loss=7.38]

| epoch 2 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1657.64 | loss  7.38 | bpc  10.6451




Start Training


Training Epoch 3:  17%|█▋        | 200/1171 [05:32<26:53,  1.66s/it, bpc=4.5, loss=3.12]

| epoch 3 |   200/1171  batches | lr 1.00e-02 | ms/batch 1660.46 | loss  3.12 | bpc   4.4959


Training Epoch 3:  34%|███▍      | 400/1171 [11:03<21:20,  1.66s/it, bpc=4.5, loss=3.12]

| epoch 3 |   400/1171  batches | lr 1.00e-02 | ms/batch 1658.30 | loss  3.12 | bpc   4.4950


Training Epoch 3:  51%|█████     | 600/1171 [16:35<15:50,  1.66s/it, bpc=4.5, loss=3.12]

| epoch 3 |   600/1171  batches | lr 1.00e-02 | ms/batch 1659.75 | loss  3.12 | bpc   4.4953


Training Epoch 3:  68%|██████▊   | 800/1171 [22:07<10:17,  1.67s/it, bpc=4.5, loss=3.12]

| epoch 3 |   800/1171  batches | lr 1.00e-02 | ms/batch 1659.47 | loss  3.12 | bpc   4.4951


Training Epoch 3:  85%|████████▌ | 1000/1171 [27:39<04:44,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 3 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1657.65 | loss  3.12 | bpc   4.4949




Start Training


Training Epoch 4:  17%|█▋        | 200/1171 [05:32<26:48,  1.66s/it, bpc=4.5, loss=3.12]

| epoch 4 |   200/1171  batches | lr 1.00e-02 | ms/batch 1662.63 | loss  3.12 | bpc   4.4961


Training Epoch 4:  34%|███▍      | 400/1171 [11:04<21:20,  1.66s/it, bpc=4.5, loss=3.12]

| epoch 4 |   400/1171  batches | lr 1.00e-02 | ms/batch 1658.76 | loss  3.12 | bpc   4.4956


Training Epoch 4:  51%|█████     | 600/1171 [16:36<15:48,  1.66s/it, bpc=4.5, loss=3.12]

| epoch 4 |   600/1171  batches | lr 1.00e-02 | ms/batch 1658.68 | loss  3.12 | bpc   4.4953


Training Epoch 4:  68%|██████▊   | 800/1171 [22:07<10:16,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 4 |   800/1171  batches | lr 1.00e-02 | ms/batch 1659.42 | loss  3.12 | bpc   4.4950


Training Epoch 4:  85%|████████▌ | 1000/1171 [27:39<04:45,  1.67s/it, bpc=4.49, loss=3.12]

| epoch 4 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1659.23 | loss  3.12 | bpc   4.4948




Start Training


Training Epoch 5:  17%|█▋        | 200/1171 [05:31<26:50,  1.66s/it, bpc=34.5, loss=23.9]

| epoch 5 |   200/1171  batches | lr 1.00e-02 | ms/batch 1657.95 | loss 23.92 | bpc  34.5124


Training Epoch 5:  34%|███▍      | 400/1171 [11:03<21:19,  1.66s/it, bpc=21.4, loss=14.8]

| epoch 5 |   400/1171  batches | lr 1.00e-02 | ms/batch 1658.45 | loss 14.84 | bpc  21.4165


Training Epoch 5:  51%|█████     | 600/1171 [16:34<15:46,  1.66s/it, bpc=15.8, loss=10.9]

| epoch 5 |   600/1171  batches | lr 1.00e-02 | ms/batch 1658.24 | loss 10.94 | bpc  15.7759


Training Epoch 5:  68%|██████▊   | 800/1171 [22:06<10:13,  1.65s/it, bpc=13, loss=8.98]

| epoch 5 |   800/1171  batches | lr 1.00e-02 | ms/batch 1657.31 | loss  8.98 | bpc  12.9554


Training Epoch 5:  85%|████████▌ | 1000/1171 [27:37<04:43,  1.66s/it, bpc=11.3, loss=7.81]

| epoch 5 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1657.10 | loss  7.81 | bpc  11.2632




Start Training


Training Epoch 6:  17%|█▋        | 200/1171 [05:31<26:54,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 6 |   200/1171  batches | lr 1.00e-02 | ms/batch 1658.84 | loss  3.12 | bpc   4.4945


Training Epoch 6:  34%|███▍      | 400/1171 [11:03<21:18,  1.66s/it, bpc=4.49, loss=3.11]

| epoch 6 |   400/1171  batches | lr 1.00e-02 | ms/batch 1657.81 | loss  3.11 | bpc   4.4938


Training Epoch 6:  51%|█████     | 600/1171 [16:34<15:49,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 6 |   600/1171  batches | lr 1.00e-02 | ms/batch 1657.21 | loss  3.12 | bpc   4.4943


Training Epoch 6:  68%|██████▊   | 800/1171 [22:06<10:16,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 6 |   800/1171  batches | lr 1.00e-02 | ms/batch 1657.48 | loss  3.12 | bpc   4.4941


Training Epoch 6:  85%|████████▌ | 1000/1171 [27:38<04:44,  1.66s/it, bpc=4.49, loss=3.11]

| epoch 6 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1659.31 | loss  3.11 | bpc   4.4938




Start Training


Training Epoch 7:  17%|█▋        | 200/1171 [05:32<26:47,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 7 |   200/1171  batches | lr 1.00e-02 | ms/batch 1660.73 | loss  3.12 | bpc   4.4946


Training Epoch 7:  34%|███▍      | 400/1171 [11:03<21:20,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 7 |   400/1171  batches | lr 1.00e-02 | ms/batch 1657.95 | loss  3.12 | bpc   4.4945


Training Epoch 7:  51%|█████     | 600/1171 [16:35<15:50,  1.66s/it, bpc=4.49, loss=3.12]

| epoch 7 |   600/1171  batches | lr 1.00e-02 | ms/batch 1658.67 | loss  3.12 | bpc   4.4941


Training Epoch 7:  68%|██████▊   | 800/1171 [22:07<10:16,  1.66s/it, bpc=4.49, loss=3.11]

| epoch 7 |   800/1171  batches | lr 1.00e-02 | ms/batch 1657.68 | loss  3.11 | bpc   4.4939


Training Epoch 7:  85%|████████▌ | 1000/1171 [27:38<04:44,  1.66s/it, bpc=9.08, loss=6.29]

| epoch 7 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1655.73 | loss  6.29 | bpc   9.0780




Start Training


Training Epoch 8:  17%|█▋        | 200/1171 [05:31<26:51,  1.66s/it, bpc=5.12, loss=3.55]

| epoch 8 |   200/1171  batches | lr 1.00e-02 | ms/batch 1659.88 | loss  3.55 | bpc   5.1184


Training Epoch 8:  34%|███▍      | 400/1171 [11:03<21:22,  1.66s/it, bpc=4.81, loss=3.33]

| epoch 8 |   400/1171  batches | lr 1.00e-02 | ms/batch 1655.55 | loss  3.33 | bpc   4.8066


Training Epoch 8:  51%|█████     | 600/1171 [16:34<15:42,  1.65s/it, bpc=4.7, loss=3.26]

| epoch 8 |   600/1171  batches | lr 1.00e-02 | ms/batch 1654.79 | loss  3.26 | bpc   4.7015


Training Epoch 8:  68%|██████▊   | 800/1171 [22:05<10:12,  1.65s/it, bpc=4.65, loss=3.22]

| epoch 8 |   800/1171  batches | lr 1.00e-02 | ms/batch 1656.65 | loss  3.22 | bpc   4.6497


Training Epoch 8:  85%|████████▌ | 1000/1171 [27:36<04:44,  1.66s/it, bpc=4.62, loss=3.2]

| epoch 8 |  1000/1171  batches | lr 1.00e-02 | ms/batch 1657.76 | loss  3.20 | bpc   4.6184


                                                                          

| End of training | test loss 3.1467 | test ppl 4.5397


