In [None]:
import os
import json
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim

import sentencepiece as spm

from utils import (
    seed_everything,
    trainable_model_params,
    total_model_params,
    save_train_metrics,
    process_train_eval,
    Loader
)

from transformer import TransformerEncoder

In [None]:
# Seed the run through the project helper (presumably seeds every RNG in use
# -- see utils.seed_everything to confirm).
seed_everything(1234)

# Use the first CUDA device when PyTorch reports one, otherwise the CPU.
# Kept as a plain string because it is later written into params.json.
# (Assign device = 'cpu' by hand to force CPU execution.)
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

# SentencePiece tokenizer loaded from the model file in the working directory.
tokenizer = spm.SentencePieceProcessor('spm_wiki.model')

# In Domain: 20 Newsgroups Dataset

In [None]:
# Data-loading and optimisation settings for the 20 Newsgroups runs.
BATCH_SIZE, MAX_LEN, NUM_WORKERS = 64, 512, 16
N_EPOCHS, LR = 50, 1e-3

# Both splits are loaded through the same Loader instance (train first, then val).
loader = Loader(batch_size=BATCH_SIZE, max_len=MAX_LEN, num_workers=NUM_WORKERS)
train_loader, val_loader = (
    loader.load('20news', split) for split in ('train', 'val')
)

## Without Spectral Normalisation

In [None]:
# Keyword arguments for TransformerEncoder: 20 Newsgroups, `spectral` switched off.
# Key order matters only in that it is the order later written to params.json.
params_20ng = dict(
    vocab_size=len(tokenizer),
    emb_dim=1024,
    n_layers=1,
    n_heads=8,
    forward_dim=1024,
    dropout=0.2,
    max_len=MAX_LEN,
    pad_idx=tokenizer.pad_id(),
    kind='sto',
    tau2=10,
    spectral=False,
    n_classes=20,
    device=device,
)

# Optimisation settings stored next to the model hyper-parameters.
params_to_save = dict(lr=LR, batch_size=BATCH_SIZE, optim='adam')

# Build the model on the selected device and report its size.
model = TransformerEncoder(**params_20ng).to(device)
print(f"Total model params: {total_model_params(model):,d}")
print(f"Trainable model params: {trainable_model_params(model):,d}")

# Cross-entropy loss with an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train the 20 Newsgroups model (no spectral normalisation). Every run writes its
# params.json, results.csv and checkpoints into a fresh timestamped directory.
path = '../../../params/udl/sto_20news/'
now = datetime.now()
path_now = path + now.strftime("%Y-%m-%d %H:%M:%S") + '/'
os.makedirs(path_now)
logging_path = path_now + 'results.csv'

# Merge the model hyper-parameters into the optimisation settings and persist them.
params_to_save.update(params_20ng)

with open(path_now + 'params.json', 'w') as f:
    json.dump(params_to_save, f, indent=4)

highest_val_acc = 0

for epoch in range(1, N_EPOCHS + 1):
    # One-off step decay: from epoch 10 onwards train with lr = 1e-4.
    if epoch == 10:
        for g in optimizer.param_groups:
            g['lr'] = 1e-4
        # Report the change once, not once per parameter group.
        print('updated learning rate')

    # Training pass (optimizer supplied).
    model.train()
    train_loss, train_acc = process_train_eval(
        model, train_loader, criterion, optimizer
    )

    # Validation pass: eval mode, gradients disabled, no optimizer passed.
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = process_train_eval(model, val_loader, criterion)

    # Hand this epoch's metrics to the project logger, targeting results.csv.
    save_train_metrics(
        epoch,
        train_loss,
        train_acc,
        val_loss,
        val_acc,
        path=logging_path,
    )

    # Checkpoint whenever validation accuracy beats the best seen so far.
    if val_acc > highest_val_acc:
        highest_val_acc = val_acc
        _path = path_now + f"acc{val_acc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), _path)

    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.6f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.6f}]")

## With Spectral Normalisation

In [None]:
# Keyword arguments for TransformerEncoder: 20 Newsgroups, `spectral` switched on.
# Note that tau2 is 40 here, whereas the run without spectral normalisation uses 10.
params_20ng_sn = dict(
    vocab_size=len(tokenizer),
    emb_dim=1024,
    n_layers=1,
    n_heads=8,
    forward_dim=1024,
    dropout=0.2,
    max_len=MAX_LEN,
    pad_idx=tokenizer.pad_id(),
    kind='sto',
    tau2=40,
    spectral=True,
    n_classes=20,
    device=device,
)

# Optimisation settings stored next to the model hyper-parameters.
params_to_save = dict(lr=LR, batch_size=BATCH_SIZE, optim='adam')

# Build the model on the selected device and report its size.
model = TransformerEncoder(**params_20ng_sn).to(device)
print(f"Total model params: {total_model_params(model):,d}")
print(f"Trainable model params: {trainable_model_params(model):,d}")

# Cross-entropy loss with an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train the 20 Newsgroups model with spectral normalisation. Every run writes its
# params.json, results.csv and checkpoints into a fresh timestamped directory.
path = '../../../params/udl/sto_20news_sn/'
now = datetime.now()
path_now = path + now.strftime("%Y-%m-%d %H:%M:%S") + '/'
os.makedirs(path_now)
logging_path = path_now + 'results.csv'

# Merge the model hyper-parameters into the optimisation settings and persist them.
params_to_save.update(params_20ng_sn)

with open(path_now + 'params.json', 'w') as f:
    json.dump(params_to_save, f, indent=4)

highest_val_acc = 0

for epoch in range(1, N_EPOCHS + 1):
    # One-off step decay: from epoch 10 onwards train with lr = 1e-4.
    if epoch == 10:
        for g in optimizer.param_groups:
            g['lr'] = 1e-4
        # Report the change once, not once per parameter group.
        print('updated learning rate')

    # Training pass (optimizer supplied).
    model.train()
    train_loss, train_acc = process_train_eval(
        model, train_loader, criterion, optimizer
    )

    # Validation pass: eval mode, gradients disabled, no optimizer passed.
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = process_train_eval(model, val_loader, criterion)

    # Hand this epoch's metrics to the project logger, targeting results.csv.
    save_train_metrics(
        epoch,
        train_loss,
        train_acc,
        val_loss,
        val_acc,
        path=logging_path,
    )

    # Checkpoint whenever validation accuracy beats the best seen so far.
    if val_acc > highest_val_acc:
        highest_val_acc = val_acc
        _path = path_now + f"acc{val_acc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), _path)

    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.6f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.6f}]")

# In Domain: TREC

In [None]:
# Data-loading and optimisation settings for the TREC runs.
BATCH_SIZE, MAX_LEN, NUM_WORKERS = 64, 512, 16
N_EPOCHS, LR = 100, 1e-3

# Both splits are loaded through the same Loader instance (train first, then val).
loader = Loader(batch_size=BATCH_SIZE, max_len=MAX_LEN, num_workers=NUM_WORKERS)
train_loader, val_loader = (
    loader.load('trec', split) for split in ('train', 'val')
)

## Without Spectral Normalisation

In [None]:
# Keyword arguments for TransformerEncoder: TREC, `spectral` switched off.
# Key order matters only in that it is the order later written to params.json.
params_trec = dict(
    vocab_size=len(tokenizer),
    emb_dim=1024,
    n_layers=1,
    n_heads=8,
    forward_dim=1024,
    dropout=0.1,
    max_len=MAX_LEN,
    pad_idx=tokenizer.pad_id(),
    kind='sto',
    tau2=10,
    spectral=False,
    n_classes=50,
    device=device,
)

# Optimisation settings stored next to the model hyper-parameters.
params_to_save = dict(lr=LR, batch_size=BATCH_SIZE, optim='adam')

# Build the model on the selected device and report its size.
model = TransformerEncoder(**params_trec).to(device)
print(f"Total model params: {total_model_params(model):,d}")
print(f"Trainable model params: {trainable_model_params(model):,d}")

# Cross-entropy loss with an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train the TREC model (no spectral normalisation). Every run writes its
# params.json, results.csv and checkpoints into a fresh timestamped directory.
path = '../../../params/udl/sto_trec/'
now = datetime.now()
path_now = path + now.strftime("%Y-%m-%d %H:%M:%S") + '/'
os.makedirs(path_now)
logging_path = path_now + 'results.csv'

# Merge the model hyper-parameters into the optimisation settings and persist them.
params_to_save.update(params_trec)

with open(path_now + 'params.json', 'w') as f:
    json.dump(params_to_save, f, indent=4)

highest_val_acc = 0

for epoch in range(1, N_EPOCHS + 1):
    # One-off step decay: from epoch 50 onwards train with lr = 1e-4.
    if epoch == 50:
        for g in optimizer.param_groups:
            g['lr'] = 1e-4
        # Report the change once, not once per parameter group.
        print('updated learning rate')

    # Training pass (optimizer supplied).
    model.train()
    train_loss, train_acc = process_train_eval(
        model, train_loader, criterion, optimizer
    )

    # Validation pass: eval mode, gradients disabled, no optimizer passed.
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = process_train_eval(model, val_loader, criterion)

    # Hand this epoch's metrics to the project logger, targeting results.csv.
    save_train_metrics(
        epoch,
        train_loss,
        train_acc,
        val_loss,
        val_acc,
        path=logging_path,
    )

    # Checkpoint whenever validation accuracy beats the best seen so far.
    if val_acc > highest_val_acc:
        highest_val_acc = val_acc
        _path = path_now + f"acc{val_acc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), _path)

    # Per-epoch progress report, in the same format as the 20 Newsgroups and SST
    # training cells, so long TREC runs can be monitored while they execute.
    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.6f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.6f}]")

## With Spectral Normalisation

In [None]:
# Keyword arguments for TransformerEncoder: TREC, `spectral` switched on.
# Key order matters only in that it is the order later written to params.json.
params_trec_sn = dict(
    vocab_size=len(tokenizer),
    emb_dim=1024,
    n_layers=1,
    n_heads=8,
    forward_dim=1024,
    dropout=0.1,
    max_len=MAX_LEN,
    pad_idx=tokenizer.pad_id(),
    kind='sto',
    tau2=10,
    spectral=True,
    n_classes=50,
    device=device,
)

# Optimisation settings stored next to the model hyper-parameters.
params_to_save = dict(lr=LR, batch_size=BATCH_SIZE, optim='adam')

# Build the model on the selected device and report its size.
model = TransformerEncoder(**params_trec_sn).to(device)
print(f"Total model params: {total_model_params(model):,d}")
print(f"Trainable model params: {trainable_model_params(model):,d}")

# Cross-entropy loss with an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train the TREC model with spectral normalisation. Every run writes its
# params.json, results.csv and checkpoints into a fresh timestamped directory.
path = '../../../params/udl/sto_trec_sn/'
now = datetime.now()
path_now = path + now.strftime("%Y-%m-%d %H:%M:%S") + '/'
os.makedirs(path_now)
logging_path = path_now + 'results.csv'

# Merge the model hyper-parameters into the optimisation settings and persist them.
params_to_save.update(params_trec_sn)

with open(path_now + 'params.json', 'w') as f:
    json.dump(params_to_save, f, indent=4)

highest_val_acc = 0

for epoch in range(1, N_EPOCHS + 1):
    # One-off step decay: from epoch 50 onwards train with lr = 1e-4.
    if epoch == 50:
        for g in optimizer.param_groups:
            g['lr'] = 1e-4
        # Report the change once, not once per parameter group.
        print('updated learning rate')

    # Training pass (optimizer supplied).
    model.train()
    train_loss, train_acc = process_train_eval(
        model, train_loader, criterion, optimizer
    )

    # Validation pass: eval mode, gradients disabled, no optimizer passed.
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = process_train_eval(model, val_loader, criterion)

    # Hand this epoch's metrics to the project logger, targeting results.csv.
    save_train_metrics(
        epoch,
        train_loss,
        train_acc,
        val_loss,
        val_acc,
        path=logging_path,
    )

    # Checkpoint whenever validation accuracy beats the best seen so far.
    if val_acc > highest_val_acc:
        highest_val_acc = val_acc
        _path = path_now + f"acc{val_acc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), _path)

    # Per-epoch progress report, in the same format as the 20 Newsgroups and SST
    # training cells, so long TREC runs can be monitored while they execute.
    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.6f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.6f}]")

# In Domain: SST

In [None]:
# Data-loading and optimisation settings for the SST runs.
BATCH_SIZE, MAX_LEN, NUM_WORKERS = 64, 512, 16
N_EPOCHS, LR = 50, 1e-4

# Both splits are loaded through the same Loader instance (train first, then val).
loader = Loader(batch_size=BATCH_SIZE, max_len=MAX_LEN, num_workers=NUM_WORKERS)
train_loader, val_loader = (
    loader.load('sst', split) for split in ('train', 'val')
)

## Without Spectral Normalisation

In [None]:
# Keyword arguments for TransformerEncoder: SST, `spectral` switched off.
# Key order matters only in that it is the order later written to params.json.
params_sst = dict(
    vocab_size=len(tokenizer),
    emb_dim=1024,
    n_layers=1,
    n_heads=8,
    forward_dim=2048,
    dropout=0.5,
    max_len=MAX_LEN,
    pad_idx=tokenizer.pad_id(),
    kind='sto',
    tau2=40,
    spectral=False,
    n_classes=2,
    device=device,
)

# Optimisation settings stored next to the model hyper-parameters.
params_to_save = dict(lr=LR, batch_size=BATCH_SIZE, optim='adam')

# Build the model on the selected device and report its size.
model = TransformerEncoder(**params_sst).to(device)
print(f"Total model params: {total_model_params(model):,d}")
print(f"Trainable model params: {trainable_model_params(model):,d}")

# Cross-entropy loss with an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train the SST model (no spectral normalisation). Every run writes its
# params.json, results.csv and checkpoints into a fresh timestamped directory.
path = '../../../params/udl/sto_sst/'
now = datetime.now()
path_now = path + now.strftime("%Y-%m-%d %H:%M:%S") + '/'
os.makedirs(path_now)
logging_path = path_now + 'results.csv'

# Merge the model hyper-parameters into the optimisation settings and persist them.
params_to_save.update(params_sst)

with open(path_now + 'params.json', 'w') as f:
    json.dump(params_to_save, f, indent=4)

highest_val_acc = 0

for epoch in range(1, N_EPOCHS + 1):
    # One-off step decay: from epoch 30 onwards train with lr = 5e-5.
    if epoch == 30:
        for g in optimizer.param_groups:
            g['lr'] = 5e-5
        # Report the change once, not once per parameter group.
        print('updated learning rate')

    # Training pass (optimizer supplied).
    model.train()
    train_loss, train_acc = process_train_eval(
        model, train_loader, criterion, optimizer
    )

    # Validation pass: eval mode, gradients disabled, no optimizer passed.
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = process_train_eval(model, val_loader, criterion)

    # Hand this epoch's metrics to the project logger, targeting results.csv.
    save_train_metrics(
        epoch,
        train_loss,
        train_acc,
        val_loss,
        val_acc,
        path=logging_path,
    )

    # Checkpoint whenever validation accuracy beats the best seen so far.
    if val_acc > highest_val_acc:
        highest_val_acc = val_acc
        _path = path_now + f"acc{val_acc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), _path)

    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.6f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.6f}]")

## With Spectral Normalisation

In [None]:
# Keyword arguments for TransformerEncoder: SST, `spectral` switched on.
# Note that dropout is 0.4 here, whereas the run without spectral normalisation uses 0.5.
params_sst_sn = dict(
    vocab_size=len(tokenizer),
    emb_dim=1024,
    n_layers=1,
    n_heads=8,
    forward_dim=2048,
    dropout=0.4,
    max_len=MAX_LEN,
    pad_idx=tokenizer.pad_id(),
    kind='sto',
    tau2=40,
    spectral=True,
    n_classes=2,
    device=device,
)

# Optimisation settings stored next to the model hyper-parameters.
params_to_save = dict(lr=LR, batch_size=BATCH_SIZE, optim='adam')

# Build the model on the selected device and report its size.
model = TransformerEncoder(**params_sst_sn).to(device)
print(f"Total model params: {total_model_params(model):,d}")
print(f"Trainable model params: {trainable_model_params(model):,d}")

# Cross-entropy loss with an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train the SST model with spectral normalisation. Every run writes its
# params.json, results.csv and checkpoints into a fresh timestamped directory.
path = '../../../params/udl/sto_sst_sn/'
now = datetime.now()
path_now = path + now.strftime("%Y-%m-%d %H:%M:%S") + '/'
os.makedirs(path_now)
logging_path = path_now + 'results.csv'

# Merge the model hyper-parameters into the optimisation settings and persist them.
params_to_save.update(params_sst_sn)

with open(path_now + 'params.json', 'w') as f:
    json.dump(params_to_save, f, indent=4)

highest_val_acc = 0

for epoch in range(1, N_EPOCHS + 1):
    # One-off step decay: from epoch 30 onwards train with lr = 5e-5.
    if epoch == 30:
        for g in optimizer.param_groups:
            g['lr'] = 5e-5
        # Report the change once, not once per parameter group.
        print('updated learning rate')

    # Training pass (optimizer supplied).
    model.train()
    train_loss, train_acc = process_train_eval(
        model, train_loader, criterion, optimizer
    )

    # Validation pass: eval mode, gradients disabled, no optimizer passed.
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = process_train_eval(model, val_loader, criterion)

    # Hand this epoch's metrics to the project logger, targeting results.csv.
    save_train_metrics(
        epoch,
        train_loss,
        train_acc,
        val_loss,
        val_acc,
        path=logging_path,
    )

    # Checkpoint whenever validation accuracy beats the best seen so far.
    if val_acc > highest_val_acc:
        highest_val_acc = val_acc
        _path = path_now + f"acc{val_acc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), _path)

    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.6f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.6f}]")