In [1]:
!git clone https://github.com/whyamistudyingcs/FYP_smoothing.git

Cloning into 'FYP_smoothing'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 35 (delta 11), reused 12 (delta 3), pack-reused 0[K
Unpacking objects: 100% (35/35), 12.41 KiB | 1.24 MiB/s, done.


In [2]:
import torch
args = {
    'exp_dir': "./experiments/cls/",
    'exp_msg': "CLS Transformer",
    'gpu_idx': 10,

    'eval': True, #
    'model_dir_path': '/kaggle/input/imdb-bert-5', #
    'save_model': 'cls_trans', #
    'load_model': 'cls_trans_4', #
    'save': True, #

    'seed': 42, # 
    'device': "cuda" if torch.cuda.is_available() else "cpu", #

    'w_gn': 1.0,                    # embedding noise inference
    'noise_eps': 0.2,               # noise inference
    'single_layer': False,  #
    'nth_layers': 2,                # interval of injecting noise

    'num_ensemble': 10, # k_0
    'binom_ensemble': 50, # k_1
    'pooler_output': False, #
    'custom_forward': True,         # noise forwarding
    'binom_n_eval': 5,

    'sample_mask': False,
    'mask_batch_ratio': 1.0,        # percentage of masking , max is multi mask
    'rand_mask': False,
    'grad_mask_sample': False,
    'ens_grad_mask': 'rand',
    'two_step': False,
    'multi_mask': 2,                # number of masks
    'mask_idx': 103, #
    'vote_type': 'avg',

    'epochs': 5,
    'batch_size': 16, #
    'model': 'bert',                # model
    'optim': 'adamw',
    'scheduler': 'linear',
    'lr': 0.00001,
    'dropout': 0.1,
    'clip': 1.0,                    # clip gradient 
    'margin': 0.5,

    'embed_dim': 768,

    'dataset': 'imdb',#
    'num_classes': 2, #
    'pad_idx': 0,
    'max_seq_length': 256, #

    'binom_p': False, #
    'alpha_p': 0.70 #
}


In [3]:
import sys
sys.path.append('/kaggle/working/FYP_smoothing')

import torch
from transformers import AdamW
import sys
import numpy as np
import random
import time, datetime
from datetime import timedelta
from model.adv_model import *
from model.load_model import *

from utils.utils import *
from utils.dataloader import *

if args['seed']>0:
    SEED = args['seed']
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

tokenizer = load_tokenizer(args)
train_dataloader, test_dataloader, dev_dataloader = trans_dataloader(args['dataset'], tokenizer, args)

print(f"Dataset classes: {args['num_classes']}") 
train_niter = len(train_dataloader)
total_iter = len(train_dataloader) * args['epochs']
# Create Model 
print(f"Load Model...")
model = noisy_forward_loader(args)
model = SeqClsWrapper(model, args)

if args['eval'] == True:
    model = load_checkpoint(model, args['load_model'], args['model_dir_path'])
    model.to(args['device'])
    model.eval()
    
    TP = 0
    n_samples = len(test_dataloader.dataset)
    
    start_t_gen = time.perf_counter()
    print("Start Evaluation....")
    for batch_idx, batch in enumerate(test_dataloader):
        
        input_ids = batch['input_ids'].to(args['device'])
        attention_mask = batch['attention_mask'].to(args['device'])
        labels = batch['labels'].to(args['device'])
        
        if args['num_ensemble'] > 1:
            mask_indices, _ = model.grad_mask(input_ids, attention_mask)
            logits = model.two_step_ensemble(input_ids, attention_mask, mask_indices, args['num_ensemble'], args['binom_ensemble'])
            correct = logits.argmax(dim=-1).eq(labels)
            TP += correct.sum().item()
        else:
            if args['multi_mask'] > 0:
                mask_indices, _ = model.grad_mask(input_ids, attention_mask)
                masked_ids = input_masking_function(input_ids, mask_indices, args)
                with torch.no_grad():
                    output = model(masked_ids, attention_mask)
            
            else:
                with torch.no_grad():
                    output = model(input_ids, attention_mask)
            
            preds = output['logits']
            correct = preds.argmax(dim=-1).eq(labels)
            TP += correct.sum().item()
    
    acc = 100 * (TP / n_samples)
    
    eval_t = time.perf_counter() - start_t_gen
    log = f"Test Acc: {acc:.4f}"
    print(log)
    print(f"Total Evaluation Time: {timedelta(seconds=eval_t)}") 
    sys.exit(0)
    
else:
    model.to(args['device'])
    model.train()
    optimizer = AdamW(model.parameters(), lr=args['lr'])

print("Start Training...")
start_train = time.perf_counter()

# TODO: print args

best_dev_epoch = 0
best_dev_acc = 0
for epoch in range(args['epochs']):
    model.train()
    loss_epoch = []
    loss_ood_epoch = []
    
    for batch_idx, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(args['device'])
        attention_mask = batch['attention_mask'].to(args['device'])
        labels = batch['labels'].to(args['device'])
        
        model.eval()
        indices, delta_grad = model.grad_mask(input_ids, attention_mask, pred=labels, mask_filter=True)
        model.zero_grad() # ensure no gradient left after grad mask call
        # print(delta_grad.shape)
        
        masked_ids = input_masking_function(input_ids, indices, args) # [batch, seq_length]
        
        model.train()
        output = model(masked_ids, attention_mask, labels, delta_grad, indices)
        
        loss = output['loss']
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip'])
        optimizer.step()
        loss_epoch.append(loss.item())
        
        if batch_idx % 100 == 0:
            log = f"Epoch: {epoch} || Iter: {batch_idx} || Loss: {np.mean(loss_epoch[-100:]):.3f}"
            print(log)
        
        # scheduler step
        curr = epoch * train_niter + batch_idx
        LinearScheduler(optimizer, total_iter, curr, args['lr'])
        
    log = f"\nEpoch: {epoch} || Loss: {np.mean(loss_epoch):.3f}"
    print(log)
    
    dev_acc = model_evaluation(model, dev_dataloader,args, eval_mode='dev')
    
    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        best_dev_epoch = epoch
        if args["save"]:
            save_checkpoint(args["save_model"], model, epoch, ckpt_dir=args['model_dir_path'])
    
    log = f"Epoch: {epoch} || Dev Acc: {dev_acc:.4f} || BestDevAcc: {best_dev_acc:.4f} || BestEpoch: {best_dev_epoch}"
    print(log)

end_train = time.perf_counter() - start_train
log = f"Total Training Time: {timedelta(seconds=end_train)}"
print(log)

print("Start testSet Evaluation...")
load_model_name = args['save_model'] + f"_{best_dev_epoch}"
print(f"Load BestDev Model...: {load_model_name}") 

model = noisy_forward_loader(args)
model = SeqClsWrapper(model, args)
model = load_checkpoint(model, load_model_name, args['model_dir_path'])
model.to(args['device'])
model.eval()

test_acc = model_evaluation(model, test_dataloader, args, eval_mode='test')
log = f"TestAcc: {test_acc:.4f} || BestDevAcc: {best_dev_acc:.4f}"
print(log) 
print("End Training...")

Load Tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizer: bert || PAD: 0 || MASK: 103


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 21.0M/21.0M [00:00<00:00, 38.0MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 76.9MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:00<00:00, 127MB/s] 


Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Trainset Size: 1125
Testset Size: 750
Devset Size: 125
Dataset classes: 2
Load Model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load: cls_trans_4
Start Evaluation....
Test Acc: 92.0000
Total Evaluation Time: 0:01:15.892583


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
