# Train model notebook

## Import libraries and setup

In [1]:
# When True, UserWarning messages are filtered out in the import cell.
DEBUG = True
# When True, progress lines in training/evaluation end with a newline instead of a carriage return.
YSDP = True
# Selects the value written to CUDA_VISIBLE_DEVICES: '0' when True, '1' otherwise.
KAGGLE = False

In [2]:
#!g1.1
import warnings
if DEBUG:
    warnings.filterwarnings('ignore', category=UserWarning)
import os
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from torch.utils.data.distributed import DistributedSampler
import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)
# Only the last verbosity call takes effect, so the transformers log level is set once.
logging.set_verbosity_error()
# Expose a single GPU to this process: index '0' when KAGGLE is set, index '1' otherwise.
os.environ['CUDA_VISIBLE_DEVICES'] = '0' if KAGGLE else '1'

In [3]:
# Run identifier; it is embedded in the model output directory name.
VER = 'vdsp1'
DATA_PATH = './data'
MDLS_PATH = f'./models_{VER}'
ROBERTA_TYPE = 'deepset/xlm-roberta-large-squad2' # 'deepset/xlm-roberta-base-squad2'
# Single source of truth for the run; a copy is written next to the checkpoints below.
CONFIG = {
    'folds': 5,
    'fold_train': None, # 'None' or '0', '1', ..., '4'
    'model_type': 'xlm_roberta',
    'model_name_or_path':ROBERTA_TYPE,
    'config_name': ROBERTA_TYPE,
    'apex': False,
    'grad_accum_steps': 2,
    'tokenizer_name': ROBERTA_TYPE,
    'max_seq_length': 256, # 256 or 384
    'doc_stride': 128, 
    'epochs': 100,
    'max_patience': 2,
    'train_batch_size': 4,
    'eval_batch_size': 8,
    'optimizer_type': 'AdamW',
    'learning_rate': 1.5e-5,
    'weight_decay': 1e-2,
    'epsilon': 1e-8,
    'max_grad_norm': 1, # '1' or 'None'
    'decay_name': 'linear-warmup',
    'optimizer_grouped_parameters': False,
    'warmup_ratio': .1,
    'logging_steps': 500,
    'output_dir': MDLS_PATH,
    'seed': None
}
# makedirs(exist_ok=True) avoids the check-then-create race of exists()+mkdir()
# and also creates missing parent directories.
os.makedirs(MDLS_PATH, exist_ok=True)
with open(f'{MDLS_PATH}/base_config.json', 'w') as file:
    json.dump(CONFIG, file)
        
def seed_all(seed):
    """Seed every random number source used in this notebook with ``seed``.

    Covers NumPy, the ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, and the torch CPU and CUDA generators.
    """
    for seed_host_rng in (np.random.seed, random.seed):
        seed_host_rng(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_torch_rng in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_torch_rng(seed)

def optimal_workers():
    """Pick a DataLoader worker count from the visible CPUs and GPUs.

    With at least one GPU the result is the smaller of the CPU count and four
    workers per GPU; without a GPU it is the CPU count minus one. The chosen
    value is printed before being returned.
    """
    cpu_total = multiprocessing.cpu_count()
    gpu_total = torch.cuda.device_count()
    if gpu_total:
        optimal_value = min(cpu_total, gpu_total * 4)
    else:
        optimal_value = cpu_total - 1
    print('optimal number of workers is', optimal_value)
    return optimal_value

# Wall-clock reference point; the last cell reports the time elapsed since this moment.
start_time = time.time()

## Load and preprocess data

In [4]:
# Main train and test tables read from DATA_PATH.
train = pd.read_csv(f'{DATA_PATH}/train.csv')
test = pd.read_csv(f'{DATA_PATH}/test.csv')
# Two additional CSVs (mlqa_hindi, xquad) stacked into one external frame.
# The index is not reset here, so it contains duplicates until the later
# concat(...).reset_index(drop=True) with the main training table.
external_mlqa = pd.read_csv(f'{DATA_PATH}/mlqa_hindi.csv')
external_xquad = pd.read_csv(f'{DATA_PATH}/xquad.csv')
external_train = pd.concat([external_mlqa, external_xquad])

def create_folds(data, num_splits):
    """Assign a stratified fold id to every row of ``data``.

    A ``kfold`` column is added to ``data`` (modified in place) and filled
    with the index of the fold in which the row is used for validation. The
    split is stratified on the ``language`` column and shuffled with
    ``CONFIG['seed']`` as random state.

    Args:
        data: DataFrame containing a ``language`` column.
        num_splits: number of folds.

    Returns:
        The same DataFrame object, now carrying the ``kfold`` column.
    """
    data['kfold'] = -1
    kf = model_selection.StratifiedKFold(
        n_splits=num_splits,
        shuffle=True,
        random_state=CONFIG['seed']
    )
    kfold_col = data.columns.get_loc('kfold')
    for fold_id, (_, val_positions) in enumerate(kf.split(X=data, y=data['language'])):
        # split() yields positional indices, so they must be written through
        # .iloc; .loc would interpret them as index labels and mis-assign
        # rows whenever the frame does not have a default RangeIndex.
        data.iloc[val_positions, kfold_col] = fold_id
    return data

# Only the main training rows receive real fold ids.
train = create_folds(train, num_splits=CONFIG['folds'])
# External rows are tagged with kfold=-1; since make_loader selects validation rows
# with kfold == fold, these rows always end up on the training side.
external_train['kfold'] = -1
# Sequential integer ids starting at 1 for the external rows.
external_train['id'] = list(np.arange(1, len(external_train) + 1))
train = pd.concat([train, external_train]).reset_index(drop=True)

def convert_answers(row):
    """Pack an (answer_start, answer_text) pair into the answers dict format.

    Args:
        row: a two-element pandas Series (as produced by ``DataFrame.apply``
            with ``axis=1``) or any plain sequence, holding the answer start
            offset first and the answer text second.

    Returns:
        ``{'answer_start': [start], 'text': [text]}``.
    """
    # Integer keys on a label-indexed Series rely on a deprecated positional
    # fallback in pandas; .iloc is explicitly positional. Plain sequences
    # (tuples, lists) keep using ordinary indexing.
    values = row.iloc if hasattr(row, 'iloc') else row
    return {'answer_start': [values[0]], 'text': [values[1]]}

# Build, per row, the {'answer_start': [...], 'text': [...]} dict that prepare_train_features reads from example['answers'].
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

In [5]:
#!g1.1
def prepare_train_features(config, example, tokenizer):
    """Tokenize one question/context example into fixed-length training windows.

    The question is passed as the first sequence and the context as the
    second; only the context is truncated, and overflowing context is split
    into additional windows using ``config['doc_stride']``. Every window is
    padded to ``config['max_seq_length']``.

    Args:
        config: dict providing ``max_seq_length`` and ``doc_stride``.
        example: mapping with ``question``, ``context`` and ``answers``
            (``{'answer_start': [...], 'text': [...]}``). Its ``question``
            entry is overwritten with the left-stripped question.
        tokenizer: callable tokenizer exposing ``cls_token_id``; its output
            must support ``sequence_ids(i)`` and offset mappings
            (presumably a Hugging Face fast tokenizer -- confirm).

    Returns:
        A list with one dict per window, holding ``input_ids``,
        ``attention_mask``, ``offset_mapping``, ``start_position`` and
        ``end_position``. Windows that do not fully contain the answer (or
        examples without answers) point both positions at the CLS token.
    """
    # Note: this mutates the passed-in example.
    example['question'] = example['question'].lstrip()
    tokenized_example = tokenizer(
        example['question'],
        example['context'],
        truncation='only_second',
        max_length=config['max_seq_length'],
        stride=config['doc_stride'],
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length'
    )
    # Both keys are removed from the tokenizer output; only the offsets are used below.
    sample_mapping = tokenized_example.pop('overflow_to_sample_mapping')
    offset_mapping = tokenized_example.pop('offset_mapping')
    features = []
    for i, offsets in enumerate(offset_mapping):
        feature = {}
        input_ids = tokenized_example['input_ids'][i]
        attention_mask = tokenized_example['attention_mask'][i]
        feature['input_ids'] = input_ids
        feature['attention_mask'] = attention_mask
        feature['offset_mapping'] = offsets
        # Position of the CLS token, used as the "no answer in this window" target.
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_example.sequence_ids(i)
        # NOTE(review): sample_index is assigned but never read in this function.
        sample_index = sample_mapping[i]
        answers = example['answers']
        if len(answers['answer_start']) == 0:
            feature['start_position'] = cls_index
            feature['end_position'] = cls_index
        else:
            # Character span of the (first) answer.
            start_char = answers['answer_start'][0]
            end_char = start_char + len(answers['text'][0])
            # First token whose sequence id is 1 (the second sequence, i.e. the context as passed above).
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            # Last token whose sequence id is 1.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # Answer not fully covered by this window's context span -> label with CLS.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                feature['start_position'] = cls_index
                feature['end_position'] = cls_index
            else:
                # Advance past every token starting at or before the answer start; the previous one is the start token.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                feature['start_position'] = token_start_index - 1
                # Walk back past every token ending at or after the answer end; the next one is the end token.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                feature['end_position'] = token_end_index + 1
        features.append(feature)
    return features

## Dataset retriever

In [6]:
#!g1.1
class DatasetRetriever(Dataset):
    """Map-style dataset over a list of pre-tokenized feature dicts.

    In ``'train'`` mode every field is returned as a ``torch.long`` tensor,
    including the offset mapping and the start/end positions. In any other
    mode only ``input_ids`` and ``attention_mask`` are tensors; the offset
    mapping, sequence ids, example id, context and question are passed
    through unchanged.
    """

    def __init__(self, features, mode='train'):
        super().__init__()
        self.mode = mode
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        record = self.features[item]

        def as_long(key):
            return torch.tensor(record[key], dtype=torch.long)

        sample = {key: as_long(key) for key in ('input_ids', 'attention_mask')}
        if self.mode != 'train':
            sample['offset_mapping'] = record['offset_mapping']
            sample['sequence_ids'] = record['sequence_ids']
            sample['id'] = record['example_id']
            sample['context'] = record['context']
            sample['question'] = record['question']
            return sample
        for key in ('offset_mapping', 'start_position', 'end_position'):
            sample[key] = as_long(key)
        return sample

## Build a model

In [7]:
#!g1.1
class Model(nn.Module):
    """Pretrained backbone with a linear head producing start/end logits.

    Args:
        modelname_or_path: identifier or path passed to
            ``AutoModel.from_pretrained``.
        config: model configuration object; ``hidden_size``,
            ``hidden_dropout_prob`` and ``initializer_range`` are read from it.
    """

    def __init__(self, modelname_or_path, config):
        super().__init__()
        self.config = config
        self.xlm_roberta = AutoModel.from_pretrained(
            modelname_or_path,
            config=config
        )
        # Two outputs per token: one start logit and one end logit.
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)

    def _init_weights(self, module):
        """Initialise a linear layer: normal weights (std from config), zero bias."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0,
                std=self.config.initializer_range
            )
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        attention_mask=None,
        # token_type_ids=None
    ):
        """Return ``(start_logits, end_logits)`` for a batch of token ids.

        The head is applied to the first element of the backbone output and
        its last dimension (size 2) is split into the two logit tensors.
        """
        outputs = self.xlm_roberta(
            input_ids,
            attention_mask=attention_mask,
        )
        # Only the first output is needed. The previous unused lookup of
        # outputs[1] was dropped because it can raise when the backbone
        # returns no second element.
        sequence_output = outputs[0]
        # sequence_output = self.dropout(sequence_output)
        qa_logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = qa_logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)

## Loss

In [8]:
#!g1.1
def loss_fn(preds, labels):
    """Average the cross-entropy of the start and end predictions.

    Args:
        preds: pair ``(start_logits, end_logits)``.
        labels: pair ``(start_targets, end_targets)``; targets equal to -1
            are ignored.

    Returns:
        Scalar tensor: mean of the start loss and the end loss.
    """
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    start_logits, end_logits = preds
    start_targets, end_targets = labels
    head_losses = [
        criterion(start_logits, start_targets),
        criterion(end_logits, end_targets),
    ]
    return (head_losses[0] + head_losses[1]) / 2

## Grouped Layerwise Learning Rate Decay

In [9]:
#!g1.1
def get_optimizer_grouped_parameters(config, model):
    """Build optimizer parameter groups with per-layer-band learning rates.

    Backbone parameters (``model.xlm_roberta``) are bucketed by whether their
    name matches ``layer.0.``-``layer.3.``, ``layer.4.``-``layer.7.`` or
    ``layer.8.``-``layer.11.``; the bands use the base learning rate divided
    by 2.6, unchanged, and multiplied by 2.6. Backbone parameters outside
    those bands get no explicit ``lr``. Each bucket is emitted twice: first
    with ``config['weight_decay']``, then (for bias / LayerNorm.weight
    parameters) with zero weight decay. The final group holds every model
    parameter whose name does not contain ``config['model_type']``, with
    zero weight decay and 20x the base learning rate.

    Returns:
        A list of nine parameter-group dicts.
    """
    base_lr = config['learning_rate']
    no_decay_tags = ('bias', 'LayerNorm.weight')
    layer_bands = [
        ([f'layer.{idx}.' for idx in range(0, 4)], base_lr / 2.6),
        ([f'layer.{idx}.' for idx in range(4, 8)], base_lr),
        ([f'layer.{idx}.' for idx in range(8, 12)], base_lr * 2.6),
    ]
    banded_tags = [tag for tags, _ in layer_bands for tag in tags]

    def matches(name, tags):
        return any(tag in name for tag in tags)

    def backbone_params(skip_decay, in_band):
        return [
            param
            for name, param in model.xlm_roberta.named_parameters()
            if matches(name, no_decay_tags) == skip_decay and in_band(name)
        ]

    groups = []
    for skip_decay in (False, True):
        decay = 0 if skip_decay else config['weight_decay']
        # Parameters outside every layer band: optimizer-level learning rate.
        groups.append({
            'params': backbone_params(
                skip_decay, lambda name: not matches(name, banded_tags)
            ),
            'weight_decay': decay,
        })
        for tags, band_lr in layer_bands:
            groups.append({
                'params': backbone_params(
                    skip_decay, lambda name, tags=tags: matches(name, tags)
                ),
                'weight_decay': decay,
                'lr': band_lr,
            })
    # Everything that is not part of the backbone (i.e. the task head).
    groups.append({
        'params': [
            param
            for name, param in model.named_parameters()
            if config['model_type'] not in name
        ],
        'weight_decay': 0,
        'lr': base_lr * 20,
    })
    return groups

## Metric logger

In [10]:
#!g1.1
class AverageMeter(object):
    """Track the latest value, weighted running average, and extremes of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        # Start from +/- infinity so that the first update always sets both
        # extremes, including for negative values or values above 1e5.
        self.max = float('-inf')
        self.min = float('inf')

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples.

        Args:
            val: metric value (typically a per-batch mean).
            n: number of samples the value was computed over.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against a zero total weight (e.g. an empty batch on a fresh meter).
        if self.count:
            self.avg = self.sum / self.count
        if val > self.max:
            self.max = val
        if val < self.min:
            self.min = val

## Utilities

In [11]:
#!g1.1
def make_model(config):
    """Load the model configuration and build the QA model from it.

    Returns:
        Tuple ``(model_config, model)``.
    """
    hf_config = AutoConfig.from_pretrained(config['config_name'])
    return hf_config, Model(config['model_name_or_path'], config=hf_config)

def make_optimizer(config, model):
    """Create the optimizer for ``model`` according to ``config``.

    With ``config['optimizer_grouped_parameters']`` set, the layer-banded
    groups from ``get_optimizer_grouped_parameters`` are used. Otherwise two
    groups are built: parameters whose name contains ``bias`` or
    ``LayerNorm.weight`` get zero weight decay, all others get
    ``config['weight_decay']``.

    Raises:
        ValueError: if ``config['optimizer_type']`` is not ``'AdamW'``
            (previously this surfaced as an UnboundLocalError on return).
    """
    if config['optimizer_type'] != 'AdamW':
        raise ValueError(
            f"unsupported optimizer_type: {config['optimizer_type']!r} "
            "(only 'AdamW' is implemented)"
        )
    if config['optimizer_grouped_parameters']:
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(config, model)
    else:
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': config['weight_decay'],
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0,
            },
        ]
    return AdamW(
        optimizer_grouped_parameters,
        lr=config['learning_rate'],
        eps=config['epsilon'],
        correct_bias=True
    )

def make_scheduler(
    config, optimizer,
    num_warmup_steps,
    num_training_steps
):
    """Create the warmup learning-rate schedule selected by ``config['decay_name']``.

    ``'cosine-warmup'`` selects the cosine schedule; any other value selects
    the linear schedule.
    """
    if config['decay_name'] == 'cosine-warmup':
        build_schedule = get_cosine_schedule_with_warmup
    else:
        build_schedule = get_linear_schedule_with_warmup
    return build_schedule(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

def make_loader(
    config, data,
    tokenizer, fold
):
    """Build the training and validation DataLoaders for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other
    rows form the training set. Every row is expanded into tokenized windows
    with ``prepare_train_features``. Training batches are sampled randomly,
    validation batches sequentially.

    Returns:
        Tuple ``(train_dataloader, val_dataloader)``.
    """
    def featurize(frame):
        return [
            feature
            for _, row in frame.iterrows()
            for feature in prepare_train_features(config, row, tokenizer)
        ]

    def build_loader(dataset, sampler, batch_size):
        return DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=sampler,
            num_workers=optimal_workers(),
            pin_memory=True,
            drop_last=False
        )

    train_rows = data[data['kfold'] != fold]
    val_rows = data[data['kfold'] == fold]
    train_dataset = DatasetRetriever(featurize(train_rows))
    val_dataset = DatasetRetriever(featurize(val_rows))
    print(f'num examples train: {len(train_dataset)}', 
          f'num examples val: {len(val_dataset)}')
    train_sampler = RandomSampler(train_dataset)
    val_sampler = SequentialSampler(val_dataset)
    train_dataloader = build_loader(
        train_dataset, train_sampler, config['train_batch_size']
    )
    val_dataloader = build_loader(
        val_dataset, val_sampler, config['eval_batch_size']
    )
    return train_dataloader, val_dataloader

## Trainer and evaluator

In [12]:
#!g1.1
class Trainer:
    """Runs training epochs for a model with a given optimizer and scheduler.

    The tokenizer is stored on the instance but not used by ``train``.
    """

    def __init__(
        self, model, tokenizer,
        optimizer, scheduler
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train(
        self, config,
        train_dataloader,
        epoch, result_dict
    ):
        """Train for one epoch and append the mean training loss to ``result_dict``.

        Gradients are accumulated over ``config['grad_accum_steps']`` batches.
        Clipping, the optimizer step and the scheduler step all happen once
        per accumulation window (and once more on the final batch of the
        epoch), so the scheduler advances exactly once per optimizer step.
        With ``config['apex']`` set, the forward pass runs under CUDA
        autocast and gradients go through a ``GradScaler``.

        Args:
            config: run configuration dict.
            train_dataloader: loader yielding dicts with ``input_ids``,
                ``attention_mask``, ``start_position`` and ``end_position``.
            epoch: epoch index, used for logging only.
            result_dict: dict with a ``train_loss`` list to append to.

        Returns:
            ``result_dict`` with the epoch's average loss appended.
        """
        print('=' * 10, f'train {epoch} epoch', '=' * 10)
        count = 0
        losses = AverageMeter()
        self.model.zero_grad()
        self.model.train()
        # `is not None` so that a seed of 0 is honoured as well.
        # NOTE(review): reseeding with the same value at the start of every
        # epoch looks like it replays the same random stream each epoch --
        # confirm this is intended.
        if config['seed'] is not None:
            seed_all(config['seed'])
        use_amp = bool(config['apex'])
        # A disabled GradScaler is a pass-through, so one code path serves
        # both the mixed-precision and the full-precision case.
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
        accum_steps = config['grad_accum_steps']
        num_batches = len(train_dataloader)
        num_samples = len(train_dataloader.sampler)
        for batch_idx, batch_data in enumerate(train_dataloader):
            input_ids = batch_data['input_ids'].cuda()
            attention_mask = batch_data['attention_mask'].cuda()
            targets_start = batch_data['start_position'].cuda()
            targets_end = batch_data['end_position'].cuda()
            # Only the forward pass and loss run under autocast.
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs_start, outputs_end = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                )
                loss = loss_fn((outputs_start, outputs_end),
                               (targets_start, targets_end))
            # Scale down so the accumulated gradient is an average over the window.
            scaler.scale(loss / accum_steps).backward()
            is_last_batch = (batch_idx + 1) == num_batches
            # (batch_idx + 1) so that a full window is accumulated before the
            # first step; the previous check stepped on the very first batch.
            if (batch_idx + 1) % accum_steps == 0 or is_last_batch:
                if config['max_grad_norm']:
                    # Clip the true (unscaled) accumulated gradients once per step.
                    scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(),
                        config['max_grad_norm']
                    )
                scaler.step(self.optimizer)
                scaler.update()
                self.optimizer.zero_grad()
                # Advance the schedule once per optimizer step, matching how
                # the total number of training steps is counted.
                self.scheduler.step()
            batch_size = input_ids.size(0)
            count += batch_size
            # Log the undivided loss so it is comparable with the validation loss.
            losses.update(loss.item(), batch_size)
            if (batch_idx % config['logging_steps'] == 0) or is_last_batch:
                _s = str(len(str(num_samples)))
                ret = [
                    ('epoch: {: >2} [{: >' + _s + '}/{} ({: >3.0f}%)]').format(
                        epoch,
                        count,
                        num_samples,
                        100 * count / num_samples
                    ),
                    'train loss: {: >4.5f}'.format(losses.avg),
                ]
                print(', '.join(ret), end='\n' if YSDP else '\r')
        print()
        result_dict['train_loss'].append(losses.avg)
        return result_dict

In [13]:
#!g1.1
class Evaluator:
    """Computes the validation loss of a model and persists run results."""

    def __init__(self, model):
        self.model = model

    def save(self, result, output_dir):
        """Write ``result`` as indented JSON to ``<output_dir>/result_dict.json``.

        The directory is created if it does not exist yet, so saving also
        works when no checkpoint was ever written there.
        """
        os.makedirs(output_dir, exist_ok=True)
        with open(f'{output_dir}/result_dict.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(
                result,
                sort_keys=True,
                indent=4,
                ensure_ascii=False))

    def evaluate(
        self, config,
        val_dataloader,
        epoch, result_dict
    ):
        """Run one pass over ``val_dataloader`` and append the mean loss.

        The model is switched to eval mode and no gradients are recorded.

        Args:
            config: run configuration dict (``logging_steps`` is read).
            val_dataloader: loader yielding dicts with ``input_ids``,
                ``attention_mask``, ``start_position`` and ``end_position``.
            epoch: epoch index, used for logging only.
            result_dict: dict with a ``val_loss`` list to append to.

        Returns:
            ``result_dict`` with the average validation loss appended.
        """
        count = 0
        losses = AverageMeter()
        # Switching to eval mode once is enough; it does not change per batch.
        self.model.eval()
        num_batches = len(val_dataloader)
        num_samples = len(val_dataloader.sampler)
        with torch.no_grad():
            for batch_idx, batch_data in enumerate(val_dataloader):
                input_ids = batch_data['input_ids'].cuda()
                attention_mask = batch_data['attention_mask'].cuda()
                targets_start = batch_data['start_position'].cuda()
                targets_end = batch_data['end_position'].cuda()
                outputs_start, outputs_end = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                )
                loss = loss_fn((outputs_start, outputs_end),
                               (targets_start, targets_end))
                count += input_ids.size(0)
                losses.update(loss.item(), input_ids.size(0))
                if (batch_idx % config['logging_steps'] == 0) or (batch_idx + 1) == num_batches:
                    _s = str(len(str(num_samples)))
                    ret = [
                        ('epoch: {: >2} [{: >' + _s + '}/{} ({: >3.0f}%)]').format(
                            epoch,
                            count,
                            num_samples,
                            100 * count / num_samples
                        ),
                        'val loss: {: >4.5f}'.format(losses.avg),
                    ]
                    print(', '.join(ret), end='\n' if YSDP else '\r')
        print()
        result_dict['val_loss'].append(losses.avg)
        return result_dict

## Run engine

In [14]:
#!g1.1
def init_training(config, data, tokenizer, fold):
    """Create everything needed to train one fold.

    Seeds the RNGs (when a seed is configured), ensures the output directory
    exists, builds the model on the GPU, the data loaders, the optimizer and
    the learning-rate scheduler, and prepares an empty result dict.

    The number of training steps is the number of optimizer steps:
    ``ceil(batches_per_epoch / grad_accum_steps) * epochs``; warmup is
    ``warmup_ratio`` of that.

    Returns:
        Tuple ``(model, model_config, optimizer, scheduler,
        train_dataloader, val_dataloader, result_dict)``.

    Raises:
        ValueError: if no CUDA device is available.
    """
    # `is not None` so that a seed of 0 is honoured as well.
    if config['seed'] is not None:
        seed_all(config['seed'])
    os.makedirs(config['output_dir'], exist_ok=True)
    model_config, model = make_model(config)
    if torch.cuda.device_count() >= 1:
        print('model pushed to {} GPU(s), type {}.'.format(
            torch.cuda.device_count(),
            torch.cuda.get_device_name(0))
        )
        model = model.cuda()
    else:
        raise ValueError('CPU training is not supported')
    train_dataloader, val_dataloader = make_loader(config, data, tokenizer, fold)
    optimizer = make_optimizer(config, model)
    num_training_steps = math.ceil(
        len(train_dataloader) / config['grad_accum_steps']
    ) * config['epochs']
    if config['warmup_ratio'] > 0:
        num_warmup_steps = int(config['warmup_ratio'] * num_training_steps)
    else:
        num_warmup_steps = 0
    print(f'total train steps: {num_training_steps}',
          f'| total warmup steps: {num_warmup_steps}')
    scheduler = make_scheduler(
        config,
        optimizer,
        num_warmup_steps,
        num_training_steps
    )
    result_dict = {
        'epoch':[],
        'train_loss': [],
        'val_loss' : [],
        'best_val_loss': np.inf
    }
    return (model, model_config, optimizer,
            scheduler, train_dataloader,
            val_dataloader, result_dict)

In [15]:
#!g1.1
def _print_timing(label, durations):
    """Print the total and the per-epoch average of ``durations`` (seconds)."""
    total, mean = np.sum(durations), np.mean(durations)
    print(
        f'total {label} time: {total // 60:.0f} min',
        f'{total % 60:.0f} secs | ', 
        f'average per epoch: {mean // 60:.0f} min',
        f'{mean % 60:.0f} secs'
    )


def run(data, tokenizer, fold, epochs, max_patience):
    """Train and validate one fold with early stopping on the validation loss.

    After every epoch the validation loss is compared with the best one so
    far; on improvement the model weights and configuration are saved under
    ``<CONFIG['output_dir']>/checkpoint-fold-<fold>``. Training stops after
    ``max_patience`` consecutive epochs without improvement. The result dict
    is written to the same directory at the end, and timing statistics are
    printed.

    Args:
        data: DataFrame with a ``kfold`` column.
        tokenizer: tokenizer passed through to feature preparation.
        fold: index of the validation fold.
        epochs: maximum number of epochs.
        max_patience: epochs without improvement tolerated before stopping.
    """
    model, model_config, \
        optimizer, scheduler, train_dataloader, \
            val_dataloader, result_dict = init_training(CONFIG, data, 
                                                        tokenizer, fold)
    trainer = Trainer(model, tokenizer, optimizer, scheduler)
    evaluator = Evaluator(model)
    # The checkpoint directory does not depend on the epoch. Creating it up
    # front guarantees the final result dump has a place to go even when no
    # checkpoint is ever saved (e.g. a NaN validation loss) or epochs == 0.
    output_dir = os.path.join(
        CONFIG['output_dir'], 
        f'checkpoint-fold-{fold}'
    )
    os.makedirs(output_dir, exist_ok=True)
    train_time_list = []
    val_time_list = []
    n_patience = 0
    for epoch in range(epochs):
        result_dict['epoch'].append(epoch)
        # synchronize() so the timings include all queued GPU work.
        torch.cuda.synchronize()
        ep_time = time.time()
        result_dict = trainer.train(
            CONFIG, train_dataloader, 
            epoch, result_dict
        )
        torch.cuda.synchronize()
        train_time_list.append(time.time() - ep_time)
        torch.cuda.synchronize()
        ep_time = time.time()
        result_dict = evaluator.evaluate(
            CONFIG, val_dataloader, 
            epoch, result_dict
        )
        torch.cuda.synchronize()
        val_time_list.append(time.time() - ep_time)
        if result_dict['val_loss'][-1] < result_dict['best_val_loss']:
            print('{} epoch -> best epoch updated, val loss: {: >4.5f}'.format(
                epoch, 
                result_dict['val_loss'][-1]
            ))
            result_dict['best_val_loss'] = result_dict['val_loss'][-1]        
            torch.save(model.state_dict(), f'{output_dir}/pytorch_model.bin')
            model_config.save_pretrained(output_dir)
            model_config.save_pretrained(MDLS_PATH)
            print(f'saving model checkpoint to {output_dir}')
            n_patience = 0
        else:
            n_patience += 1
        if n_patience >= max_patience:
            print(f'no val loss improvement for last {n_patience} epochs')
            break
    evaluator.save(result_dict, output_dir)
    _print_timing('train', train_time_list)
    _print_timing('val', val_time_list)
    # Drop local references before clearing the CUDA cache for the next fold.
    del trainer, evaluator, model, model_config, tokenizer, \
        optimizer, scheduler, train_dataloader, val_dataloader, result_dict
    torch.cuda.empty_cache()
    gc.collect()

In [16]:
#!g1.1
# Load the tokenizer once and store a copy next to the checkpoints.
TOKENIZER = AutoTokenizer.from_pretrained(CONFIG['tokenizer_name'])
TOKENIZER.save_pretrained(CONFIG['output_dir'])

# Compare against None explicitly: a plain truthiness test treats fold 0 as
# "not set" and would train every fold. int() also accepts the fold given as
# a string such as '0'.
if CONFIG['fold_train'] is not None:
    start_fold = int(CONFIG['fold_train'])
    end_fold = start_fold + 1
else:
    start_fold = 0 
    end_fold = CONFIG['folds']
for fold in range(start_fold, end_fold):
    content = ' '.join(['=' * 20, f'FOLD: {fold}', '=' * 20])
    print('=' * len(content))
    print(content)
    print('=' * len(content))
    run(train, TOKENIZER, fold, CONFIG['epochs'], CONFIG['max_patience'])

elapsed_time = time.time() - start_time
print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')

In [17]:
#!g1.1
