# modeling14-5-FiD-encoder-sentence-level-classifier-from-scratch
- modeling sentence-classifier
- FiD-encoder
- python script

In [1]:
from pprint import pprint
import os

In [2]:
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [3]:
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
import heapq
import pickle
import pathlib
import shutil
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from pprint import pprint
from tqdm.auto import tqdm
from src.data import (
    BinaryCustomDatasetShuffle,
    BinarySentenceDataset,
    BinaryCustomDatasetDecisiveBinaryGold,
    BinaryCustomDatasetPredictionShuffle,
    SentenceClassificationDataset,
    EncoderSentenceClassificationDataset
)

from functools import partial
import json
import math
import os
import logging
import sys
import evaluate
from util import utils

import transformers
import torch
import numpy as np
import random
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoConfig,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    set_seed,
    get_scheduler,
)
from util.arguments import ModelArguments, DataTrainingArguments, CustomTrainingArguments
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from sentence_transformers import SentenceTransformer
from FiD.src.model import FiDT5
from src.model import SentenceLSTM

NEW_LINE = "\n"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

DATASET_MAPPING = {
    "BinaryCustomDatasetShuffle" : BinaryCustomDatasetShuffle,
    "BinarySentenceDataset" : BinarySentenceDataset,
    'BinaryCustomDatasetDecisiveBinaryGold' : BinaryCustomDatasetDecisiveBinaryGold,
    'BinaryCustomDatasetPredictionShuffle' : BinaryCustomDatasetPredictionShuffle,
    'SentenceClassificationDataset' : SentenceClassificationDataset,
    'EncoderSentenceClassificationDataset' : EncoderSentenceClassificationDataset
}
EMBEDDING_ARC_MAPPING = {
    "SentenceTransformer" : SentenceTransformer,
     "FiDT5" : FiDT5
}

In [4]:
parser = HfArgumentParser(
    (ModelArguments, DataTrainingArguments, CustomTrainingArguments)
)

In [5]:
model_args, data_args, train_args = parser.parse_args_into_dataclasses([])

In [6]:
train_args.with_tracking = True
train_args.report_to = 'wandb'
train_args.wandb_project = 'sequence_classifier'
train_args.run_name = 'TESTING-FiD-Encoder-lstm-sequence_exclude_no_answer_exclude_indecisve-test'
train_args.output_dir = '/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/TESTING-FiD-Encoder-lstm-sequence_exclude_no_answer_exclude_indecisve-test'
train_args.seed = 42
train_args.num_layers = 2
train_args.drop_out_rate = 0.2
train_args.padding = -100
train_args.per_device_train_batch_size = 2
train_args.checkpointing_steps = '10'
train_args.num_train_epochs = 100
train_args.best_metric = 'f1'


In [7]:
print(train_args.learning_rate)
print(train_args.adam_beta1)
print(train_args.adam_beta2)
print(train_args.adam_epsilon)
print(train_args.gradient_accumulation_steps)
print(train_args.lr_scheduler_type)
print(train_args.num_warmup_steps)
# print(train_args.max_train_steps)
print(train_args.class_weights)
print(train_args.train_loss_steps)

5e-05
0.9
0.999
1e-08
1
linear
0
False
10


In [8]:
model_args.embedding = 1024
model_args.max_seq_length = 200

In [9]:
# model_args.model_architecture = "SentenceTransformer"
# model_args.model_name_or_path = 'all-MiniLM-L6-v2'

In [10]:
data_args.train_file = '/data/philhoon-relevance/binary-classification/NQ-DEV-DPR/5-fold/1/sequence_exclude_no_answer_exclude_indecisve/testing-sequence_exclude_no_answer_exclude_indecisve_ctx100id_split_train_1.pickle'
data_args.eval_file = '/data/philhoon-relevance/binary-classification/NQ-DEV-DPR/5-fold/1/sequence_exclude_no_answer_exclude_indecisve/testing-sequence_exclude_no_answer_exclude_indecisve_ctx100id_split_train_1.pickle'
data_args.dataset_class = 'EncoderSentenceClassificationDataset'


In [11]:
logger = get_logger(__name__)

accelerator = (
    Accelerator(log_with=train_args.report_to, logging_dir=train_args.output_dir) if train_args.with_tracking else Accelerator()
)

In [12]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

In [13]:
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
    transformers.utils.logging.set_verbosity_info()
else:
    transformers.utils.logging.set_verbosity_error()

if train_args.seed is not None:
    set_seed(train_args.seed)

01/08/2023 22:44:40 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Mixed precision type: no



In [14]:
if accelerator.is_main_process and train_args.output_dir is not None:
    os.makedirs(train_args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()

In [15]:
train_args.output_dir

'/data/philhoon-relevance/binary-classification/results/NQ-DEV-DPR/5-fold/1/TESTING-FiD-Encoder-lstm-sequence_exclude_no_answer_exclude_indecisve-test'

In [16]:
# if model_args.model_architecture in EMBEDDING_ARC_MAPPING:
#     embedding_model = EMBEDDING_ARC_MAPPING[model_args.model_architecture](model_args.model_name_or_path)
#     model_args.embedding = 384
#     model_args.max_seq_length = 256

In [17]:
model = SentenceLSTM(num_layers = train_args.num_layers, 
                     embedding_size = model_args.embedding, 
                     num_labels = data_args.num_labels,
                     drop_out_rate = train_args.drop_out_rate
                    )

In [None]:
train_file = data_args.train_file
eval_file = data_args.eval_file

In [None]:
# Deserialize the train / eval splits from their pickle files.
# SECURITY: unpickling runs arbitrary code stored in the file, so train_file
# and eval_file must only ever point at pickles from a trusted source.
train_data = pickle.loads(pathlib.Path(train_file).read_bytes())
eval_data = pickle.loads(pathlib.Path(eval_file).read_bytes())

In [None]:
# train_data = utils.open_json(train_file)
# eval_data = utils.open_json(eval_file)

In [None]:
# seq_train_data = utils.prepare_sequential_data(train_data)
# seq_eval_data = utils.prepare_sequential_data(eval_data)

In [None]:
# DataSetClass = DATASET_MAPPING[data_args.dataset_class]

In [None]:
# Shuffled Here
train_dataset = EncoderSentenceClassificationDataset(train_data)
eval_dataset = EncoderSentenceClassificationDataset(eval_data)

In [None]:
# for index in random.sample(range(len(train_dataset)), 5):
#     logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

In [None]:
def _pad_to_length(tensor, length, padding):
    """Right-pad ``tensor`` along dim 0 with ``padding`` up to ``length`` rows."""
    missing = length - tensor.shape[0]
    if missing <= 0:
        return tensor
    # new_full keeps the dtype and device of the tensor being padded, so the
    # concatenation never relies on implicit type promotion.
    filler = tensor.new_full((missing, *tensor.shape[1:]), padding)
    return torch.cat([tensor, filler])


def custom_collate(batch, padding):
    """Collate variable-length samples into right-padded, stacked tensors.

    Every sample in ``batch`` is a mapping holding an ``'input_embedding'``
    tensor and an ``'em_pattern'`` label tensor.  Both are padded along
    dimension 0 up to the longest ``'em_pattern'`` in the batch and stacked.

    Args:
        batch: list of samples as yielded by the dataset.
        padding: fill value used for padded embedding rows and padded label
            slots.  (Previously this argument was silently overridden by a
            hard-coded -100.)

    Returns:
        dict with keys
        ``'inputs'`` (stacked padded embeddings),
        ``'labels'`` (stacked padded labels) and
        ``'sequence_len'`` (1-D tensor with each sample's unpadded length,
        taken from its ``'em_pattern'``).
    """
    train_lst = [b['input_embedding'] for b in batch]
    label_lst = [b['em_pattern'] for b in batch]
    seq_len_lst = [label.shape[0] for label in label_lst]
    max_seq_len = max(seq_len_lst)

    inputs = torch.stack([_pad_to_length(emb, max_seq_len, padding) for emb in train_lst])
    labels = torch.stack([_pad_to_length(lab, max_seq_len, padding) for lab in label_lst])

    return {
        'inputs': inputs,
        'labels': labels,
        'sequence_len': torch.tensor(seq_len_lst),
    }

In [None]:
train_dataloader = DataLoader(train_dataset,
                          shuffle=False,
                              collate_fn= partial(custom_collate, padding = train_args.padding),
                              batch_size=train_args.per_device_train_batch_size,
                              )

In [None]:
eval_dataloader = DataLoader(eval_dataset,
                              shuffle = False,
                              collate_fn= partial(custom_collate, padding = train_args.padding),
                              batch_size=train_args.per_device_eval_batch_size,
                            )

In [None]:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": train_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

In [None]:
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                              lr=train_args.learning_rate,
                              betas=(train_args.adam_beta1, train_args.adam_beta2),
                              eps=train_args.adam_epsilon,
                              )

In [None]:
len(train_dataloader)

In [None]:
# Scheduler and math around the number of training steps.
overrode_max_train_steps = False
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / train_args.gradient_accumulation_steps)
if train_args.max_train_steps is None:
    train_args.max_train_steps = train_args.num_train_epochs * num_update_steps_per_epoch
    overrode_max_train_steps = True

In [None]:
lr_scheduler = get_scheduler(
    name=train_args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=train_args.num_warmup_steps,
    num_training_steps=train_args.max_train_steps,
)

In [None]:
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

In [None]:
# We need to recalculate our total training steps as the size of the training dataloader may have changed
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / train_args.gradient_accumulation_steps)
if overrode_max_train_steps:
    train_args.max_train_steps = train_args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
train_args.num_train_epochs = math.ceil(train_args.max_train_steps / num_update_steps_per_epoch)

In [None]:
# Normalise the checkpoint interval: a digit-only string such as '10' is
# converted to an int; every other value (None, a non-numeric string, or a
# value that is already an int) is kept unchanged.  The isinstance guard keeps
# an int setting from failing on the str-only .isdigit() call.
checkpointing_steps = train_args.checkpointing_steps
if isinstance(checkpointing_steps, str) and checkpointing_steps.isdigit():
    checkpointing_steps = int(checkpointing_steps)

In [None]:
if train_args.with_tracking:
    experiment_config = vars(train_args)

    accelerator.init_trackers(train_args.wandb_project, config=experiment_config,
                              init_kwargs={"wandb": {"name": train_args.run_name}})

In [None]:
metric_acc = evaluate.load("accuracy")
metric_pre = evaluate.load('precision')
metric_re = evaluate.load('recall')
metric_f1 = evaluate.load('f1')

In [None]:
# Train!
total_batch_size = train_args.per_device_train_batch_size * accelerator.num_processes * train_args.gradient_accumulation_steps

logger.info("***** Running training *****")
logger.info(f"  Num examples = {len(train_dataset)}")
logger.info(f"  Num Epochs = {train_args.num_train_epochs}")
logger.info(f"  Instantaneous batch size per device = {train_args.per_device_train_batch_size}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Gradient Accumulation steps = {train_args.gradient_accumulation_steps}")
logger.info(f"  Total optimization steps = {train_args.max_train_steps}")

In [None]:
# Dump the three argument namespaces as JSON files inside the run's output
# directory so the exact configuration is stored next to its results.
train_dict = vars(train_args)
model_dict = vars(model_args)
data_dict = vars(data_args)

for message, file_name, payload in (
    (f"  Saving training_args = {train_dict}", "train_args.json", train_dict),
    (f"  Saving model_args = {model_dict}", "model_args.json", model_dict),
    (f"  Saving data_args = {data_dict}", "data_args.json", data_dict),
):
    logger.info(message)
    with open(os.path.join(train_args.output_dir, file_name), "w") as f:
        json.dump(payload, f)

In [None]:
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(train_args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0

# Using heap for limiting number of saved models
model_heap = []
heapq.heapify(model_heap)

In [None]:
def eval(model, eval_dataloader, accelerator, metric_acc, metric_pre, metric_re, metric_f1,
         train_args, epoch, steps, output_dir, logger):
    """Evaluate ``model`` on ``eval_dataloader`` and write/log the metrics.

    The model is switched to eval mode (the caller is responsible for calling
    ``model.train()`` again).  For every batch the loss and the arg-max
    predictions are computed, padded label positions are dropped, and the
    remaining token-level predictions feed the four ``evaluate`` metrics.
    A second set of metrics is computed with the labels swapped
    (1 -> 0, everything else -> 1).

    NOTE: the function name shadows the ``eval`` builtin; it is kept because
    the training loop calls it under this name.

    Args:
        model: callable as ``model(inputs, sequence_len)`` returning logits.
        eval_dataloader: yields dicts with ``'inputs'``, ``'labels'`` and
            ``'sequence_len'``.
        accelerator: ``accelerate.Accelerator`` used for gathering and logging.
        metric_acc, metric_pre, metric_re, metric_f1: ``evaluate`` metrics
            exposing ``add_batch`` / ``compute``.
        train_args: reads ``class_weights``, ``padding`` and ``with_tracking``.
        epoch: current epoch, used in log lines and file names.
        steps: number of completed optimizer steps, used likewise.
        output_dir: existing directory that receives the two JSON result files.
        logger: logger used for the summary lines.

    Returns:
        ``(result_log, output_dir)`` where ``result_log`` is the dict written
        to ``epoch{epoch}_steps{steps}_results.json``.
    """
    eval_progress_bar = tqdm(range(len(eval_dataloader)), disable=not accelerator.is_local_main_process)

    # The loss module is loop-invariant, so build it once.  It ignores the
    # same padding value that is used further down to mask the labels.
    if train_args.class_weights:
        # NOTE(review): `class_weights` is neither a parameter nor defined in
        # the visible notebook; it must exist as a global tensor for this
        # branch to work -- confirm before enabling train_args.class_weights.
        criterion = torch.nn.CrossEntropyLoss(
            weight=class_weights, reduction='mean', ignore_index=train_args.padding
        )
    else:
        criterion = torch.nn.CrossEntropyLoss(ignore_index=train_args.padding)
    # Follow the accelerator's device instead of assuming CUDA is available.
    criterion = criterion.to(accelerator.device)

    # Accumulated unconditionally: the summary below always reports the loss,
    # also when tracking is disabled.
    eval_loss = 0.0
    model.eval()
    samples_seen = 0
    prediction_lst = []
    reference_lst = []

    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            logits = model(batch['inputs'], batch['sequence_len'])
            loss = criterion(logits.view(-1, logits.shape[-1]), batch['labels'].view(-1))
        eval_loss += loss.detach().float()

        predictions = logits.argmax(dim=-1)
        references = batch['labels']

        # Batches are padded to their own longest sequence, so processes can
        # hold different widths; equalise them before gathering.  Both calls
        # are no-ops in a single-process run.
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=train_args.padding)
        references = accelerator.pad_across_processes(references, dim=1, pad_index=train_args.padding)
        predictions, references = accelerator.gather((predictions, references))

        # If we are in a multiprocess environment, the last batch has
        # duplicates.  Trim while the tensors are still one row per sequence,
        # which is the unit len(eval_dataloader.dataset) counts.
        if accelerator.num_processes > 1:
            if step == len(eval_dataloader) - 1:
                keep = len(eval_dataloader.dataset) - samples_seen
                predictions = predictions[:keep]
                references = references[:keep]
            else:
                samples_seen += references.shape[0]

        # Keep only the positions whose label is not the padding value.
        nonpad_mask = references != train_args.padding
        references = references[nonpad_mask]
        predictions = predictions[nonpad_mask]

        for metric in (metric_acc, metric_pre, metric_re, metric_f1):
            metric.add_batch(
                predictions=predictions,
                references=references,
            )
        eval_progress_bar.update(1)
        prediction_lst.extend(predictions.detach().cpu().tolist())
        reference_lst.extend(references.detach().cpu().tolist())

    eval_metric = metric_acc.compute()
    eval_metric_pre = metric_pre.compute()
    eval_metric_re = metric_re.compute()
    eval_metric_f1 = metric_f1.compute()
    mean_eval_loss = float(eval_loss) / len(eval_dataloader)

    logger.info(f"Evaluation at Epoch : {epoch} Total Step : {steps}")
    logger.info(f"Accuracy : {eval_metric['accuracy']} Precision : {eval_metric_pre['precision']}")
    logger.info(f"Recall : {eval_metric_re['recall']} F1 : {eval_metric_f1['f1']}")
    logger.info(f"Epoch : {epoch} Step : {steps}")
    logger.info(f"Eval_loss : {mean_eval_loss}")

    result_log = {
        "eval_accuracy": eval_metric['accuracy'],
        "eval_precision": eval_metric_pre['precision'],
        "eval_recall": eval_metric_re['recall'],
        "eval_f1": eval_metric_f1['f1'],
        "eval_loss": mean_eval_loss,
        "epoch": epoch,
        "step": steps,
    }

    output_result_path = os.path.join(output_dir, f"epoch{epoch}_steps{steps}_results.json")
    with open(output_result_path, "w") as f:
        json.dump(result_log, f)

    if train_args.with_tracking:
        accelerator.log(
            result_log,
            step=steps,
        )

    ## Extra: the same metrics with the label values swapped
    ## (1 becomes 0, every other value becomes 1).
    reversey_pred = np.where(np.array(prediction_lst) == 1, 0, 1)
    reversey_actu = np.where(np.array(reference_lst) == 1, 0, 1)
    rev_accuracy = accuracy_score(reversey_actu, reversey_pred)
    rev_precision = precision_score(reversey_actu, reversey_pred)
    rev_recall = recall_score(reversey_actu, reversey_pred)
    rev_f1 = f1_score(reversey_actu, reversey_pred)

    logger.info(f"rev Evaluation at Epoch : {epoch} Total Step : {steps}")
    logger.info(f"rev_Accuracy : {rev_accuracy} rev_Precision : {rev_precision}")
    logger.info(f"rev_Recall : {rev_recall} rev_F1 : {rev_f1}")
    logger.info(f"Epoch : {epoch} Step : {steps}")
    logger.info(f"Eval_loss : {mean_eval_loss}")

    result_rev_log = {
        "eval_rev_accuracy": rev_accuracy,
        "eval_rev_precision": rev_precision,
        "eval_rev_recall": rev_recall,
        "eval_rev_f1": rev_f1,
        "eval_loss": mean_eval_loss,
        "epoch": epoch,
        "step": steps,
    }

    output_result_path = os.path.join(output_dir, f"epoch{epoch}_steps{steps}_rev_results.json")
    with open(output_result_path, "w") as f:
        json.dump(result_rev_log, f)

    if train_args.with_tracking:
        accelerator.log(
            result_rev_log,
            step=steps,
        )

    return result_log, output_dir


In [None]:
def _evaluate_and_checkpoint(epoch, checkpoint_name):
    """Evaluate, save the accelerator state and prune the worst checkpoint.

    ``checkpoint_name`` is the directory name of the checkpoint; it is placed
    under ``train_args.output_dir`` when that is set.  At most
    ``train_args.save_max_limit`` checkpoints are kept: the one with the
    smallest ``eval_<best_metric>`` value is deleted once the limit is
    exceeded.  The model is put back into training mode before returning.
    """
    output_dir = checkpoint_name
    if train_args.output_dir is not None:
        output_dir = os.path.join(train_args.output_dir, output_dir)
    # eval() writes its JSON results into this directory, so it has to exist
    # even when no base output directory is configured.
    os.makedirs(output_dir, exist_ok=True)

    result_log, model_output_path = eval(model, eval_dataloader, accelerator,
                                         metric_acc, metric_pre, metric_re, metric_f1,
                                         train_args, epoch, completed_steps, output_dir,
                                         logger)
    accelerator.save_state(output_dir)

    best_metric = result_log[f'eval_{train_args.best_metric}']
    logger.info(f"best_metric : {best_metric}")
    # The path precedes the result dict in the tuple: a step checkpoint and
    # the end-of-epoch checkpoint can share both metric and step count, and
    # letting the comparison fall through to the dicts raises TypeError.
    heapq.heappush(model_heap, (best_metric, completed_steps, model_output_path, result_log))

    if len(model_heap) > train_args.save_max_limit:
        _, _, delete_path, _ = heapq.heappop(model_heap)
        logger.info(f"Deleting file for path : {delete_path}")
        shutil.rmtree(pathlib.Path(delete_path))
    model.train()


# Loop-invariant loss module; padded label positions are ignored.
criterion = torch.nn.CrossEntropyLoss(ignore_index=train_args.padding).to(accelerator.device)

for epoch in range(starting_epoch, train_args.num_train_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        logits = model(batch['inputs'], batch['sequence_len'])
        loss = criterion(logits.view(-1, logits.shape[-1]), batch['labels'].view(-1))

        # Always keep the latest loss: it is reported below regardless of
        # whether experiment tracking is enabled.
        cur_loss = loss.detach().float()
        if train_args.with_tracking:
            # We keep track of the loss at each epoch
            total_loss += cur_loss

        loss = loss / train_args.gradient_accumulation_steps
        accelerator.backward(loss)

        # Update after every `gradient_accumulation_steps` micro-batches
        # (hence step + 1) and on the final, possibly shorter, group.
        is_last_batch = step == len(train_dataloader) - 1
        if (step + 1) % train_args.gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

            # Periodic reporting / checkpointing only right after an update,
            # so each completed step is handled exactly once.
            if completed_steps % train_args.train_loss_steps == 0:
                logger.info(f"Train loss {cur_loss} at current step  {completed_steps}")
                train_loss_log = {
                    "train_loss": cur_loss,
                    "step": completed_steps,
                }
                if train_args.with_tracking:
                    accelerator.log(
                        train_loss_log,
                        step=completed_steps,
                    )

            if isinstance(checkpointing_steps, int) and completed_steps % checkpointing_steps == 0:
                _evaluate_and_checkpoint(epoch, f"step_{completed_steps}")

        if completed_steps >= train_args.max_train_steps:
            break

    # End-of-epoch evaluation and checkpoint.
    _evaluate_and_checkpoint(epoch, f"epoch_{epoch}_step_{completed_steps}")

if train_args.with_tracking:
    accelerator.end_training()


In [None]:
pprint(vars(train_args))

In [None]:
train_data = utils.open_json(train_file)
eval_data = utils.open_json(eval_file)

In [None]:
max_length = 200
shuffle = False

In [None]:
train_dataset = BinaryCustomDatasetShuffle(train_data, tokenizer = tokenizer, \
                                           max_length = max_length, shuffle = shuffle)

In [None]:
eval_dataset = BinaryCustomDatasetShuffle(eval_data, tokenizer = tokenizer, \
                                           max_length = max_length, shuffle = shuffle)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

In [None]:
batch_size = 8

In [None]:
train_dataloader = DataLoader(train_dataset,
                              shuffle = True,
                              collate_fn=data_collator,
                              batch_size=batch_size,
)

In [None]:
eval_dataloader = DataLoader(eval_dataset,
                              shuffle = True,
                              collate_fn=data_collator,
                              batch_size=batch_size,
)

In [None]:
no_decay = ["bias", "LayerNorm.weight"]

In [None]:
weight_decay = 0.0

In [None]:
optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

In [None]:
optimizer_grouped_parameters[0]["weight_decay"]

In [None]:
learning_rate=5e-5

In [None]:
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [None]:
lr_scheduler_type='linear'
num_warmup_steps = 0
# max_train_steps = 
num_train_epochs = 5
gradient_accumulation_steps = 1

In [None]:
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)

In [None]:
max_train_steps = num_train_epochs * num_update_steps_per_epoch

In [None]:
max_train_steps

In [None]:
lr_scheduler = get_scheduler(
        name=lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=max_train_steps,
)

In [None]:
# Accelerator.prepare returns the prepared objects; they have to be rebound,
# otherwise the loops below keep using the unprepared originals.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

In [None]:
metric = evaluate.load("accuracy")

In [None]:
per_device_train_batch_size = 8

In [None]:
total_batch_size = per_device_train_batch_size * accelerator.num_processes * gradient_accumulation_steps

In [None]:
total_batch_size

In [None]:
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)

In [None]:
num_train_epochs

In [None]:
starting_epoch = 0
with_tracking = True

In [None]:
checkpointing_steps = 50

In [None]:
# Train/evaluate loop for the hand-configured run set up in the cells above.
# It uses the plain variables defined there (gradient_accumulation_steps,
# max_train_steps, with_tracking, checkpointing_steps); no `args` object
# exists at this point in the notebook.

# Start counting from zero so the counter matches the freshly created
# progress bar and the max_train_steps computed for this run.
completed_steps = 0

for epoch in range(starting_epoch, num_train_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss

        if with_tracking:
            total_loss += loss.detach().float()

        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)

        # Update after every `gradient_accumulation_steps` micro-batches
        # (hence step + 1) and on the final, possibly shorter, group.
        is_last_batch = step == len(train_dataloader) - 1
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

            # Save right after an update so every step is saved at most once.
            if isinstance(checkpointing_steps, int) and completed_steps % checkpointing_steps == 0:
                output_dir = f"step_{completed_steps}"
                if train_args.output_dir is not None:
                    output_dir = os.path.join(train_args.output_dir, output_dir)
                accelerator.save_state(output_dir)

        if completed_steps >= max_train_steps:
            break

    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = accelerator.gather((predictions, batch["labels"]))

        # If we are in a multiprocess environment, the last batch has duplicates
        if accelerator.num_processes > 1:
            if step == len(eval_dataloader) - 1:
                predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
                references = references[: len(eval_dataloader.dataset) - samples_seen]
            else:
                samples_seen += references.shape[0]

        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    # Compute once per epoch, after every eval batch has been added
    # (compute() consumes the accumulated batches).
    eval_metric = metric.compute()
    logger.info(f"epoch {epoch}: {eval_metric}")

    if with_tracking:
        accelerator.log(
            {
                "accuracy" : eval_metric,
                "train_loss": total_loss.item() / len(train_dataloader),
                "epoch": epoch,
                "step": completed_steps,
            },
            step=completed_steps,
        )


In [None]:
metric_acc = evaluate.load("accuracy")
metric_pre = evaluate.load('precision')
metric_re = evaluate.load('recall')
metric_f1 = evaluate.load('f1')

In [None]:
metric_acc

In [None]:
accelerator.num_processes

In [None]:
# TrainingArguments is not among the notebook's top-level imports, so bring
# it into scope here before it is handed to the parser.
from transformers import TrainingArguments

# Parse a fixed argument list (instead of sys.argv) into the three dataclasses.
parser = HfArgumentParser(
    (ModelArguments, DataTrainingArguments, TrainingArguments)
)
args = ["--model_name_or_path", 'allenai/longformer-large-4096', '--output_dir', './']
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args)


In [None]:
type(model_args)

In [None]:
vars(training_args)

In [None]:
# Standard-library logger for the Trainer-based section; records are written
# to stdout.
logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Use the per-process log level from the training arguments for both this
# logger and the transformers library logger.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process the small summary:
# (the ", " separator keeps "n_gpu" and "distributed training" from running
# together in the emitted line)
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

In [None]:
# Detecting last checkpoint.
# get_last_checkpoint is not among the notebook's top-level imports.
from transformers.trainer_utils import get_last_checkpoint

# Only look for a checkpoint when training into an existing output directory
# that the user did not ask to overwrite.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
        # Non-empty directory without a checkpoint: refuse to write into it.
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )
    elif last_checkpoint is not None:
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )


In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)

In [None]:
config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=model_args.num_labels,
    )
tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    )
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    config=config,
)

In [None]:
# NOTE(review): `preprocessing_data` and `CustomDataset` are not defined or
# imported anywhere in the visible notebook -- confirm where they come from.
if training_args.do_train:
    instances, cut_off, total_questions = preprocessing_data(
        data_args.train_file,
        data_args.sample_size,
        data_args.position)

    # The first `dev_size` instances are held out as the dev split; the rest
    # is used for training.
    train_instance = instances[data_args.dev_size:]
    dev_instance = instances[:data_args.dev_size]

    train_dataset = CustomDataset(train_instance,
                                  tokenizer,
                                  model_args.max_seq_length)
    # Built from the held-out slice (previously this reused train_instance,
    # making the dev set a copy of the training set).
    dev_dataset = CustomDataset(dev_instance,
                                tokenizer,
                                model_args.max_seq_length)

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

if training_args.do_eval:
    instances, cut_off, total_questions = preprocessing_data(
        data_args.test_file,
        data_args.sample_size,
        data_args.position)

    test_dataset = CustomDataset(instances,
                                 tokenizer,
                                 model_args.max_seq_length)
    
    


In [None]:
# Get the metric function
metric = evaluate.load("xnli")

def compute_metrics(p: EvalPrediction):
    """Score the arg-max predictions of ``p`` with the module-level ``metric``.

    When ``p.predictions`` is a tuple only its first element is used.  The
    class index is the arg-max over axis 1, and the result of
    ``metric.compute`` against ``p.label_ids`` is returned.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    predicted_classes = np.argmax(raw_predictions, axis=1)
    return metric.compute(predictions=predicted_classes, references=p.label_ids)


In [None]:
 # Initialize Trainer
# Trainer and EarlyStoppingCallback are not among the notebook's top-level
# imports, so bring them into scope before they are used.
from transformers import EarlyStoppingCallback, Trainer

# Batches are padded to a multiple of 8 tokens.
data_collator = DataCollatorWithPadding(
    tokenizer,
    pad_to_multiple_of=8,
)

# NOTE(review): `eval_dataset` is not created by the dataset cell of this
# Trainer section (which builds dev_dataset / test_dataset); it resolves to
# whatever an earlier cell left in that name -- confirm the intended split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_train else None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=30)]
)

# Training
if training_args.do_train:
    # An explicitly requested checkpoint takes precedence over one detected
    # in the output directory.
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    max_train_samples = (
        data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
    )
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.save_model()  # Saves the tokenizer too for easy upload

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

# Evaluation
if training_args.do_eval:
    logger.info("*** Evaluate ***")
    metrics = trainer.evaluate(eval_dataset=eval_dataset)

    max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
    metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

In [None]:
metric

In [None]:
data_args.dataset_name = a
    

In [None]:
data.max_seq_length

In [None]:
training_args.fp16

In [None]:
bb

In [None]:
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    
    