## Import Modules

In [1]:
from mica_text_coref.coref.movie_coref.data import (CharacterRecognitionDataset, 
                                                    CorefCorpus)

import accelerate
from accelerate import logging
import collections
import contextlib
import logging as pylogging
import numpy as np
import os
import sys
import time
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer, AdamW
from transformers import (get_linear_schedule_with_warmup, AutoTokenizer,
                          AutoModel)

### Metric

In [2]:
class Metric:
    """General Metric class"""

    @property
    def score(self) -> float:
        """Main metric score used for comparison"""
        raise NotImplementedError

### Character Recognition Model

In [15]:
from torch import logit


class CharacterRecognition(nn.Module):
    """Character Recognition Model.
    """

    def __init__(self, 
                 encoder_name: str,
                 num_parse_tags: int,
                 parse_tag_embedding_size: int,
                 gru_hidden_size: int,
                 gru_num_layers: int,
                 gru_dropout: float,
                 gru_bidirectional: bool,
                 num_labels: int) -> None:
        """Initializer for Character Recognition Model.

        Args:
            encoder_name: Language model encoder name from transformers hub
                e.g. bert-base-cased
            num_parse_tags: Parse tag set size
            parse_tag_embedding_size: Embedding size of the parse tags
            gru_hidden_size: Hidden size of the GRU
            gru_num_layers: Number of layers of the GRU
            gru_dropout: Dropout used between the GPU layers
            gru_bidirectional: If true, the GRU is bidirectional
            num_labels: Number of labels in the label set. 2 if label_type =
                "head" or 3 if label_type = "span"
        """
        super().__init__()
        self.num_labels = num_labels
        
        self.encoder = AutoModel.from_pretrained(
            encoder_name, add_pooling_layer=False)
        self.encoder_hidden_size = self.encoder.config.hidden_size
        self.subtoken = nn.Linear(self.encoder_hidden_size, 1)
        self.parse_embedding = nn.Embedding(
            num_parse_tags, parse_tag_embedding_size)
        self.gru_input_size = (self.encoder_hidden_size +
                               parse_tag_embedding_size)
        self.gru_output_size = gru_hidden_size * (1 + int(gru_bidirectional))
        self.gru = nn.GRU(self.gru_input_size, gru_hidden_size,
                          num_layers=gru_num_layers, batch_first=True,
                          dropout=gru_dropout, bidirectional=gru_bidirectional)
        self.output = nn.Linear(self.gru_output_size, num_labels)
        self._device = "cpu"
    
    @property
    def device(self) -> torch.device:
        """Getter for model device."""
        return self._device
    
    @device.setter
    def device(self, device):
        """Setter for model device. Used by accelerate."""
        self._device = device
    
    def forward(self, subtoken_ids: torch.Tensor, attention_mask: torch.Tensor,
                token_offset: torch.Tensor, parse_ids: torch.Tensor,
                labels: torch.Tensor) -> torch.Tensor:
        """Forward propagation for the Character Recognition Model.

        Args:
            subtoken_ids: `batch_size x max_n_subtokens` Long Tensor
            attention_mask: `batch_size x max_n_subtokens` Float/Long Tensor
            token_offset: `batch_size x max_n_tokens x 2` Long Tensor
            parse_ids: `batch_size x max_n_tokens` Long Tensor
            labels: `batch_size x max_n_tokens` Long Tensor
        
        Returns:
            Return the loss value if model is begin trained, else the logits 
            `batch_size x max_n_tokens x num_labels` Float Tensor
        """
        batch_size = len(subtoken_ids)

        # subtoken_embedding = batch_size x max_n_subtokens x encoder_hidden_size
        encoder_output = self.encoder(subtoken_ids, attention_mask)
        subtoken_embedding = encoder_output.last_hidden_state

        # _subtoken_embedding = batch_size * max_n_subtokens x encoder_hidden_size
        # subtoken_attn = batch_size * max_n_tokens x batch_size * max_n_subtokens
        _subtoken_embedding = subtoken_embedding.view(-1, self.encoder_hidden_size)
        subtoken_attn = self._attn_scores(_subtoken_embedding,
                                          token_offset.view(-1, 2))
        
        # token_embedding = batch_size x max_n_tokens x encoder_hidden_size
        token_embedding = torch.mm(
            subtoken_attn, _subtoken_embedding).reshape(
                batch_size, -1, self.encoder_hidden_size)
        
        # gru_input = batch_size x max_n_tokens x (encoder_hidden_size +
        # parse_tag_embedding_size)
        parse_input = self.parse_embedding(parse_ids)
        gru_input = torch.cat((token_embedding, parse_input), dim=2).contiguous()

        # logits = batch_size x max_n_tokens x num_labels
        gru_output, _ = self.gru(gru_input)
        logits = self.output(gru_output)

        # TODO = remove this
        nz = (logits[:,:,1] > logits[:,:,0]).sum().item()
        print(f"Inside Model: logits sum = {nz}")

        if self.training:
            token_attention_mask = torch.any(subtoken_attn > 0, dim=1).reshape(
                batch_size, -1)
            loss = compute_loss(logits, labels, token_attention_mask,
                                self.num_labels)
            return loss
        else:
            return logits

    def _attn_scores(self,
                     subtoken_embeddings: torch.FloatTensor,
                     token_offset: torch.LongTensor) -> torch.FloatTensor:
        """ Calculates attention scores for each of the subtokens of a token.

        Args:
            subtoken_embedding: `n_subtokens x embedding_size` Float Tensor,
                embeddings for each subtoken
            token_offset: `n_tokens x 2` Long Tensor, subtoken offset of each
                token

        Returns:
            torch.FloatTensor: `n_tokens x n_subtokens` Float Tensor, attention
            weights for each subtoken of a token
        """
        n_subtokens, n_tokens = len(subtoken_embeddings), len(token_offset)
        token_begin, token_end = token_offset[:,0], token_offset[:,1]
        
        # attn_mask: n_tokens x n_subtokens, contains -∞ for subtokens outside
        # the token's offsets and 0 for subtokens inside the token's offsets
        attn_mask = torch.arange(0, n_subtokens, device=self.device).expand(
            (n_tokens, n_subtokens))
        attn_mask = ((attn_mask >= token_begin.unsqueeze(1)) * 
                     (attn_mask <= token_end.unsqueeze(1)))
        attn_mask = torch.log(attn_mask.to(torch.float))

        # attn_scores: 1 x n_subtokens
        attn_scores = self.subtoken(subtoken_embeddings).T

        # attn_scores: n_tokens x n_subtokens
        attn_scores = attn_scores.expand((n_tokens, n_subtokens))

        # -∞ for subtokens outside the token's offsets and attn_scores for
        # inside the token's offsets
        attn_scores = attn_mask + attn_scores
        del attn_mask

        # subtoken_attn contains 0 for subtokens outside the token's offsets
        subtoken_attn = torch.softmax(attn_scores, dim=1)
        return subtoken_attn
    
def compute_loss(
    logits: torch.FloatTensor, label_ids: torch.LongTensor,
    attn_mask: torch.FloatTensor, n_labels: int) -> torch.FloatTensor:
    """Compute cross entropy loss"""
    active_labels = label_ids[attn_mask == 1.]
    active_logits = logits.flatten(0, 1)[attn_mask.flatten() == 1.]
    label_distribution = torch.bincount(active_labels, minlength=n_labels)
    class_weight = len(active_labels)/(1 + label_distribution)
    print(class_weight)
    cross_entrop_loss_fn = nn.CrossEntropyLoss(
        weight=class_weight, reduction="mean")
    loss = cross_entrop_loss_fn(active_logits, active_labels)
    return loss

### Trainer

In [10]:
class Trainer:
    """General trainer class that uses huggingface """

    def __init__(self,
                accelerator: accelerate.Accelerator,
                logger: logging.MultiProcessAdapter,
                model: nn.Module,
                train_dataloader: DataLoader,
                dev_dataloader: DataLoader,
                optimizer: Optimizer,
                use_scheduler: bool = False,
                warmup_ratio: float = None,
                warmup_steps: int = None,
                max_epochs: int = 1,
                max_grad_norm: float = None,
                patience: int = 1,
                log_batch_frequency: int = 1,
                evaluate_train: bool = False,
                save_model: bool = False,
                save_tensors: bool = False,
                save_tensors_name: list[str] = None,
                save_dir: str = None
                ) -> None:
        """Initializer for the general trainer class that uses accelerate to
        train your model.

        Args:
            accelerator: Instance of the Accelerator class.
            logger: Instance of the Accelerator logger used for distributed
                logging.
            model: Torch nn.Module subclass to train.
            train_dataloader: Train set dataloader.
            dev_dataloader: Dev set dataloader.
            optimizer: Optimizer.
            use_scheduler: Whether to use scheduler.
            warmup_ratio: Ratio of training steps to use in the scheduler's
                warmup. warmup_steps has to be None if you want to use this
                parameter.
            warmup_steps: Number of steps to use in the scheduler's warmup.
                This parameter supercedes warmup_ratio.
            max_epochs: Maximum number of epochs to train for.
            max_grad_norm: Maximum norm of gradient to be used in gradient
                clipping. If None, gradient clipping is not done.
            patience: Maximum number of epochs to wait for development set
                performance to improve before early-stopping.
            log_batch_frequency: Training loop logs training loss and timing
                information after every log_batch_frequency batches.
            evaluate_train: Whether to evaluate on the training set.
            save_model: Whether to save model after every epoch.
            save_tensors: Whether to save the tensors of development set, along
                with the logits.
            save_tensors_name: List of tensor names which are to be saved. If
                none, all tensors are saved.
            save_dir: Directory to which the model weights, ground truth, and
                predictions will be saved.
        """
        self.accelerator = accelerator
        self.logger = logger
        self.model = model
        self.train_dataloader = train_dataloader
        self.dev_dataloader = dev_dataloader
        self.optimizer = optimizer
        self.use_scheduler = use_scheduler
        self.warmup_ratio = warmup_ratio
        self.warmup_steps = warmup_steps
        self.max_epochs = max_epochs
        self.max_grad_norm = max_grad_norm
        self.patience = patience
        self.log_batch_frequency = log_batch_frequency
        self.evaluate_train = evaluate_train
        self.save_model = save_model
        self.save_tensors = save_tensors
        self.save_tensors_name = save_tensors_name
        self.save_dir = save_dir

        if self.use_scheduler:
            assert (self.warmup_ratio is not None or 
                    self.warmup_steps is not None), (
                    "Set warmup_ratio or warmup_steps "
                    "if you are using scheduler")
        
        if self.save_model or self.save_predictions:
            assert self.save_dir is not None, (
                "Set save_dir if you are saving model and/or predictions")
        
        self.n_training_samples = len(self.train_dataloader.dataset)
        self.n_dev_samples = len(self.dev_dataloader.dataset)
        self.model.eval()
        self.model.device = self.accelerator.device
    
    def log(self, message):
        """Logging or printing"""
        self.accelerator.print(message)
    
    @contextlib.contextmanager
    def _timer(self, message):
        """Context manager for timing a codeblock"""
        start_time = time.time()
        self.log(f"Starting {message}")
        yield
        time_taken = time.time() - start_time
        time_taken_str = self._convert_float_seconds_to_time_string(time_taken)
        self.log(f"{message} done, time taken = {time_taken_str}")

    def _convert_float_seconds_to_time_string(self, seconds: float) -> str:
        """Convert seconds to h m s format"""
        seconds = int(seconds)
        minutes, seconds = seconds//60, seconds%60
        hours, minutes = minutes//60, minutes%60
        return f"{hours}h {minutes}m {seconds}s"
    
    def _save_model(self, model: nn.Module, directory: str):
        """Save model's weights to directory with filename `model.pt`.

        Args:
            model: Torch nn.Module.
            directory: Directory where model's weights will be saved.
        """
        self.accelerator.save(
            model.state_dict(), os.path.join(directory, "model.pt"))

    def _save_tensors(self, directory: str, **tensors):
        """Save the tensors returned from inference to directory.

        Args:
            directory: Directory where the tensors will be saved.
            tensors: Dictionary of tensor name to tensor.
        """
        for name, pt in tensors.items():
            if self.save_tensors_name is None or name in self.save_tensors_name:
                self.accelerator.save(pt, os.path.join(directory, f"{name}.pt"))
    
    def run(self):
        best_dev_score = None
        best_epoch = None
        epochs_left = self.patience
        save = self.save_model or self.save_tensors
        
        # Accelerate model, dataloaders, and optimizer
        (self.model, self.train_dataloader, self.dev_dataloader,
         self.optimizer) = (self.accelerator.prepare(
            self.model, self.train_dataloader, self.dev_dataloader, 
            self.optimizer))

        # Log number of training and inference batches, 
        # and number of training steps
        n_train_batches = len(self.train_dataloader)
        n_dev_batches = len(self.dev_dataloader)
        effective_train_batch_size = round(
            self.n_training_samples/n_train_batches)
        effective_dev_batch_size = round(self.n_dev_samples/n_dev_batches)
        n_training_steps = self.max_epochs * n_train_batches
        self.log("Effective train batch size = "
                        f"{effective_train_batch_size}")
        self.log("Effective dev batch size = "
                        f"{effective_dev_batch_size}")
        self.log(f"Number of training batches = {n_train_batches}")
        self.log(f"Number of inference batches = {n_dev_batches}")
        self.log(f"Number of training steps = {n_training_steps}")

        # Initialize and accelerate scheduler
        if self.use_scheduler:
            n_warmup_steps = self.warmup_steps if (
                self.warmup_steps is not None) else (
                    int(self.warmup_ratio * n_training_steps))
            scheduler = get_linear_schedule_with_warmup(
                self.optimizer, num_warmup_steps=n_warmup_steps,
                num_training_steps=n_training_steps)
            scheduler = self.accelerator.prepare_scheduler(scheduler)
            self.log(f"Number of warmup steps = {n_warmup_steps}")
        
        # Training and evaluation loop
        with self._timer("training"):
            for epoch in range(self.max_epochs):
                
                if save:
                    # Create epoch directories
                    epoch_dir = os.path.join(
                        self.save_dir, f"epoch_{epoch + 1}")
                    epoch_dev_dir = os.path.join(epoch_dir, "dev")
                    os.makedirs(epoch_dir, exist_ok=True)
                    os.makedirs(epoch_dev_dir, exist_ok=True)

                # Training for one epoch
                with self._timer(f"epoch {epoch + 1} training"):
                    self.model.train()
                    running_batch_loss = []
                    running_batch_train_time = []
                    
                    # Batch training loop
                    for i, batch in enumerate(self.train_dataloader):
                        batch_start_time = time.time()
                        
                        # One training step
                        with self.accelerator.accumulate(self.model):
                            self.optimizer.zero_grad()
                            with self.accelerator.autocast():
                                batch_loss = self.model(**batch)
                            self.accelerator.backward(batch_loss)
                            if self.optimizer.gradient_state.sync_gradients and self.max_grad_norm is not None:
                                self.accelerator.clip_grad_norm_(
                                    self.model.parameters(), self.max_grad_norm)
                            self.optimizer.step()
                            if self.use_scheduler and (
                            not self.accelerator.optimizer_step_was_skipped):
                                scheduler.step()

                        batch_time_taken = time.time() - batch_start_time
                        running_batch_loss.append(batch_loss.detach().item())
                        running_batch_train_time.append(batch_time_taken)

                        # Log after log_batch_frequency batches
                        if (i + 1) % self.log_batch_frequency == 0:
                            average_batch_loss = np.mean(running_batch_loss)
                            average_batch_train_time = np.mean(
                                running_batch_train_time)
                            estimated_time_remaining = (
                                self._convert_float_seconds_to_time_string(
                                average_batch_train_time * (
                                    n_train_batches-i-1)))
                            average_batch_train_time_str = (
                                self._convert_float_seconds_to_time_string(
                                average_batch_train_time))
                            self.log(f"Batch {i + 1}")
                            self.log(
                                "Average training loss @ batch = "
                                f"{average_batch_loss:.4f}")
                            self.log(
                                "Average training time taken @ batch = "
                                f"{average_batch_train_time_str}")
                            self.log(
                                "Estimated training time remaining for epoch "
                                f"{epoch + 1} = {estimated_time_remaining}")
                            running_batch_loss = []
                            running_batch_train_time = []

                # Wait for all process to complete
                self.accelerator.wait_for_everyone()

                # Save model
                if self.save_model:
                    self.log(f"Saving model after epoch {epoch + 1}")
                    unwrapped_model = self.accelerator.unwrap_model(self.model)
                    self._save_model(unwrapped_model, epoch_dir)

                # Inference and evaluation on training set
                if self.evaluate_train:
                    with self._timer(
                        f"epoch {epoch + 1} training inference and evaluation"):
                        train_inference_output = self._infer(
                            self.train_dataloader, self.model)
                        train_metric = self.evaluate(**train_inference_output)
                        self.log(
                            f"Training Performance = {train_metric.score}")
                    self.accelerator.wait_for_everyone()
            
                # Inference and evaluation on dev set
                with self._timer(
                    f"epoch {epoch + 1} dev inference and evaluation"):
                    dev_inference_output = self._infer(
                        self.dev_dataloader, self.model)
                    dev_metric = self.evaluate(**dev_inference_output)
                    self.log(f"Dev Performance = {dev_metric.score}")
                self.accelerator.wait_for_everyone()
                if self.save_tensors:
                    self.log(
                        f"Saving dev tensors after epoch {epoch + 1}")
                    self._save_tensors(epoch_dev_dir, **dev_inference_output)

                # Early-stopping
                self.log("Checking for early-stopping")
                dev_score = dev_metric.score
                if best_dev_score is None or dev_score > best_dev_score:
                    epochs_left = self.patience
                    best_epoch = epoch + 1
                    if best_dev_score is not None:
                        delta = 100 * (dev_score - best_dev_score)
                        self.log(f"Dev score improved by {delta:.1f}")
                    best_dev_score = dev_score
                else:
                    epochs_left -= 1
                    delta = 100 * (best_dev_score - dev_score)
                    self.log(
                        f"Dev score is {delta:.1f} lower than best Dev score "
                        f"({100*best_dev_score:.1f})")
                    self.log(
                        f"{epochs_left} epochs left until Dev score to improve to"
                        " avoid early-stopping!")
                if epochs_left == 0:
                    self.log("Early stopping!")
                    break

                self.log(f"Epoch {epoch + 1} done")

        self.log(f"Best Dev score = {100*best_dev_score:.1f}")
        self.log(f"Best epoch = {best_epoch}")
    
    def _infer(self, dataloader: DataLoader, model: nn.Module) -> (
        dict[str, torch.Tensor]):
        """Run inference on the dataloader.
        Args:
            dataloader: PyTorch dataloader.
            model: PyTorch module.
        
        Returns:
            Labels and predictions tensors.
        """
        # Initialize variables
        model.eval()
        tensors: dict[str, list[torch.Tensor]] = collections.defaultdict(list)
        n_batches = len(dataloader)
        self.log(f"Number of inference batches = {n_batches}")

        # Inference Loop
        with self._timer("inference"), torch.no_grad():
            running_batch_times = []
            for i, batch in enumerate(dataloader):

                # One inference step
                start_time = time.time()
                batch_logits = model(**batch)
                batch_logits = self.accelerator.gather_for_metrics(batch_logits)
                batch = self.accelerator.gather_for_metrics(batch)
                batch["logits"] = batch_logits
                nz = (batch_logits[:,:,1] > batch_logits[:,:,0]).sum().item()
                self.log(f"Outside Model: logits sum = {nz}")
                for name, tensor in batch.items():
                    tensors[name].append(tensor)
                time_taken = time.time() - start_time
                running_batch_times.append(time_taken)

                # Log after log_batch_frequency batches
                if (i + 1) % self.log_batch_frequency == 0:
                    average_time_per_batch = np.mean(running_batch_times)
                    estimated_time_remaining = (n_batches - i - 1) * (
                                                average_time_per_batch)
                    average_time_per_batch_str = (
                        self._convert_float_seconds_to_time_string(
                            average_time_per_batch))
                    estimated_time_remaining_str = (
                        self._convert_float_seconds_to_time_string(
                            estimated_time_remaining))
                    running_batch_times = []

                    self.log(f"Batch {i + 1}")
                    self.log("Average inference time @ batch = "
                                f"{average_time_per_batch_str}")
                    self.log("Estimated inference time remaining = "
                                f"{estimated_time_remaining_str}")

        # Concat tensors
        output: dict[str, torch.Tensor] = {}
        for name, tensor_list in tensors.items():
            output[name] = torch.cat(tensor_list, dim=0)
        return output
    
    def evaluate(self, **tensors) -> Metric:
        """Evaluate the output of inference"""
        raise NotImplementedError

### Character Recognition - Metric & Trainer

In [11]:
class CharacterRecognitionMetric(Metric):

    def __init__(self, precision, recall) -> None:
        super().__init__()
        self.precision = precision
        self.recall = recall
    
    @property
    def score(self) -> float:
        return 2*self.precision*self.recall/(self.precision+self.recall+1e-23)

class CharacterRecognitionTrainer(Trainer):

    def evaluate(self, **tensors) -> Metric:
        logits, labels, offset = (
            tensors["logits"], tensors["labels"], tensors["token_offset"])
        pred = logits.argmax(dim=2)
        mask = ~((offset[:,:,0] == 0) & (offset[:,:,1] == 0))
        pred = pred[mask]
        labels = labels[mask]
        tp = ((labels == pred) & (pred != 0)).sum().item()
        fp = ((labels != pred) & (pred != 0)).sum().item()
        fn = ((labels != pred) & (labels != 0)).sum().item()
        precision = tp/(tp + fp + 1e-23)
        recall = tp/(tp + fn + 1e-23)
        return CharacterRecognitionMetric(precision, recall)

### Training w/ Accelerate

In [16]:
def training_function():
    train_batch_size = 16
    infer_batch_size = 16
    parse_tag_embedding_size = 32
    encoder_name = "roberta-base"
    gru_hidden_size = 768
    gru_num_layers = 2
    gru_dropout = 0.2
    gru_bidirectional = True
    lr = 1e-5
    weight_decay = 1e-3
    use_scheduler = True
    warmup_ratio = 0.1
    warmup_steps = None
    max_epochs = 5
    max_grad_norm = 0.1
    patience = 3
    log_batch_frequency = 5
    evaluate_train = False
    save_model = True
    save_tensors = True
    # save_tensors_name = ["logits", "labels"]
    save_tensors_name = None
    save_dir = ("/home/sbaruah_usc_edu/mica_text_coref/data/movie_coref/results/"
                "character_recognition/")

    accelerator = accelerate.Accelerator(mixed_precision="fp16")
    logger = logging.get_logger("")

    corpus = CorefCorpus("/home/sbaruah_usc_edu/mica_text_coref/data/"
                        "movie_coref/results/regular/movie.jsonlines")
    roberta_tokenizer = AutoTokenizer.from_pretrained(encoder_name, use_fast=True)
    dataset = CharacterRecognitionDataset(
        corpus, roberta_tokenizer, seq_length=256, obey_scene_boundaries=False)

    train_dataloader = DataLoader(
        dataset, batch_size=train_batch_size, shuffle=True)
    dev_dataloader = DataLoader(
        dataset, batch_size=infer_batch_size)

    model = CharacterRecognition(encoder_name=encoder_name,
                    num_parse_tags=len(dataset.parse_tag_to_id),
                    parse_tag_embedding_size=parse_tag_embedding_size,
                    gru_hidden_size=gru_hidden_size,
                    gru_num_layers=gru_num_layers,
                    gru_dropout=gru_dropout,
                    gru_bidirectional=gru_bidirectional,
                    num_labels=2)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    trainer = CharacterRecognitionTrainer(
        accelerator,
        logger,
        model,
        train_dataloader,
        dev_dataloader,
        optimizer,
        use_scheduler = use_scheduler,
        warmup_ratio = warmup_ratio,
        warmup_steps = warmup_steps,
        max_epochs = max_epochs,
        max_grad_norm = max_grad_norm,
        patience = patience,
        log_batch_frequency = log_batch_frequency,
        evaluate_train = evaluate_train,
        save_model = save_model,
        save_tensors = save_tensors,
        save_tensors_name = save_tensors_name,
        save_dir = save_dir)
    
    trainer.run()

### Launch training

In [17]:
accelerate.notebook_launcher(
    training_function, num_processes=4)

Launching training on 4 GPUs.


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense

Effective train batch size = 61
Effective dev batch size = 61
Number of training batches = 13
Number of inference batches = 13
Number of training steps = 65
Number of warmup steps = 6
Starting training
Starting epoch 1 training
Inside Model: logits sum = 2632
Inside Model: logits sum = 1680
tensor([1.1784, 6.5852], device='cuda:2')
tensor([1.1384, 8.1920], device='cuda:3')
Inside Model: logits sum = 2633
tensor([1.1419, 8.0157], device='cuda:1')
Inside Model: logits sum = 1927
tensor([1.1337, 8.4454], device='cuda:0')
Inside Model: logits sum = 1680Inside Model: logits sum = 2498

Inside Model: logits sum = 1893
tensor([1.1554, 7.4069], device='cuda:3')tensor([1.1686, 6.9073], device='cuda:1')

tensor([1.1496, 7.6561], device='cuda:0')
Inside Model: logits sum = 2195
tensor([1.1343, 8.4107], device='cuda:2')
Inside Model: logits sum = 1733
Inside Model: logits sum = 1361
tensor([1.1397, 8.1270], device='cuda:2')
tensor([1.1525, 7.5294], device='cuda:0')
Inside Model: logits sum = 1749I