In [24]:
import math
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn import CrossEntropyLoss, MSELoss

from transformers import (
    MBart50TokenizerFast,
    AdamW
)
from transformers.modeling_outputs import (
    BaseModelOutput,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput
)
from transformers.modeling_utils import PreTrainedModel, unwrap_model
from transformers.models.mbart.configuration_mbart import MBartConfig
from transformers.models.mbart.modeling_mbart import (
    MBartPreTrainedModel,
    MBartDecoder,
    MBartLearnedPositionalEmbedding,
    MBartEncoderLayer,
    shift_tokens_right,
)


In [None]:
class MultimodalMBartEncoder(MBartPreTrainedModel):
    """Encoder stack made of ``config.encoder_layers`` ``MBartEncoderLayer`` blocks.

    Token embeddings (optionally scaled by ``sqrt(d_model)``) are summed with
    learned positional embeddings, layer-normalised, passed through dropout and
    the encoder layers, and finally through a closing ``LayerNorm``.

    Args:
        config: Model configuration. Read here: ``dropout``,
            ``encoder_layerdrop``, ``d_model``, ``pad_token_id``,
            ``max_position_embeddings``, ``scale_embedding``, ``vocab_size``
            and ``encoder_layers``.
        embed_tokens: Optional pre-built token embedding to reuse (e.g. one
            shared with another module). When ``None`` a fresh
            ``nn.Embedding`` is created.
    """

    def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

        self.embed_positions = MBartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )
        self.layers = nn.ModuleList([MBartEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(embed_dim)
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.init_weights()
        self.gradient_checkpointing = False

    @staticmethod
    def _expand_attention_mask(mask, dtype):
        """Turn a 2-D keep-mask into a 4-D additive mask.

        Assumes ``mask`` is ``(batch, src_len)`` with non-zero = attend and
        0 = ignore (the usual Hugging Face convention) -- TODO confirm against
        the caller. Returns ``(batch, 1, src_len, src_len)`` holding 0 where
        attention is allowed and the most negative value of ``dtype`` elsewhere.
        """
        batch_size, src_len = mask.size()
        keep = mask[:, None, None, :].expand(batch_size, 1, src_len, src_len).to(dtype)
        blocked = 1.0 - keep
        return blocked.masked_fill(blocked.to(torch.bool), torch.finfo(dtype).min)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        # acoustic_input=None,  # planned: acoustic features (not wired in yet)
        # visual_input=None,    # planned: visual features (not wired in yet)
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ) -> Union[Tuple, BaseModelOutput]:
        """Encode a batch of token ids (or pre-computed embeddings).

        Args:
            input_ids: Token ids; mutually exclusive with ``inputs_embeds``.
            attention_mask: Optional 2-D mask, expanded by
                ``_expand_attention_mask`` before reaching the layers.
            head_mask: Optional per-layer head mask; its first dimension must
                equal the number of encoder layers.
            inputs_embeds: Pre-computed input embeddings; mutually exclusive
                with ``input_ids``.
            output_attentions: Whether to return the attention tensors of all
                layers. Defaults to ``config.output_attentions``.
            output_hidden_states: Whether to return the hidden states of all
                layers. Defaults to ``config.output_hidden_states``.
            return_dict: Whether to return a ``BaseModelOutput`` instead of a
                plain tuple. Defaults to ``config.use_return_dict``.

        Returns:
            ``BaseModelOutput`` when ``return_dict`` is true, otherwise a tuple
            of the non-``None`` values among
            ``(last_hidden_state, hidden_states, attentions)``.

        Raises:
            ValueError: if both or neither of ``input_ids`` / ``inputs_embeds``
                are given, or if ``head_mask`` does not cover every layer.
        """
        # Fall back to the configuration for any flag the caller left unset.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            # The positional lookup receives the ids as passed in; the token
            # lookup receives them flattened to (-1, seq_len).
            position_source = input_ids
            input_ids = input_ids.view(-1, input_ids.shape[-1])
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        else:
            # Dropping the feature axis yields a (batch, seq_len) tensor for the
            # positional lookup. Presumably only its shape matters there --
            # verify against MBartLearnedPositionalEmbedding.
            position_source = inputs_embeds[:, :, -1]

        embed_pos = self.embed_positions(position_source)
        hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
        hidden_states = self.layernorm_embedding(hidden_states)
        # Dropout on the embedded input; a no-op unless the module is training.
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if attention_mask is not None:
            attention_mask = self._expand_attention_mask(attention_mask, inputs_embeds.dtype)

        if head_mask is not None and head_mask.size()[0] != len(self.layers):
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.size()[0]}."
            )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_head_mask = head_mask[idx] if head_mask is not None else None

            # LayerDrop: while training, skip a layer with probability `layerdrop`.
            if self.training and torch.rand([]).item() < self.layerdrop:
                layer_attention = None
            else:
                if self.gradient_checkpointing and self.training:
                    # Recompute this layer's activations in the backward pass
                    # instead of storing them.
                    def run_layer(*inputs, _layer=encoder_layer):
                        return _layer(*inputs, output_attentions)

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        run_layer,
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask,
                        layer_head_mask=layer_head_mask,
                        output_attentions=output_attentions,
                    )
                hidden_states = layer_outputs[0]
                layer_attention = layer_outputs[1] if output_attentions else None

            if output_attentions:
                all_attentions = all_attentions + (layer_attention,)

        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in (hidden_states, encoder_states, all_attentions) if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )









In [13]:
class MultimodalMBartModel(MBartPreTrainedModel):
    """Base model wrapping a ``MultimodalMBartEncoder`` and its token-embedding table.

    NOTE(review): only the encoder is constructed here; no decoder is wired in.
    """

    def __init__(self, config: MBartConfig):
        super().__init__(config)

        # The embedding table must exist before the encoder is built, because
        # it is handed to the encoder as `embed_tokens`. Reading `self.shared`
        # without assigning it first raises AttributeError on nn.Module.
        self.shared = nn.Embedding(config.vocab_size, config.d_model, config.pad_token_id)

        self.encoder = MultimodalMBartEncoder(config, self.shared)

    def get_input_embeddings(self):
        """Return the shared token-embedding module."""
        return self.shared

    def set_input_embeddings(self, value):
        """Replace the shared token-embedding module and keep the encoder pointing at it."""
        self.shared = value
        self.encoder.embed_tokens = self.shared

    def get_encoder(self):
        """Return the encoder sub-module."""
        return self.encoder



In [21]:
class MultimodalMBartForConditionalGeneration(MBartPreTrainedModel):
    """Top-level wrapper that owns a ``MultimodalMBartModel`` as ``self.model``.

    Args:
        config: Configuration forwarded unchanged to the wrapped model.
    """

    def __init__(self, config: MBartConfig):
        super().__init__(config)

        self.model = MultimodalMBartModel(config)

        # TODO: the language-model head and its logits bias are still disabled:
        # self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        # self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        self.init_weights()


In [23]:

# Use the GPU when one is visible; otherwise fall back to the CPU so that
# `MODEL.to(DEVICE)` does not fail on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Single source of truth for the checkpoint used by both model and tokenizer.
CHECKPOINT = 'facebook/mbart-large-50-many-to-many-mmt'

MODEL = MultimodalMBartForConditionalGeneration.from_pretrained(CHECKPOINT)

print("Model loaded...\n")
MODEL.to(DEVICE)
TOKENIZER = MBart50TokenizerFast.from_pretrained(CHECKPOINT,
                                                 src_lang="en_XX",
                                                 tgt_lang="en_XX")
print("Tokenizer loaded...\n")

Model loaded...

Tokenizer loaded...

