In [4]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch 
from torch.nn.utils.rnn import pad_sequence
# Load pre-trained MBART model and tokenizer (MBART-50 for multilingual tasks)
model_name = "../mbart_model"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Example batch of input sentences in various languages
batch_sentences = [
    "Hello, how are you?",   # English
    "What have you been up to recently?", # French
    "Do you want to go for a run?",    # Spanish
]


# Tokenize the input batch of sentences
inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True)
print(f"input attention mask: {inputs['attention_mask']}")
# Generate translations (for example, to French) or any other target language
# Specify the target language for the model to generate in
forced_bos_token_id = tokenizer.lang_code_to_id["zh_CN"]

# Perform inference with the model to generate translations
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], 
                         forced_bos_token_id=forced_bos_token_id)

predictions = [] 
for i in range(len(outputs)): 
    predictions.append(outputs[i, :])

pad_tensor = torch.ones(200-len(predictions[0]))
predictions[0] = torch.cat((predictions[0],pad_tensor.long()),dim = 0)
predictions = pad_sequence(predictions,batch_first=True,padding_value=1)

# Decode the generated outputs back to text
translated_sentences = tokenizer.batch_decode(predictions, skip_special_tokens=True)

# Print the generated translations
for i, translation in enumerate(translated_sentences):
    print(f"Original: {batch_sentences[i]}")
    print(f"Translated: {translation}")
    print()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


input attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Original: Hello, how are you?
Translated: 你好,你好吗?

Original: What have you been up to recently?
Translated: 你最近做了些什么?

Original: Do you want to go for a run?
Translated: 你想跑吗?



In [41]:
import torch
from transformers import MBartTokenizer, MBartForConditionalGeneration

# Load the mBART model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")

# Example sentences
sentences = ["This is the first sentence.", "This is another sentence."]

# Tokenize the input with padding
inputs = tokenizer(sentences, padding=True, return_tensors="pt")

# input_ids shape: (batch_size, sequence_length)
# attention_mask shape: (batch_size, sequence_length)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Ensure the shapes are the same
print(f"input_ids shape: {input_ids.shape}")         # Expected: (batch_size, sequence_length)
print(f"attention_mask shape: {attention_mask.shape}")  # Expected: (batch_size, sequence_length)

# Forward pass through the model
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)

# Check model outputs
print(f"Logits shape: {outputs.logits.shape}")       # Expected: (batch_size, sequence_length, vocab_size)
# Create an attention mask with incorrect shape
incorrect_attention_mask = attention_mask[:, :-1]  # Removes one token from the attention mask, causing a mismatch

# Check the shapes
print(f"input_ids shape: {input_ids.shape}")                  # (batch_size, sequence_length)
print(f"incorrect_attention_mask shape: {incorrect_attention_mask.shape}")  # (batch_size, sequence_length - 1)

# This will raise an error due to shape mismatch
try:
    outputs = model(inputs_embeds=input_ids, attention_mask=incorrect_attention_mask, labels=input_ids)
except Exception as e:
    print(f"Error: {e}")

input_ids shape: torch.Size([2, 8])
attention_mask shape: torch.Size([2, 8])
Logits shape: torch.Size([2, 8, 250027])
input_ids shape: torch.Size([2, 8])
incorrect_attention_mask shape: torch.Size([2, 7])
Error: too many indices for tensor of dimension 2


In [46]:
import torch
from transformers import MBart50Tokenizer, MBartForConditionalGeneration

# Load the mBART model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Example sentences
sentences = ["This is the first sentence.", "This is another sentence."]

# Tokenize the input with padding
inputs = tokenizer(sentences, padding=True, return_tensors="pt")

# input_ids shape: (batch_size, sequence_length)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Get the input embeddings from the model's embedding layer
inputs_embeds = model.model.shared(input_ids)  # Shape: (batch_size, sequence_length, embed_dim)

# Ensure the shapes are the same
print(f"inputs_embeds shape: {inputs_embeds.shape}")         # (batch_size, sequence_length, embed_dim)
print(f"attention_mask shape: {attention_mask.shape}")       # (batch_size, sequence_length)

# Forward pass through the model using inputs_embeds instead of input_ids
outputs = model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=input_ids)

# Check model outputs
print(f"Logits shape: {outputs.logits.shape}")  # Expected: (batch_size, sequence_length, vocab_size)

# Create an attention mask with incorrect shape
incorrect_attention_mask = attention_mask[:, :-1]  # Removes one token from the attention mask

# Check the shapes
print(f"inputs_embeds shape: {inputs_embeds.shape}")                  # (batch_size, sequence_length, embed_dim)
print(f"incorrect_attention_mask shape: {incorrect_attention_mask.shape}")  # (batch_size, sequence_length - 1)

# This will raise an error due to shape mismatch
try:
    outputs = model(inputs_embeds=inputs_embeds, attention_mask=incorrect_attention_mask, labels=input_ids)
except Exception as e:
    print(f"Error: {e}")

inputs_embeds shape: torch.Size([2, 8, 1024])
attention_mask shape: torch.Size([2, 8])
Logits shape: torch.Size([2, 8, 250054])
inputs_embeds shape: torch.Size([2, 8, 1024])
incorrect_attention_mask shape: torch.Size([2, 7])
Error: Attention mask should be of size (2, 1, 8, 8), but is torch.Size([2, 1, 7, 7])


In [48]:
## Check on dataloader
from torch.utils.data import DataLoader
from train_sign_utils import * 
from signdata import SignTransDataset
import torch
import multiprocessing

multiprocessing.set_start_method('fork', force=True)
accelerator = Accelerator()
config = OmegaConf.load("configs/Sign2Text_CSL_config.yaml")
logger = setup_logger(name="Sign2Text", log_level="INFO",
        output_file=f"./log{accelerator.process_index}.txt")
trainloader, devloader, testloader = create_signloader(config, logger,accelerator, tokenizer)

for i, (src, tgt) in enumerate(tqdm(trainloader, desc=f"Training!")):
    batch = src['input_ids']
    src_length = src['src_length_batch']
    tgt_attn = tgt.attention_mask
    tgt_input = tgt['input_ids']
    input_attn = src['attention_mask']
    print(batch.shape)
    print(src_length)
    print(tgt_attn.shape)
    print(tgt_input.shape)
    print(input_attn.shape)
    break

AttributeError: type object 'Accelerator' has no attribute 'to'

In [51]:
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from unittest.mock import MagicMock

# Sample tokenizer mock (simulates tokenizer behavior)
class MockTokenizer:
    def as_target_tokenizer(self):
        return self

    def __call__(self, texts, return_tensors="pt", padding=True, truncation=True):
        # Mock tokenization: Returns a tensor with random token IDs and attention masks
        max_len = max(len(t) for t in texts)  # Simulate max token length in batch
        input_ids = [torch.randint(1, 100, (len(t),)) for t in texts]
        input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
        attention_mask = (input_ids_padded != 0).long()
        return {
            'input_ids': input_ids_padded,
            'attention_mask': attention_mask
        }

# Sample collate function (from your code)
def collate_fn(batch, tokenizer):
    tgt_batch, img_tmp, src_length_batch, name_batch = [], [], [], []

    # Separate the video frames and labels
    for name_sample, img_sample, tgt_sample in batch:
        name_batch.append(name_sample)
        img_tmp.append(img_sample)
        tgt_batch.append(tgt_sample)

    max_len = max([len(vid) for vid in img_tmp])
    video_length = torch.LongTensor([int(np.ceil(len(vid) / 4.0) * 4 + 16) for vid in img_tmp])
    left_pad = 8
    right_pad = int(np.ceil(max_len / 4.0)) * 4 - max_len + 8
    max_len = max_len + left_pad + right_pad

    padded_video = [torch.cat(
        (
            vid[0][None].expand(left_pad, -1, -1, -1),  # Padding at the start with the first frame.
            vid,
            vid[-1][None].expand(max_len - len(vid) - left_pad, -1, -1, -1),  # Padding at the end.
        ), dim=0) for vid in img_tmp]
    print([padded_video[i].shape for i in range(len(padded_video))])
    img_tmp = [padded_video[i][0:video_length[i], :, :, :] for i in range(len(padded_video))]
    print([img_tmp[i].shape for i in range(len(img_tmp))])

    for i in range(len(img_tmp)):
        src_length_batch.append(len(img_tmp[i]))
    src_length_batch = torch.tensor(src_length_batch)

    img_batch = torch.cat(img_tmp, 0)

    new_src_lengths = (((src_length_batch - 5 + 1) / 2) - 5 + 1) / 2
    new_src_lengths = new_src_lengths.long()

    mask_gen = []
    for i in new_src_lengths:
        tmp = torch.ones([i]) + 7
        print(f"tmp: {tmp}")
        mask_gen.append(tmp)
    mask_gen = pad_sequence(mask_gen, padding_value=0, batch_first=True)
    print(f"mask_gen: {mask_gen}")
    img_padding_mask = (mask_gen != 0).long()

    # Tokenize the text labels
    tgt_input = tokenizer(tgt_batch, return_tensors="pt", padding=True, truncation=True)

    # Print the shapes of the resulting tensors
    print(f"Shape of img_batch (stacked videos): {img_batch.shape}")
    print(f"Shape of src_length_batch: {src_length_batch.shape}")
    print(f"Shape of new_src_lengths: {new_src_lengths.shape}")
    print(f"Shape of img_padding_mask: {img_padding_mask.shape}")
    print(f"Shape of tgt_input['input_ids']: {tgt_input['input_ids'].shape}")
    print(f"Shape of tgt_input['attention_mask']: {tgt_input['attention_mask'].shape}")
    print("Image padding masks: ", img_padding_mask)
    print("New src lengths: ", new_src_lengths)


    src_input = {
        'input_ids': img_batch,
        'attention_mask': img_padding_mask,
        'name_batch': name_batch,
        'src_length_batch': src_length_batch,
        'new_src_length_batch': new_src_lengths
    }

    return src_input, tgt_input

# Test the collate function
def test_collate_fn():
    # Create mock video data (batch of 3 videos with different lengths)
    video1 = torch.randn(30, 64, 64, 3)  # 10 frames
    video2 = torch.randn(20, 64, 64, 3)  # 12 frames
    video3 = torch.randn(10, 64, 64, 3)   # 8 frames

    # Corresponding text labels
    labels = ["sign language translation one", "sign language translation two", "sign language translation three"]

    # Create a batch (list of tuples: name_sample, video_sample, label)
    batch = [
        ("sample1", video1, labels[0]),
        ("sample2", video2, labels[1]),
        ("sample3", video3, labels[2])
    ]

    # Instantiate the mock tokenizer
    tokenizer = MockTokenizer()

    # Call the collate function
    src_input, tgt_input = collate_fn(batch, tokenizer)

    # Check the output shapes
    print("Test completed.")
    
# Run the test case
test_collate_fn()

[torch.Size([48, 64, 64, 3]), torch.Size([48, 64, 64, 3]), torch.Size([48, 64, 64, 3])]
[torch.Size([48, 64, 64, 3]), torch.Size([36, 64, 64, 3]), torch.Size([28, 64, 64, 3])]
tmp: tensor([8., 8., 8., 8., 8., 8., 8., 8., 8.])
tmp: tensor([8., 8., 8., 8., 8., 8.])
tmp: tensor([8., 8., 8., 8.])
mask_gen: tensor([[8., 8., 8., 8., 8., 8., 8., 8., 8.],
        [8., 8., 8., 8., 8., 8., 0., 0., 0.],
        [8., 8., 8., 8., 0., 0., 0., 0., 0.]])
Shape of img_batch (stacked videos): torch.Size([112, 64, 64, 3])
Shape of src_length_batch: torch.Size([3])
Shape of new_src_lengths: torch.Size([3])
Shape of img_padding_mask: torch.Size([3, 9])
Shape of tgt_input['input_ids']: torch.Size([3, 31])
Shape of tgt_input['attention_mask']: torch.Size([3, 31])
Image padding masks:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0]])
New src lengths:  tensor([9, 6, 4])
Test completed.


## Testing LLM adaptor 2 shape


In [52]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

# Define a dummy PAD_IDX for padding purposes
PAD_IDX = 0

class LLMAdapter2(nn.Module):
    '''
    LLM adapter aims to capture temporal relations and transform 32 tokens into 1024 tokens.
    This version introduces an additional projection layer between the two convolution layers.
    '''
    def __init__(self, num_tokens=32, hidden_dim=1024, kernel_size=5):
        super(LLMAdapter2, self).__init__()
        
        # Store parameters
        self.num_tokens = num_tokens
        self.hidden_dim = hidden_dim
        
        # First projection from input tokens to hidden_dim/2
        self.proj = nn.Linear(self.num_tokens, self.hidden_dim // 2)

        # First convolutional block
        self.conv_block_1 = nn.Sequential(
            nn.Conv1d(self.hidden_dim // 2, self.hidden_dim // 2, kernel_size=kernel_size,stride =1, padding=0),
            nn.BatchNorm1d(self.hidden_dim // 2),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2, ceil_mode=False)
        )

        # New projection layer between convolution layers
        self.intermediate_proj = nn.Linear(self.hidden_dim // 2, self.hidden_dim)

        # Second convolutional block
        self.conv_block_2 = nn.Sequential(
            nn.Conv1d(self.hidden_dim, self.hidden_dim, kernel_size=kernel_size, stride=1, padding=0),
            nn.BatchNorm1d(self.hidden_dim),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2, ceil_mode=False)
        )


    def forward(self, x, src_length):
        # Input shape: (batch_size, num_frames, num_tokens)
        
        # Split the input into individual batches according to src_length
        start = 0
        x_batch = []
        for length in src_length:
            end = start + length
            x_batch.append(x[start:end])
            start = end
        
        # Pad sequences to ensure uniform batch sizes
        x = pad_sequence(x_batch, padding_value=PAD_IDX, batch_first=True)
        print(f"After padding: {x.shape}")  # Check padding result (batch_size, num_frames, num_tokens)
        
        # Apply the initial projection layer
        x = self.proj(x)  # Shape: (batch_size, num_frames, hidden_dim / 2)
        print(f"After initial projection: {x.shape}")  # Should be (batch_size, num_frames, 512)
        
        # Permute to (batch_size, hidden_dim / 2, num_frames) for Conv1d
        x = x.permute(0, 2, 1)
        print(f"After permute (before first conv): {x.shape}")  # Should be (batch_size, 512, num_frames)
        
        # First convolutional block
        x = self.conv_block_1(x)  # Shape: (batch_size, hidden_dim / 2, reduced_num_frames)
        print(f"After first conv block: {x.shape}")  # Check after first conv
        
        # Apply the intermediate projection layer
        x = x.permute(0, 2, 1)  # Back to (batch_size, reduced_num_frames, hidden_dim / 2)
        x = self.intermediate_proj(x)  # Shape: (batch_size, reduced_num_frames, hidden_dim)
        print(f"After intermediate projection: {x.shape}")  # Should be (batch_size, reduced_num_frames, 1024)
        x = x.permute(0, 2, 1)  # Back to (batch_size, hidden_dim, reduced_num_frames)
        print(f"After permute (before second conv): {x.shape}")  # Check before second conv
        
        # Second convolutional block
        x = self.conv_block_2(x)  # Shape: (batch_size, hidden_dim, further_reduced_num_frames)
        print(f"After second conv block: {x.shape}")  # Check after second conv
        
        # Convert back to (batch_size, further_reduced_num_frames, hidden_dim)
        x = x.permute(0, 2, 1)
        print(f"Final output shape: {x.shape}")  # Should be (batch_size, further_reduced_num_frames, 1024)

        return x

# Create an instance of LLMAdapter2
model = LLMAdapter2()

# Test input
batch_size = 10
num_frames = 36 # Let's assume each sequence has 15 frames
num_tokens = 32  # As specified in the model

# Random test tensor simulating a batch of 10 sequences, each with 15 frames and 32 tokens
test_input = torch.rand((batch_size * num_frames, num_tokens))

# Source lengths for each batch (assuming all sequences have 15 frames)
src_length = torch.tensor([num_frames] * batch_size)

# Forward pass
output = model(test_input, src_length)

After padding: torch.Size([10, 36, 32])
After initial projection: torch.Size([10, 36, 512])
After permute (before first conv): torch.Size([10, 512, 36])
After first conv block: torch.Size([10, 512, 16])
After intermediate projection: torch.Size([10, 16, 1024])
After permute (before second conv): torch.Size([10, 1024, 16])
After second conv block: torch.Size([10, 1024, 6])
Final output shape: torch.Size([10, 6, 1024])


## Testing LLM adaptor 3 shape

In [2]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

# Define a dummy PAD_IDX for this example
PAD_IDX = 0

class LLMAdapter3(nn.Module):
    def __init__(self, num_tokens=32, hidden_dim=1024, kernel_size=5):
        super(LLMAdapter3, self).__init__()
        self.num_tokens = num_tokens
        self.hidden_dim = hidden_dim
        
        # Temporal convolution over the time dimension
        self.temporal_conv = nn.Sequential(
            nn.Conv1d(self.num_tokens, self.num_tokens * 2, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm1d(self.num_tokens * 2),  # Channels must match Conv1d output channels
            nn.ReLU(inplace=True),
            # Reduce kernel size for pooling to avoid sequence collapse
            nn.AvgPool1d(kernel_size=2, ceil_mode=False),  

            nn.Conv1d(self.num_tokens * 2, self.num_tokens * 4, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm1d(self.num_tokens * 4),  # Channels must match Conv1d output channels
            nn.ReLU(inplace=True),
            nn.AvgPool1d(kernel_size=2, ceil_mode=False)  # Adjusted pooling to avoid reducing size to zero
        )
        
        # Final projection layer
        self.final_proj = nn.Sequential(
            nn.Linear(self.num_tokens * 4, self.hidden_dim)
        )
        self.out = nn.Sequential(nn.BatchNorm1d(self.hidden_dim),
            nn.ReLU(inplace=True))

    def forward(self, x, src_length):
        start = 0
        x_batch = []
        for length in src_length:
            end = start + length
            x_batch.append(x[start:end])
            start = end
        print(f"Before padding: {x.shape}") 
        x = pad_sequence(x_batch, padding_value=PAD_IDX, batch_first=True)
        print(f"After padding: {x.shape}")  # Print shape after padding
        
        # Permute to match Conv1d expected shape: (batch_size, channels, sequence_length)
        x = x.permute(0, 2, 1)
        print(f"After permute: {x.shape}")  # Shape should now be (batch_size, num_tokens, num_frames)
        
        # Apply temporal convolution
        x = self.temporal_conv(x)
        print(f"After temporal_conv: {x.shape}")  # Check shape after convolution
        
        # Permute back to (batch_size, sequence_length, hidden_dim)
        x = x.permute(0, 2, 1)
        print(f"After second permute: {x.shape}")  # Shape should be (batch_size, num_frames, num_tokens*4)
        
        # Apply final projection (we need to flatten or reshape input to match Linear input requirements)
        batch_size, seq_len, hidden_dim = x.shape
        x = self.final_proj(x)
        #x = self.final_proj(x.reshape(batch_size * seq_len, hidden_dim))
        print(f"After final_proj: {x.shape}")  # Check final shape

        print(f"before out shape : {x.shape}")
        x = self.out(x.permute(0, 2, 1)).permute(0, 2, 1)
        return x

# Create an instance of LLMAdapter3
model = LLMAdapter3()

# Test input
batch_size = 10
num_frames = 48 # Let's assume each sequence has 15 frames
num_tokens = 32  # As specified in the model

# Random test tensor simulating a batch of 10 sequences, each with 15 frames and 32 tokens
test_input = torch.rand((batch_size * num_frames, num_tokens))

# Source lengths for each batch (assuming all sequences have 15 frames)
src_length = torch.tensor([num_frames] * batch_size)

# Forward pass
output = model(test_input, src_length)

Before padding: torch.Size([480, 32])
After padding: torch.Size([10, 48, 32])
After permute: torch.Size([10, 32, 48])
After temporal_conv: torch.Size([10, 128, 9])
After second permute: torch.Size([10, 9, 128])
After final_proj: torch.Size([10, 9, 1024])
before out shape : torch.Size([10, 9, 1024])


In [3]:
import torch

# Define constants
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PAD_IDX = 0  # Assuming PAD_IDX is 0 (the usual padding token in tokenized sequences)

# Sample sequences (src and tgt) with padding
# Here 1, 2, 3 are tokens, and 0 is the padding token (PAD_IDX)
src = torch.tensor([[1, 2, 3, 0], [1, 2, 0, 0]], device=DEVICE)  # shape: (batch_size=2, src_seq_len=4)
tgt = torch.tensor([[1, 2, 0], [1, 0, 0]], device=DEVICE)        # shape: (batch_size=2, tgt_seq_len=3)

# Function to generate a square subsequent mask
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

# Function to create masks for src and tgt sequences
def create_mask(src, tgt):
    src_seq_len = src.shape[1]  # Take the sequence length dimension
    tgt_seq_len = tgt.shape[1]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)  # shape: (src_seq_len, batch_size)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)  # shape: (tgt_seq_len, batch_size)
    
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Call create_mask function
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

# Output the masks to verify them
print("Source Mask (src_mask):")
print(src_mask)

print("\nTarget Mask (tgt_mask):")
print(tgt_mask)

print("\nSource Padding Mask (src_padding_mask):")
print(src_padding_mask)

print("\nTarget Padding Mask (tgt_padding_mask):")
print(tgt_padding_mask)

Source Mask (src_mask):
tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

Target Mask (tgt_mask):
tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])

Source Padding Mask (src_padding_mask):
tensor([[False, False],
        [False, False],
        [False,  True],
        [ True,  True]])

Target Padding Mask (tgt_padding_mask):
tensor([[False, False],
        [False,  True],
        [ True,  True]])


In [2]:
import torch

# Create a 4 by 12 matrix
matrix = torch.arange(4 * 12).reshape(4, 12)
print("Original matrix:")
print(matrix)

# Reshape the matrix to 6 by 8
reshaped_matrix = matrix.reshape(-1)
print("\nReshaped matrix:")
print(reshaped_matrix.shape)

Original matrix:
tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
        [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
        [36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]])

Reshaped matrix:
torch.Size([48])


In [11]:
logits= torch.arange(4 * 12 * 50625).reshape(4,12,50625)
logits = logits.reshape(-1,logits.shape[-1])
print(logits.shape)

torch.Size([48, 50625])
