### Testing the Causal model


In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.models.mbart.modeling_mbart import shift_tokens_right
tokenizer = AutoTokenizer.from_pretrained("gemma_instruct_7b")
model = AutoModelForCausalLM.from_pretrained("gemma_instruct_7b")

  from .autonotebook import tqdm as notebook_tqdm
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


In [5]:
input_text = "Übersetzen Sie den gegebenen Satz ins Chinesische: Mutter isst gerne Brathähnchen."

input_ids = tokenizer(input_text, return_tensors="pt")
print(input_ids)
outputs = model.generate(**input_ids, max_length = 50 )
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

{'input_ids': tensor([[     2,  33571,  41516,   3670,   1600, 172299,  78450,   2029, 212951,
           6765, 235292,  45310,    603,    490,  36236,   9070,    753,  42442,
           1748, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
<bos>Übersetzen Sie den gegebenen Satz ins Chinesische: Mutter isst gerne Brathähnchen.

Mutter isst gerne Brathähnchen.

→ 母亲喜欢吃烤鸡。<eos>


In [126]:
outputs

tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,    109,  50039,
         235465,  56506]])

In [3]:
input_text = "Translate the given sentence into Chinese: I like to eat fried chicken."
input_ids = tokenizer(input_text, return_tensors="pt")

print(input_ids)

{'input_ids': tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,    590,
           1154,    577,   7812,  30196,  12254, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [4]:
# Extract input embeddings using the model's embedding layer
import torch 
with torch.no_grad():
    # Get the input embeddings
    input_embeds = model.get_input_embeddings()(input_ids["input_ids"])
    print(input_embeds.shape)
#output = model(inputs_embeds = input_embeds)

torch.Size([1, 15, 2048])


In [5]:
output = model(input_ids = input_ids.input_ids, labels = input_ids.input_ids)

### Testing gemma masking for causal finetuning

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("gemma_instruct_2b")
model = AutoModelForCausalLM.from_pretrained(
    "gemma_instruct_2b"
)

In [81]:
tokenizer(" ")

{'input_ids': [2, 235248], 'attention_mask': [1, 1]}

In [163]:
print(tokenizer.batch_decode(torch.tensor([1,  30485], dtype=torch.long)))
print(tokenizer.batch_decode(torch.tensor([1,  235248], dtype=torch.long)))
print(tokenizer.batch_decode(torch.tensor([1,  19891], dtype=torch.long)))
print(tokenizer.batch_decode(torch.tensor([235248,  42130,  19891, 236280, 238069, 237619, 235362], dtype=torch.long)))



['<eos>', ' 你']
['<eos>', ' ']
['<eos>', '喜欢']
[' ', '妈妈', '喜欢', '吃', '炸', '鸡', '。']


lowest loss: 
concated then tokenized tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,   1646,
           2182,    577,   7812,  30196,  53190, 235265,  30485,  19891, 236280,
         238069, 237619, 235362]])
         
Remember to absorb the space 

In [301]:


input_text = "Translate the given sentence into Chinese: Mother loves to eat fried chickens."
ans_text = " 妈妈喜欢吃炸鸡。"
combined_text = input_text +  ans_text
input_ids = tokenizer(input_text, return_tensors="pt")
combined_ids = tokenizer(combined_text, return_tensors="pt")
ans_ids = tokenizer(ans_text, return_tensors="pt")

#19891 --》 194816



labels = combined_ids["input_ids"][0].clone()
labels[:len(input_ids.input_ids[0])] = -100
print(f"input_ids: {input_ids.input_ids.shape}")
print(f"combined_ids: {combined_ids.input_ids.shape}")
print(f"ans_text: {ans_ids.input_ids.shape}")
print(len(input_ids["input_ids"][0]))
print(len(combined_ids["input_ids"][0]))
print(labels)
print(f"len(labels): {len(labels)}")
print(f"len(combined_ids): {len(combined_ids['input_ids'][0])}")
print(f"len(ans_ids): {len(ans_ids['input_ids'][0])}")
concated_ids = torch.cat([input_ids["input_ids"], ans_ids["input_ids"][:,1:]], dim = 1)
print(f"concated_ids: {concated_ids.shape}")    
print(tokenizer.batch_decode(concated_ids , skip_special_tokens=False))
print(tokenizer.batch_decode(combined_ids["input_ids"], skip_special_tokens=False))
#print(model(input_ids = combined_ids.input_ids, labels = labels.unsqueeze(0))) 
print(concated_ids)
print(combined_ids["input_ids"])
print(model(input_ids = concated_ids, labels = labels.unsqueeze(0)).loss)
print(model(input_ids = combined_ids['input_ids'], labels = labels.unsqueeze(0)).loss)

input_ids: torch.Size([1, 15])
combined_ids: torch.Size([1, 22])
ans_text: torch.Size([1, 8])
15
22
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
        236280, 238069, 237619, 235362])
len(labels): 22
len(combined_ids): 22
len(ans_ids): 8
concated_ids: torch.Size([1, 22])
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor(

In [333]:


input_text = "Translate the given sentence into Chinese: I enjoy eating fish."
ans_text = " 我喜欢吃鱼。"
combined_text = input_text +  ans_text
input_ids = tokenizer(input_text, return_tensors="pt")
combined_ids = tokenizer(combined_text, return_tensors="pt")
ans_ids = tokenizer(ans_text, return_tensors="pt")

#19891 --》 194816



labels = combined_ids["input_ids"][0].clone()
labels[:len(input_ids.input_ids[0])] = -100
print(f"input_ids: {input_ids.input_ids.shape}")
print(f"combined_ids: {combined_ids.input_ids.shape}")
print(f"ans_text: {ans_ids.input_ids.shape}")
print(len(input_ids["input_ids"][0]))
print(len(combined_ids["input_ids"][0]))
print(labels)
print(f"len(labels): {len(labels)}")
print(f"len(combined_ids): {len(combined_ids['input_ids'][0])}")
print(f"len(ans_ids): {len(ans_ids['input_ids'][0])}")
concated_ids = torch.cat([input_ids["input_ids"], ans_ids["input_ids"][:,1:]], dim = 1)
print(f"concated_ids: {concated_ids.shape}")    
print(tokenizer.batch_decode(concated_ids , skip_special_tokens=False))
print(tokenizer.batch_decode(combined_ids["input_ids"], skip_special_tokens=False))
#print(model(input_ids = combined_ids.input_ids, labels = labels.unsqueeze(0))) 
print(concated_ids)
print(combined_ids["input_ids"])
print(model(input_ids = concated_ids, labels = labels.unsqueeze(0)).loss)
print(model(input_ids = combined_ids['input_ids'], labels = labels.unsqueeze(0)).loss)

input_ids: torch.Size([1, 13])
combined_ids: torch.Size([1, 18])
ans_text: torch.Size([1, 6])
13
18
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,  25736,  19891, 236280, 237098, 235362])
len(labels): 18
len(combined_ids): 18
len(ans_ids): 6
concated_ids: torch.Size([1, 18])
['<bos>Translate the given sentence into Chinese: I enjoy eating fish. 我喜欢吃鱼。']
['<bos>Translate the given sentence into Chinese: I enjoy eating fish. 我喜欢吃鱼。']
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,    590,
           4313,  12150,   5001, 235265,  25736,  19891, 236280, 237098, 235362]])
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,    590,
           4313,  12150,   5001, 235265,  25736,  19891, 236280, 237098, 235362]])
tensor(3.6193, grad_fn=<NllLossBackward0>)
tensor(3.6193, grad_fn=<NllLossBackward0>)


[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
         236280, 238069, 237619, 235362],

Note that there is a difference between concatenating them as a combined sentence then tokenising versus tokenising individual parts and then concatenating  

### Test Gemma with embeddings and see if the loss is still the same

In [334]:
with torch.no_grad():
    # Get the input embeddings
    input_embeds = model.get_input_embeddings()(input_ids["input_ids"])
    print(input_embeds.shape)
    ans_embeds = model.get_input_embeddings()(ans_ids["input_ids"])
    print(ans_embeds.shape)
    combined_embeds = model.get_input_embeddings()(combined_ids["input_ids"])
    combined_embed_v2 = torch.cat((input_embeds, ans_embeds[:,1:]), dim = 1)
    print(combined_embeds.shape)
    print(combined_embed_v2.shape)

print(model(inputs_embeds = combined_embed_v2, labels = labels.unsqueeze(0)).loss)

torch.Size([1, 13, 2048])
torch.Size([1, 6, 2048])
torch.Size([1, 18, 2048])
torch.Size([1, 18, 2048])
tensor(3.6193, grad_fn=<NllLossBackward0>)


In [312]:


input_text = "Translate the given sentence into Chinese: Mother loves to eat fried chickens."
ans_text = " 妈妈喜欢吃炸鸡。"
combined_text = input_text +  ans_text
input_ids = tokenizer(input_text, return_tensors="pt")
combined_ids = tokenizer(combined_text, return_tensors="pt")
ans_ids = tokenizer(ans_text, return_tensors="pt")

#19891 --》 194816

space_token = 235248 

labels = combined_ids["input_ids"][0].clone()
labels[:len(input_ids.input_ids[0])] = -100
print(f"input_ids: {input_ids.input_ids.shape}")
print(f"combined_ids: {combined_ids.input_ids.shape}")
print(f"ans_text: {ans_ids.input_ids.shape}")
print(len(input_ids["input_ids"][0]))
print(len(combined_ids["input_ids"][0]))
print(labels)
print(f"len(labels): {len(labels)}")
print(f"len(combined_ids): {len(combined_ids['input_ids'][0])}")
print(f"len(ans_ids): {len(ans_ids['input_ids'][0])}")
concated_ids = torch.cat([input_ids["input_ids"], ans_ids["input_ids"][:,1:]], dim = 1)
print(f"concated_ids: {concated_ids.shape}")    
print(tokenizer.batch_decode(concated_ids , skip_special_tokens=False))
print(tokenizer.batch_decode(combined_ids["input_ids"], skip_special_tokens=False))
#print(model(input_ids = combined_ids.input_ids, labels = labels.unsqueeze(0))) 
print(concated_ids)
print(combined_ids["input_ids"])
print(model(input_ids = concated_ids, labels = labels.unsqueeze(0)).loss)
print(model(input_ids = combined_ids['input_ids'], labels = labels.unsqueeze(0)).loss)


input_ids: torch.Size([1, 15])
combined_ids: torch.Size([1, 22])
ans_text: torch.Size([1, 8])
15
22
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
        236280, 238069, 237619, 235362])
len(labels): 22
len(combined_ids): 22
len(ans_ids): 8
concated_ids: torch.Size([1, 22])
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor(

In [322]:
test_embeds = model.get_input_embeddings()(combined_ids['input_ids'])

In [209]:
print(labels.unsqueeze(0).shape)

torch.Size([1, 21])


Even though there is a difference in the loss values, let's just go with it first. （check if still applicable）

My stance now is that we put the space before the target sentence

In [106]:
print(tokenizer.decode(outputs[0][len(input_ids.input_ids[0]):], skip_special_tokens=False))

这句话的意思是：我喜欢吃炸鸡。

This translation is correct. It accurately captures the meaning of the sentence "I like to eat fried chicken."<eos>


### Creating Causal inputs and Causal outputs 

In [None]:
def create_causal_inputs(visual_feats ,visual_attn,  tgt_ids): 
    prompt = "Translate the given sentence into Chinese:" # The space will be added by the model itself
    prompt_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    prompt_len = len(prompt_ids["input_ids"][0])
    #print(f"prompt_len: {prompt_len}")
    #print(f"visual feats shape: {visual_feats.shape}")
    prompt_embeds = model.get_input_embeddings()(prompt_ids["input_ids"]).squeeze()
    tgt_embeds = model.get_input_embeddings()(tgt_ids["input_ids"])

    #print(f"tgt embeds: {tgt_embeds.shape}")
    tgt_attn = tgt_ids["attention_mask"]
    #print(f"tgt attn: {tgt_attn.shape}")
    new_input_embeds = [] 
    new_labels = []

    for i in range(tgt_embeds.shape[0]): # batch size
        curr_vis_feats = visual_feats[i] # take the current visual features
        curr_vis_attn = visual_attn[i] # take the current visual attention mask
        curr_vis_feats = curr_vis_feats[curr_vis_attn==1] # only take the visual features that are attended to

        curr_vis_len = curr_vis_feats.shape[0] # get the length of the visual features

        curr_tgt_embeds = tgt_embeds[i] # take the current target embeddings

        curr_tgt_feats = curr_tgt_embeds[tgt_attn[i]==1][1:] # only take the target embeddings that are attended to, remove the bos token 
 
        combined_embeds = torch.cat((prompt_embeds, curr_vis_feats, curr_tgt_feats), dim = 0) #Concat all the embeddings
        #print(f"combined_embeds: {combined_embeds}")
        new_input_embeds.append(combined_embeds) 
        negate_tgt = torch.full((1, prompt_len + curr_vis_len), -100) # create the -100 labels for the model (only the target text is not -100)
        #print(f"negate_tgt: {negate_tgt.shape}")
        labels =torch.cat([negate_tgt,  tgt_ids["input_ids"][i][tgt_attn[i]==1][1:].clone().unsqueeze(0)], dim =1).permute(1,0) # Concat both the -100s and the target text

        new_labels.append(labels) # append the labels
        assert labels.shape[0] == len(combined_embeds), f"len labels: {labels.shape} vs len combined_embeds: {combined_embeds.shape}" 
        # assert the length of the labels is the same as the combined embeddings
    
    # perform padding for the batch before returning
    new_input_embeds = torch.nn.utils.rnn.pad_sequence(new_input_embeds, batch_first=True, padding_value=0)
    #print("HERE", [labels.shape for labels in new_labels])
    new_labels = torch.nn.utils.rnn.pad_sequence(new_labels, batch_first=True, padding_value=-100).squeeze()
    new_labels[new_labels==0]=-100
    return new_input_embeds, new_labels

'''Batch of 1 test case'''
input_text = " Mother loves to eat fried chickens."
input_ids = tokenizer(input_text, return_tensors="pt")
print(input_ids)

input_embeds = model.get_input_embeddings()(input_ids["input_ids"])[:, 1: ] # special case need to cut the extra bos token in front for the english sentence too
ans_text = " 妈妈喜欢吃炸鸡。"
tgt_ids = tokenizer(ans_text , return_tensors="pt", add_special_tokens=True)
new_input_embeds , new_labels=  create_causal_inputs(input_embeds,input_ids.attention_mask[:, 1:],  tgt_ids)

print(model(inputs_embeds= new_input_embeds, labels =new_labels).loss) 

'''Batch of 1 test case NUMBER 2'''
input_text = " I enjoy eating fish."
input_ids = tokenizer(input_text, return_tensors="pt")
print(input_ids)

input_embeds = model.get_input_embeddings()(input_ids["input_ids"])[:, 1: ] # special case need to cut the extra bos token in front for the english sentence too
ans_text = " 我喜欢吃鱼。"
tgt_ids = tokenizer(ans_text , return_tensors="pt", add_special_tokens=True)
new_input_embeds , new_labels=  create_causal_inputs(input_embeds, input_ids.attention_mask[:, 1:], tgt_ids)
print(model(inputs_embeds= new_input_embeds, labels =new_labels).loss) 
    
'''Batch of 2 test case'''
input_text = [" Mother loves to eat fried chickens."," I enjoy eating fish."]
input_ids = tokenizer(input_text, return_tensors="pt", padding = True )
print("input ids", input_ids)
input_embeds = model.get_input_embeddings()(input_ids["input_ids"])[:, 1: ] # special case need to cut the extra bos token in front for the english sentence too
ans_text = [" 妈妈喜欢吃炸鸡。", " 我喜欢吃鱼。"]
tgt_tokenizer = AutoTokenizer.from_pretrained("gemma_instruct_2b")
tgt_tokenizer.padding_side = "right"
tgt_ids = tgt_tokenizer(ans_text , return_tensors="pt", add_special_tokens=True, padding = True)
print("tgt_ids ", tgt_ids)
new_input_embeds , new_labels =  create_causal_inputs(input_embeds, input_ids.attention_mask [:, 1:], tgt_ids)
print(new_input_embeds.shape)
print(new_labels)
model(inputs_embeds= new_input_embeds, labels =new_labels).loss

{'input_ids': tensor([[     2,  17025,  16147,    577,   7812,  30196,  53190, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor(1.4907, grad_fn=<NllLossBackward0>)
{'input_ids': tensor([[     2,    590,   4313,  12150,   5001, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
tensor(3.6193, grad_fn=<NllLossBackward0>)
input ids {'input_ids': tensor([[     2,  17025,  16147,    577,   7812,  30196,  53190, 235265],
        [     2,    590,   4313,  12150,   5001, 235265,      0,      0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]])}
tgt_ids  {'input_ids': tensor([[     2, 235248,  42130,  19891, 236280, 238069, 237619, 235362],
        [     2,  25736,  19891, 236280, 237098, 235362,      0,      0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]])}
torch.Size([2, 22, 2048])
tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,  

tensor(2.3777, grad_fn=<NllLossBackward0>)

In [323]:
print(test_embeds.shape)
test_embeds


torch.Size([1, 22, 2048])


tensor([[[ 0.1035,  0.0052, -0.0330,  ..., -0.0170, -0.0087, -0.0099],
         [ 0.2734, -0.0055,  0.0292,  ..., -0.0077,  0.0947,  0.0315],
         [ 0.2373, -0.0249, -0.0918,  ..., -0.0045,  0.0388,  0.0212],
         ...,
         [ 0.2490, -0.0132, -0.0410,  ..., -0.0186,  0.0549,  0.0041],
         [ 0.2344, -0.0138,  0.0146,  ..., -0.0047,  0.0442,  0.0232],
         [ 0.1895, -0.0265, -0.0408,  ..., -0.0092, -0.0200, -0.0006]]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
print(new_input_embeds.shape)
new_input_embeds

torch.Size([22, 2048])


[tensor([[ 0.1035,  0.0052, -0.0330,  ..., -0.0170, -0.0087, -0.0099],
         [ 0.2734, -0.0055,  0.0292,  ..., -0.0077,  0.0947,  0.0315],
         [ 0.2373, -0.0249, -0.0918,  ..., -0.0045,  0.0388,  0.0212],
         ...,
         [ 0.2197,  0.0217, -0.1196,  ...,  0.0535, -0.0075, -0.0242],
         [ 0.2197,  0.0217, -0.1196,  ...,  0.0535, -0.0075, -0.0242],
         [ 0.2197,  0.0217, -0.1196,  ...,  0.0535, -0.0075, -0.0242]],
        grad_fn=<CatBackward0>)]

In [316]:
new_labels

tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
        236280, 238069, 237619, 235362])

In [262]:
new_attention_mask

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
         1., 0., 0.]])

In [264]:
new_labels

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,  25736,  19891, 236280,
         238069, 237619, 235362],
        [  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,  25736, 113066, 107590, 235362,
           -100,   -100,   -100]])

In [259]:
print(new_input_embeds.shape)

torch.Size([2, 21, 2048])


In [210]:
new_attention_mask

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
         1., 1., 1., 0.]])

### Testing out M2M seq2seq training 

In [52]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
chinese_text = "生活就像一盒巧克力。"

model = M2M100ForConditionalGeneration.from_pretrained("m2m_1.2b")
tokenizer = M2M100Tokenizer.from_pretrained("m2m_1.2b")

# # translate Hindi to French
# tokenizer.src_lang = "hi"
# encoded_hi = tokenizer(hi_text, return_tensors="pt")
# generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr"))
# tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# # => "La vie est comme une boîte de chocolat."

# translate Chinese to English
tokenizer.src_lang = "zh"
encoded_zh = tokenizer(chinese_text, return_tensors="pt")
generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Life is like a box of chocolate."




['Life is like a box of chocolate.']

In [338]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load Gemma tokenizer and model
model_name = "gemma_instruct_2b"  # Replace with your specific Gemma model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [345]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",   # Type of task
    inference_mode=False,    # Enable training
    r=8,                     # Low-rank dimension
    lora_alpha=16,           # Scaling factor
    lora_dropout=0.1,        # Dropout rate for LoRA
    target_modules=["q_proj", "v_proj"]  # Target modules (Gemma specific)
)

# Apply LoRA to the model
lora_model = get_peft_model(model, lora_config)

# Check trainable parameters
lora_model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 1,243,009,024 || trainable%: 0.2847


In [341]:
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pr

In [344]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load M2M100 model and tokenizer
model_name = "m2m_1.2b"  # Or "facebook/m2m100_1.2B"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",   # Task type for sequence-to-sequence models
    inference_mode=False,       # Enable training mode
    r=8,                        # Low-rank dimension
    lora_alpha=16,              # Scaling factor
    lora_dropout=0.1,           # Dropout rate
    target_modules=["q_proj", "k_proj"]  # Target specific layers in the attention mechanism
)

# Apply LoRA to the model
lora_model = get_peft_model(model, lora_config)

# Print trainable parameters
lora_model.print_trainable_parameters()


trainable params: 2,359,296 || all params: 1,241,829,376 || trainable%: 0.1900


In [16]:
import torch
import copy


def update_lgt(self, lgt):
    lgt = torch.tensor(lgt)
    feat_len = copy.deepcopy(lgt)  # Deep copy the input
    for ks in self.kernel_size:
        if ks[0] == 'P':
            feat_len = torch.div(feat_len, 2)
        else:
            feat_len -= int(ks[1]) - 1
    lgt = lgt.cpu().to(torch.int).tolist()
    return feat_len

# Example usage
class ExampleModel:
    def __init__(self):
        self.kernel_size = [('P', 2), ('C', 3)]

model = ExampleModel()

# Python list
lgt_list = [8, 16, 32]

# Convert to tensor
lgt_tensor = torch.tensor(lgt_list, dtype=torch.int32)

# Call the function
updated_lgt = update_lgt(model, lgt_tensor)

print(updated_lgt[1].item())  # Output will be a PyTorch tensor


6.0


  lgt = torch.tensor(lgt)


In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the Flan-T5 model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Example input sentence
input_text = "Translate the following English sentence into French: The weather is beautiful today."

# Tokenize the input text
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Retrieve embeddings
embed_tokens = model.get_input_embeddings()
embeddings = embed_tokens(input_ids)
input_embeds = model.encoder(input_ids)
# Generate output (translation)
output_ids = model.generate(input_ids)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"Input Text: {input_text}")
print(f"Output Text: {output_text}")
print(f"Embedding Shape: {embeddings.shape}")  # (batch_size, seq_length, hidden_size)
print(input_embeds)

Input Text: Translate the following English sentence into French: The weather is beautiful today.
Output Text: La météo est superbe aujourd'hui.
Embedding Shape: torch.Size([1, 16, 768])
BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.0636,  0.0402,  0.1111,  ..., -0.0441, -0.0149,  0.1803],
         [-0.0834,  0.0254,  0.1176,  ..., -0.0673, -0.0142,  0.1538],
         [-0.0116, -0.0046,  0.0072,  ...,  0.0003,  0.0025, -0.0064],
         ...,
         [ 0.0530, -0.0140, -0.0764,  ...,  0.0693,  0.0731,  0.1969],
         [ 0.0275,  0.0268, -0.0502,  ..., -0.1072, -0.0836,  0.0221],
         [ 0.0106,  0.0061,  0.0141,  ...,  0.0032, -0.0032,  0.0017]]],
       grad_fn=<MulBackward0>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)


In [10]:
embeddings.shape

torch.Size([1, 16, 768])

In [11]:
input_embeds.last_hidden_state.shape

torch.Size([1, 16, 768])

In [None]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [13]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"  # You can also use "facebook/m2m100_1.2B" for a larger model
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)




In [26]:
def translate(text, src_lang, tgt_lang):
    # Set the source language
    tokenizer.src_lang = src_lang
    # Tokenize the input text
    encoded = tokenizer(text, return_tensors="pt")
    # Generate translation
    print(f"encoded: {len(encoded.input_ids[0]) }")
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.get_lang_id(tgt_lang)
    )
    # Decode the generated tokens
    translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translation, encoded.input_ids


In [27]:
source_text = "The weather is beautiful today."
source_language = "en"
target_language = "fr"

translated_text, input_ids = translate(source_text, source_language, target_language)
print(f"Translated Text: {translated_text}")


encoded: 8
Translated Text: Le temps est beau aujourd’hui.


In [16]:
model

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       

In [28]:
# Access the token embedding layer
embed_tokens = model.get_input_embeddings()

# Get embeddings for the input IDs
embeddings = embed_tokens(input_ids)

print(f"Input Embeddings Shape: {embeddings.shape}") 

Input Embeddings Shape: torch.Size([1, 8, 1024])


In [29]:
input_embeds = model.model.encoder(input_ids)

In [30]:
input_embeds.last_hidden_state.shape

torch.Size([1, 8, 1024])

In [31]:
embeddings

tensor([[[ 0.6396,  2.0391,  4.0664,  ...,  2.8359, -1.1758,  1.7441],
         [-0.1472,  1.4834,  1.0049,  ...,  0.5913, -0.0179,  0.7139],
         [-3.8184,  2.4570,  0.9404,  ..., -2.0527, -0.1790, -0.2927],
         ...,
         [-0.4248, -0.7891,  1.0439,  ..., -0.5562, -0.7646, -1.5479],
         [-0.1066,  1.3613,  0.1825,  ...,  0.1327,  0.9346,  0.2668],
         [-1.2041, -0.9409, -1.0039,  ..., -0.9634, -1.1104, -0.9634]]],
       grad_fn=<MulBackward0>)

In [32]:
input_embeds

BaseModelOutput(last_hidden_state=tensor([[[-1.2980,  1.6914,  0.6505,  ...,  0.1016,  1.0246,  1.0725],
         [-0.6048, -0.0893, -0.7881,  ..., -0.4916, -0.0806,  0.7622],
         [-0.8598,  0.4656, -1.0828,  ..., -1.1058,  0.0023,  0.3622],
         ...,
         [ 0.1949, -0.4941,  0.9724,  ..., -0.3480,  0.6940, -0.2034],
         [ 0.1504,  1.1086,  1.0632,  ...,  0.4844, -0.9822,  0.4157],
         [ 0.0537,  0.0558, -0.0017,  ...,  0.0103,  0.0032, -0.0272]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [3]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load model and tokenizer
model_name = "m2m_1.2b"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Set the source language
tokenizer.src_lang = "en"

# Example input text
input_text = "The weather is beautiful today."

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")
print(f"Input IDs: {inputs.input_ids}")

# Access the embedding layer
embed_tokens = model.get_input_embeddings()

# Get input embeddings
input_embeddings = embed_tokens(inputs.input_ids)
print(f"Input Embeddings Shape: {input_embeddings.shape}")

# Pass the inputs through the encoder
encoder_outputs = model.get_encoder()(inputs.input_ids)

# Get the last hidden states from the encoder
encoder_hidden_states = encoder_outputs.last_hidden_state
print(f"Encoder Hidden States Shape: {encoder_hidden_states.shape}")






Input IDs: tensor([[128022,   1658, 124299,    117, 120341, 118264,      5,      2]])
Input Embeddings Shape: torch.Size([1, 8, 1024])
Encoder Hidden States Shape: torch.Size([1, 8, 1024])


In [2]:
print(inputs)

{'input_ids': tensor([[128022,   1658, 124299,    117, 120341, 118264,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [5]:
tokenizer.pad_token_id

1

In [6]:
from transformers import AutoTokenizer, MT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

# training
input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
logits = outputs.logits

# studies have shown that owning a dog is good for you.

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<extra_id_0>




In [13]:

# inference
input_ids = tokenizer(
    "How have you been?", return_tensors="pt"
).input_ids  # Batch size 1
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]
                       ))

<pad> <extra_id_0></s>


In [15]:
import pickle
import gzip 
def load_dataset_file(filename):
    with gzip.open(filename, "rb") as f:
        loaded_object = pickle.load(f)
        return loaded_object

# Print the loaded data
data = load_dataset_file("/Users/tannicholas/Downloads/labels.dev")

{'dev/11August_2010_Wednesday_tagesschau-2': {'name': 'dev/11August_2010_Wednesday_tagesschau-2', 'gloss': 'DRUCK TIEF KOMMEN', 'text': 'tiefer luftdruck bestimmt in den nächsten tagen unser wetter', 'length': 42, 'imgs_path': ['dev/11August_2010_Wednesday_tagesschau-2/images0001.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0002.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0003.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0004.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0005.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0006.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0007.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0008.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0009.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0010.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0011.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0012.png', 'dev/11August_2010_Wednesday_tagesschau-2/images0013.png', 'dev/

In [None]:
# Load model and tokenizer
model_name = "m2m_12b"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)