### Testing the Causal model


In [1]:
# Load model directly
# The tokenizer and the causal LM are both loaded from the identifier "gemma_instruct_7b"
# (presumably a local checkpoint directory - confirm the path for your environment).
from transformers import AutoTokenizer, AutoModelForCausalLM
# NOTE(review): shift_tokens_right is not referenced by any cell visible in this notebook.
from transformers.models.mbart.modeling_mbart import shift_tokens_right
tokenizer = AutoTokenizer.from_pretrained("gemma_instruct_7b")
model = AutoModelForCausalLM.from_pretrained("gemma_instruct_7b")

  from .autonotebook import tqdm as notebook_tqdm
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


In [5]:
# Sanity check: give the model a German instruction ("translate the given sentence into Chinese")
# and print the raw generation, special tokens included.
input_text = "Übersetzen Sie den gegebenen Satz ins Chinesische: Mutter isst gerne Brathähnchen."

# Despite its name, `input_ids` holds the whole tokenizer output (input_ids + attention_mask),
# which is why it can be unpacked with ** into generate() below.
input_ids = tokenizer(input_text, return_tensors="pt")
print(input_ids)
# NOTE(review): per the transformers generation docs, max_length bounds prompt + generated tokens
# together; max_new_tokens would bound only the generated part - confirm which is intended.
outputs = model.generate(**input_ids, max_length = 50 )
# skip_special_tokens=False keeps markers such as <bos>/<eos> visible in the decoded text.
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

{'input_ids': tensor([[     2,  33571,  41516,   3670,   1600, 172299,  78450,   2029, 212951,
           6765, 235292,  45310,    603,    490,  36236,   9070,    753,  42442,
           1748, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
<bos>Übersetzen Sie den gegebenen Satz ins Chinesische: Mutter isst gerne Brathähnchen.

Mutter isst gerne Brathähnchen.

→ 母亲喜欢吃烤鸡。<eos>


In [126]:
outputs

tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,    109,  50039,
         235465,  56506]])

In [3]:
input_text = "Translate the given sentence into Chinese: I like to eat fried chicken."
input_ids = tokenizer(input_text, return_tensors="pt")

print(input_ids)

{'input_ids': tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,    590,
           1154,    577,   7812,  30196,  12254, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [4]:
# Extract input embeddings using the model's embedding layer
# Relies on `model` and `input_ids` (the tokenizer output) defined by earlier cells.
import torch 
with torch.no_grad():
    # Get the input embeddings
    # Look up one embedding vector per token id; the printed shape is (batch, seq_len, hidden).
    input_embeds = model.get_input_embeddings()(input_ids["input_ids"])
    print(input_embeds.shape)
# Disabled experiment: feed the embeddings straight into the model instead of token ids.
#output = model(inputs_embeds = input_embeds)

torch.Size([1, 15, 2048])


In [5]:
output = model(input_ids = input_ids.input_ids, labels = input_ids.input_ids)

### Testing gemma masking for causal finetuning

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("gemma_instruct_2b")
model = AutoModelForCausalLM.from_pretrained(
    "gemma_instruct_2b"
)

In [81]:
tokenizer(" ")

{'input_ids': [2, 235248], 'attention_mask': [1, 1]}

In [163]:
# Probe how individual token ids decode. batch_decode is given a 1-D tensor here, so every id is
# decoded on its own and the result is one string per id (see the printed lists).
# In the recorded output id 1 decodes to '<eos>', and 30485 decodes to ' 你' - a token that
# already carries a leading space.
print(tokenizer.batch_decode(torch.tensor([1,  30485], dtype=torch.long)))
# 235248 decodes to a bare space token.
print(tokenizer.batch_decode(torch.tensor([1,  235248], dtype=torch.long)))
print(tokenizer.batch_decode(torch.tensor([1,  19891], dtype=torch.long)))
# The ids of the tokenized target sentence, decoded piece by piece.
print(tokenizer.batch_decode(torch.tensor([235248,  42130,  19891, 236280, 238069, 237619, 235362], dtype=torch.long)))



['<eos>', ' 你']
['<eos>', ' ']
['<eos>', '喜欢']
[' ', '妈妈', '喜欢', '吃', '炸', '鸡', '。']


lowest loss: 
concated then tokenized tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,   1646,
           2182,    577,   7812,  30196,  53190, 235265,  30485,  19891, 236280,
         238069, 237619, 235362]])
         
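Remember to absorb the space: the tokenizer can merge a leading space into the first target token (e.g. id 30485 decodes to ' 你'), so the target text has to carry that space itself.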

In [301]:


# Experiment: does tokenizing "prompt + answer" as one string give the same ids (and loss) as
# tokenizing prompt and answer separately and concatenating the ids?
# NOTE(review): this cell is repeated almost verbatim further down with other sentences;
# a helper taking (input_text, ans_text) would remove the duplication.
input_text = "Translate the given sentence into Chinese: Mother loves to eat fried chickens."
ans_text = " 妈妈喜欢吃炸鸡。"
combined_text = input_text +  ans_text
input_ids = tokenizer(input_text, return_tensors="pt")
combined_ids = tokenizer(combined_text, return_tensors="pt")
ans_ids = tokenizer(ans_text, return_tensors="pt")

# 19891 -> 194816 (NOTE(review): the meaning of this id mapping is not shown in the notebook)



# Labels start as a copy of the combined ids; the first len(prompt ids) positions are set to
# -100 so only the answer tokens contribute to the loss. This assumes the prompt tokenization
# is a prefix of the combined tokenization.
labels = combined_ids["input_ids"][0].clone()
labels[:len(input_ids.input_ids[0])] = -100
print(f"input_ids: {input_ids.input_ids.shape}")
print(f"combined_ids: {combined_ids.input_ids.shape}")
print(f"ans_text: {ans_ids.input_ids.shape}")
print(len(input_ids["input_ids"][0]))
print(len(combined_ids["input_ids"][0]))
print(labels)
print(f"len(labels): {len(labels)}")
print(f"len(combined_ids): {len(combined_ids['input_ids'][0])}")
print(f"len(ans_ids): {len(ans_ids['input_ids'][0])}")
# Separately tokenized variant: drop the answer's first token (the <bos> the tokenizer
# prepends) before appending it to the prompt ids.
concated_ids = torch.cat([input_ids["input_ids"], ans_ids["input_ids"][:,1:]], dim = 1)
print(f"concated_ids: {concated_ids.shape}")    
print(tokenizer.batch_decode(concated_ids , skip_special_tokens=False))
print(tokenizer.batch_decode(combined_ids["input_ids"], skip_special_tokens=False))
#print(model(input_ids = combined_ids.input_ids, labels = labels.unsqueeze(0))) 
print(concated_ids)
print(combined_ids["input_ids"])
# Same labels for both variants, so the two losses are directly comparable. The labels come
# from combined_ids, so this only lines up when both id sequences have the same length.
print(model(input_ids = concated_ids, labels = labels.unsqueeze(0)).loss)
print(model(input_ids = combined_ids['input_ids'], labels = labels.unsqueeze(0)).loss)

input_ids: torch.Size([1, 15])
combined_ids: torch.Size([1, 22])
ans_text: torch.Size([1, 8])
15
22
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
        236280, 238069, 237619, 235362])
len(labels): 22
len(combined_ids): 22
len(ans_ids): 8
concated_ids: torch.Size([1, 22])
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor(

In [333]:


# Same tokenize-then-concatenate vs. concatenate-then-tokenize comparison, on a second sentence.
# The variables set here (input_ids, ans_ids, combined_ids, labels) are reused by the
# embeddings cell that follows.
input_text = "Translate the given sentence into Chinese: I enjoy eating fish."
ans_text = " 我喜欢吃鱼。"
combined_text = input_text +  ans_text
input_ids = tokenizer(input_text, return_tensors="pt")
combined_ids = tokenizer(combined_text, return_tensors="pt")
ans_ids = tokenizer(ans_text, return_tensors="pt")

# 19891 -> 194816 (NOTE(review): the meaning of this id mapping is not shown in the notebook)



# Mask the prompt positions with -100 so the loss covers the answer tokens only
# (assumes the prompt ids are a prefix of the combined ids).
labels = combined_ids["input_ids"][0].clone()
labels[:len(input_ids.input_ids[0])] = -100
print(f"input_ids: {input_ids.input_ids.shape}")
print(f"combined_ids: {combined_ids.input_ids.shape}")
print(f"ans_text: {ans_ids.input_ids.shape}")
print(len(input_ids["input_ids"][0]))
print(len(combined_ids["input_ids"][0]))
print(labels)
print(f"len(labels): {len(labels)}")
print(f"len(combined_ids): {len(combined_ids['input_ids'][0])}")
print(f"len(ans_ids): {len(ans_ids['input_ids'][0])}")
# [:, 1:] removes the <bos> that the tokenizer put in front of the answer.
concated_ids = torch.cat([input_ids["input_ids"], ans_ids["input_ids"][:,1:]], dim = 1)
print(f"concated_ids: {concated_ids.shape}")    
print(tokenizer.batch_decode(concated_ids , skip_special_tokens=False))
print(tokenizer.batch_decode(combined_ids["input_ids"], skip_special_tokens=False))
#print(model(input_ids = combined_ids.input_ids, labels = labels.unsqueeze(0))) 
print(concated_ids)
print(combined_ids["input_ids"])
# Loss for the separately-tokenized ids, then for the jointly-tokenized ids, with shared labels.
print(model(input_ids = concated_ids, labels = labels.unsqueeze(0)).loss)
print(model(input_ids = combined_ids['input_ids'], labels = labels.unsqueeze(0)).loss)

input_ids: torch.Size([1, 13])
combined_ids: torch.Size([1, 18])
ans_text: torch.Size([1, 6])
13
18
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,  25736,  19891, 236280, 237098, 235362])
len(labels): 18
len(combined_ids): 18
len(ans_ids): 6
concated_ids: torch.Size([1, 18])
['<bos>Translate the given sentence into Chinese: I enjoy eating fish. 我喜欢吃鱼。']
['<bos>Translate the given sentence into Chinese: I enjoy eating fish. 我喜欢吃鱼。']
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,    590,
           4313,  12150,   5001, 235265,  25736,  19891, 236280, 237098, 235362]])
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,    590,
           4313,  12150,   5001, 235265,  25736,  19891, 236280, 237098, 235362]])
tensor(3.6193, grad_fn=<NllLossBackward0>)
tensor(3.6193, grad_fn=<NllLossBackward0>)


[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
         236280, 238069, 237619, 235362],

Note that tokenizing the concatenated sentence is not always the same as tokenizing the individual parts and then concatenating their token ids.

### Test Gemma with embeddings and see if the loss is still the same

In [334]:
# Check that feeding embeddings (inputs_embeds) reproduces the loss obtained from token ids.
# Uses input_ids / ans_ids / combined_ids / labels left in the kernel by the previous cell.
with torch.no_grad():
    # Get the input embeddings
    input_embeds = model.get_input_embeddings()(input_ids["input_ids"])
    print(input_embeds.shape)
    ans_embeds = model.get_input_embeddings()(ans_ids["input_ids"])
    print(ans_embeds.shape)
    # Embeddings of the jointly tokenized sentence (only its shape is printed below).
    combined_embeds = model.get_input_embeddings()(combined_ids["input_ids"])
    # Prompt embeddings followed by the answer embeddings without the answer's leading <bos>.
    combined_embed_v2 = torch.cat((input_embeds, ans_embeds[:,1:]), dim = 1)
    print(combined_embeds.shape)
    print(combined_embed_v2.shape)

# The forward pass runs outside the no_grad block, so the printed loss carries a grad_fn.
print(model(inputs_embeds = combined_embed_v2, labels = labels.unsqueeze(0)).loss)

torch.Size([1, 13, 2048])
torch.Size([1, 6, 2048])
torch.Size([1, 18, 2048])
torch.Size([1, 18, 2048])
tensor(3.6193, grad_fn=<NllLossBackward0>)


In [312]:


# Repeat of the tokenization comparison for the "Mother" sentence; the resulting combined_ids
# are what the later `test_embeds` cell embeds.
input_text = "Translate the given sentence into Chinese: Mother loves to eat fried chickens."
ans_text = " 妈妈喜欢吃炸鸡。"
combined_text = input_text +  ans_text
input_ids = tokenizer(input_text, return_tensors="pt")
combined_ids = tokenizer(combined_text, return_tensors="pt")
ans_ids = tokenizer(ans_text, return_tensors="pt")

# 19891 -> 194816 (NOTE(review): the meaning of this id mapping is not shown in the notebook)

# Id of the bare-space token (it appears right after the prompt in the printed ids).
# NOTE(review): this variable is not used anywhere in this cell.
space_token = 235248 

# Mask the prompt positions with -100 so the loss covers the answer tokens only
# (assumes the prompt ids are a prefix of the combined ids).
labels = combined_ids["input_ids"][0].clone()
labels[:len(input_ids.input_ids[0])] = -100
print(f"input_ids: {input_ids.input_ids.shape}")
print(f"combined_ids: {combined_ids.input_ids.shape}")
print(f"ans_text: {ans_ids.input_ids.shape}")
print(len(input_ids["input_ids"][0]))
print(len(combined_ids["input_ids"][0]))
print(labels)
print(f"len(labels): {len(labels)}")
print(f"len(combined_ids): {len(combined_ids['input_ids'][0])}")
print(f"len(ans_ids): {len(ans_ids['input_ids'][0])}")
# [:, 1:] removes the <bos> that the tokenizer put in front of the answer.
concated_ids = torch.cat([input_ids["input_ids"], ans_ids["input_ids"][:,1:]], dim = 1)
print(f"concated_ids: {concated_ids.shape}")    
print(tokenizer.batch_decode(concated_ids , skip_special_tokens=False))
print(tokenizer.batch_decode(combined_ids["input_ids"], skip_special_tokens=False))
#print(model(input_ids = combined_ids.input_ids, labels = labels.unsqueeze(0))) 
print(concated_ids)
print(combined_ids["input_ids"])
# Loss for the separately-tokenized ids, then for the jointly-tokenized ids, with shared labels.
print(model(input_ids = concated_ids, labels = labels.unsqueeze(0)).loss)
print(model(input_ids = combined_ids['input_ids'], labels = labels.unsqueeze(0)).loss)


input_ids: torch.Size([1, 15])
combined_ids: torch.Size([1, 22])
ans_text: torch.Size([1, 8])
15
22
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
        236280, 238069, 237619, 235362])
len(labels): 22
len(combined_ids): 22
len(ans_ids): 8
concated_ids: torch.Size([1, 22])
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
['<bos>Translate the given sentence into Chinese: Mother loves to eat fried chickens. 妈妈喜欢吃炸鸡。']
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor([[     2,  49688,    573,   2764,  13060,   1280,   8974, 235292,  17025,
          16147,    577,   7812,  30196,  53190, 235265, 235248,  42130,  19891,
         236280, 238069, 237619, 235362]])
tensor(

In [322]:
test_embeds = model.get_input_embeddings()(combined_ids['input_ids'])

In [209]:
print(labels.unsqueeze(0).shape)

torch.Size([1, 21])


Even though there is a difference in the loss values, let's just go with it for now (TODO: check whether this is still applicable).

My current stance is that we put the space at the start of the target sentence.

In [106]:
print(tokenizer.decode(outputs[0][len(input_ids.input_ids[0]):], skip_special_tokens=False))

这句话的意思是：我喜欢吃炸鸡。

This translation is correct. It accurately captures the meaning of the sentence "I like to eat fried chicken."<eos>


### Creating Causal inputs and Causal outputs 

In [None]:
def create_causal_inputs(visual_feats, visual_attn, tgt_ids):
    """Build padded ``inputs_embeds`` and ``labels`` for causal fine-tuning.

    For every sample the input sequence is laid out as

        [prompt embeddings | attended visual features | target embeddings without BOS]

    and the labels are -100 on the prompt and visual positions, so only the
    target tokens contribute to the loss.

    Relies on the notebook-level globals ``tokenizer`` and ``model``.

    Args:
        visual_feats: Per-sample feature sequences, indexed as ``visual_feats[i]``.
            The selected rows are concatenated with token embeddings along dim 0,
            so this is presumably (batch, vis_len, hidden) -- TODO confirm.
        visual_attn: Per-sample mask; only positions equal to 1 are kept.
        tgt_ids: Tokenizer output with ``"input_ids"`` and ``"attention_mask"``.
            The first attended token of every row is dropped (it is expected to
            be the BOS token added by the tokenizer).

    Returns:
        Tuple ``(new_input_embeds, new_labels)``:
        ``new_input_embeds`` is (batch, max_len, hidden), zero-padded on the right;
        ``new_labels`` is (batch, max_len), padded with -100. The batch dimension
        is kept for a batch of one so both tensors always line up.
    """
    embed_layer = model.get_input_embeddings()

    # No trailing space in the prompt: the segments that follow carry their own leading space.
    prompt = "Translate the given sentence into Chinese:"
    prompt_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    # Move the prompt ids to wherever the embedding weights live so the lookup
    # also works when the model is not on the CPU.
    prompt_token_ids = prompt_ids["input_ids"].to(embed_layer.weight.device)
    prompt_len = prompt_token_ids.shape[1]
    # Index the single batch row explicitly; a bare squeeze() would also
    # collapse the sequence axis if the prompt were ever a single token.
    prompt_embeds = embed_layer(prompt_token_ids)[0]

    tgt_token_ids = tgt_ids["input_ids"]
    tgt_attn = tgt_ids["attention_mask"]
    tgt_embeds = embed_layer(tgt_token_ids)

    new_input_embeds = []
    new_labels = []

    for i in range(tgt_embeds.shape[0]):  # iterate over the batch
        # Keep only the visual positions that are attended to.
        curr_vis_feats = visual_feats[i][visual_attn[i] == 1]
        curr_vis_len = curr_vis_feats.shape[0]

        # Keep only attended target positions, then drop the leading BOS token.
        tgt_keep = tgt_attn[i] == 1
        curr_tgt_feats = tgt_embeds[i][tgt_keep][1:]
        curr_tgt_tokens = tgt_token_ids[i][tgt_keep][1:]

        combined_embeds = torch.cat((prompt_embeds, curr_vis_feats, curr_tgt_feats), dim=0)

        # -100 over prompt + visual positions; created with the targets' dtype and
        # device so the concatenation below cannot fail on a device mismatch.
        ignore_prefix = torch.full(
            (prompt_len + curr_vis_len,),
            -100,
            dtype=curr_tgt_tokens.dtype,
            device=curr_tgt_tokens.device,
        )
        labels = torch.cat((ignore_prefix, curr_tgt_tokens), dim=0)

        # Labels and embeddings must describe the same positions.
        assert labels.shape[0] == len(combined_embeds), f"len labels: {labels.shape} vs len combined_embeds: {combined_embeds.shape}"

        new_input_embeds.append(combined_embeds)
        new_labels.append(labels)

    # Right-pad every sample to the longest sequence in the batch.
    new_input_embeds = torch.nn.utils.rnn.pad_sequence(new_input_embeds, batch_first=True, padding_value=0)
    new_labels = torch.nn.utils.rnn.pad_sequence(new_labels, batch_first=True, padding_value=-100)
    # Token id 0 is the padding id in the tokenizer outputs of this notebook;
    # make sure it is never used as a training target.
    new_labels[new_labels == 0] = -100
    return new_input_embeds, new_labels

# Manual checks for create_causal_inputs. Text embeddings of an English sentence stand in for
# the "visual" features, so the resulting loss can be compared with the token-id experiments.
'''Batch of 1 test case'''
input_text = " Mother loves to eat fried chickens."
input_ids = tokenizer(input_text, return_tensors="pt")
print(input_ids)

input_embeds = model.get_input_embeddings()(input_ids["input_ids"])[:, 1: ] # special case need to cut the extra bos token in front for the english sentence too
ans_text = " 妈妈喜欢吃炸鸡。"
tgt_ids = tokenizer(ans_text , return_tensors="pt", add_special_tokens=True)
# The attention mask is sliced with [:, 1:] as well so it stays aligned with the BOS-less embeddings.
new_input_embeds , new_labels=  create_causal_inputs(input_embeds,input_ids.attention_mask[:, 1:],  tgt_ids)

print(model(inputs_embeds= new_input_embeds, labels =new_labels).loss) 

'''Batch of 1 test case NUMBER 2'''
input_text = " I enjoy eating fish."
input_ids = tokenizer(input_text, return_tensors="pt")
print(input_ids)

input_embeds = model.get_input_embeddings()(input_ids["input_ids"])[:, 1: ] # special case need to cut the extra bos token in front for the english sentence too
ans_text = " 我喜欢吃鱼。"
tgt_ids = tokenizer(ans_text , return_tensors="pt", add_special_tokens=True)
new_input_embeds , new_labels=  create_causal_inputs(input_embeds, input_ids.attention_mask[:, 1:], tgt_ids)
print(model(inputs_embeds= new_input_embeds, labels =new_labels).loss) 
    
'''Batch of 2 test case'''
# Two sentences of different length, so both the inputs and the targets need padding.
input_text = [" Mother loves to eat fried chickens."," I enjoy eating fish."]
input_ids = tokenizer(input_text, return_tensors="pt", padding = True )
print("input ids", input_ids)
input_embeds = model.get_input_embeddings()(input_ids["input_ids"])[:, 1: ] # special case need to cut the extra bos token in front for the english sentence too
ans_text = [" 妈妈喜欢吃炸鸡。", " 我喜欢吃鱼。"]
# A second tokenizer instance is loaded so padding_side can be forced to "right" for the
# targets without changing the settings of the shared `tokenizer`.
tgt_tokenizer = AutoTokenizer.from_pretrained("gemma_instruct_2b")
tgt_tokenizer.padding_side = "right"
tgt_ids = tgt_tokenizer(ans_text , return_tensors="pt", add_special_tokens=True, padding = True)
print("tgt_ids ", tgt_ids)
new_input_embeds , new_labels =  create_causal_inputs(input_embeds, input_ids.attention_mask [:, 1:], tgt_ids)
print(new_input_embeds.shape)
print(new_labels)
# Last expression of the cell: the batch loss is shown as the cell output.
model(inputs_embeds= new_input_embeds, labels =new_labels).loss

{'input_ids': tensor([[     2,  17025,  16147,    577,   7812,  30196,  53190, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor(1.4907, grad_fn=<NllLossBackward0>)
{'input_ids': tensor([[     2,    590,   4313,  12150,   5001, 235265]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
tensor(3.6193, grad_fn=<NllLossBackward0>)
input ids {'input_ids': tensor([[     2,  17025,  16147,    577,   7812,  30196,  53190, 235265],
        [     2,    590,   4313,  12150,   5001, 235265,      0,      0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]])}
tgt_ids  {'input_ids': tensor([[     2, 235248,  42130,  19891, 236280, 238069, 237619, 235362],
        [     2,  25736,  19891, 236280, 237098, 235362,      0,      0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]])}
torch.Size([2, 22, 2048])
tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,  

tensor(2.3777, grad_fn=<NllLossBackward0>)

In [323]:
print(test_embeds.shape)
test_embeds


torch.Size([1, 22, 2048])


tensor([[[ 0.1035,  0.0052, -0.0330,  ..., -0.0170, -0.0087, -0.0099],
         [ 0.2734, -0.0055,  0.0292,  ..., -0.0077,  0.0947,  0.0315],
         [ 0.2373, -0.0249, -0.0918,  ..., -0.0045,  0.0388,  0.0212],
         ...,
         [ 0.2490, -0.0132, -0.0410,  ..., -0.0186,  0.0549,  0.0041],
         [ 0.2344, -0.0138,  0.0146,  ..., -0.0047,  0.0442,  0.0232],
         [ 0.1895, -0.0265, -0.0408,  ..., -0.0092, -0.0200, -0.0006]]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
print(new_input_embeds.shape)
new_input_embeds

torch.Size([22, 2048])


[tensor([[ 0.1035,  0.0052, -0.0330,  ..., -0.0170, -0.0087, -0.0099],
         [ 0.2734, -0.0055,  0.0292,  ..., -0.0077,  0.0947,  0.0315],
         [ 0.2373, -0.0249, -0.0918,  ..., -0.0045,  0.0388,  0.0212],
         ...,
         [ 0.2197,  0.0217, -0.1196,  ...,  0.0535, -0.0075, -0.0242],
         [ 0.2197,  0.0217, -0.1196,  ...,  0.0535, -0.0075, -0.0242],
         [ 0.2197,  0.0217, -0.1196,  ...,  0.0535, -0.0075, -0.0242]],
        grad_fn=<CatBackward0>)]

In [316]:
new_labels

tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100, 235248,  42130,  19891,
        236280, 238069, 237619, 235362])

In [262]:
new_attention_mask

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
         1., 0., 0.]])

In [264]:
new_labels

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,  25736,  19891, 236280,
         238069, 237619, 235362],
        [  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,  25736, 113066, 107590, 235362,
           -100,   -100,   -100]])

In [259]:
print(new_input_embeds.shape)

torch.Size([2, 21, 2048])


In [210]:
new_attention_mask

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
         1., 1., 1., 0.]])

### Testing out M2M seq2seq training 

In [52]:
# Smoke test of the M2M100 seq2seq model: translate one Chinese sentence into English.
# NOTE(review): this cell rebinds `model` and `tokenizer`, which earlier cells use for Gemma.
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# hi_text is only used by the commented-out Hindi example below.
hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
chinese_text = "生活就像一盒巧克力。"

model = M2M100ForConditionalGeneration.from_pretrained("m2m_1.2b")
tokenizer = M2M100Tokenizer.from_pretrained("m2m_1.2b")

# Disabled example kept for reference:
# # translate Hindi to French
# tokenizer.src_lang = "hi"
# encoded_hi = tokenizer(hi_text, return_tensors="pt")
# generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr"))
# tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# # => "La vie est comme une boîte de chocolat."

# translate Chinese to English
# The source language is set on the tokenizer; the target language is selected by forcing
# the first generated token to the "en" language id.
tokenizer.src_lang = "zh"
encoded_zh = tokenizer(chinese_text, return_tensors="pt")
generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Life is like a box of chocolate."




['Life is like a box of chocolate.']

In [338]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load Gemma tokenizer and model
model_name = "gemma_instruct_2b"  # Replace with your specific Gemma model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [345]:
# Wrap the currently loaded causal LM (`model`, set by an earlier cell) with LoRA adapters
# and report how many parameters remain trainable.
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",   # Type of task
    inference_mode=False,    # Enable training
    r=8,                     # Low-rank dimension
    lora_alpha=16,           # Scaling factor
    lora_dropout=0.1,        # Dropout rate for LoRA
    target_modules=["q_proj", "v_proj"]  # Target modules (Gemma specific)
)

# Apply LoRA to the model
# NOTE(review): the module dump recorded further down shows rank-4 adapters and comes from a
# cell with a lower execution count, so it presumably predates this r=8 config - re-run to confirm.
lora_model = get_peft_model(model, lora_config)

# Check trainable parameters
lora_model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 1,243,009,024 || trainable%: 0.2847


In [341]:
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pr

In [344]:
# Same LoRA experiment for the M2M100 seq2seq model.
# NOTE(review): rebinds `model`, `tokenizer`, `model_name`, `lora_config` and `lora_model`,
# replacing the Gemma objects created by earlier cells.
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load M2M100 model and tokenizer
model_name = "m2m_1.2b"  # Or "facebook/m2m100_1.2B"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

from peft import LoraConfig, get_peft_model

# Define LoRA configuration
# Unlike the Gemma config (q_proj + v_proj), this one adapts the q_proj and k_proj modules.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",   # Task type for sequence-to-sequence models
    inference_mode=False,       # Enable training mode
    r=8,                        # Low-rank dimension
    lora_alpha=16,              # Scaling factor
    lora_dropout=0.1,           # Dropout rate
    target_modules=["q_proj", "k_proj"]  # Target specific layers in the attention mechanism
)

# Apply LoRA to the model
lora_model = get_peft_model(model, lora_config)

# Print trainable parameters
lora_model.print_trainable_parameters()


trainable params: 2,359,296 || all params: 1,241,829,376 || trainable%: 0.1900


In [16]:
import torch
import copy


def update_lgt(self, lgt):
    """Map input sequence lengths to the lengths left after the layers in ``self.kernel_size``.

    Args:
        self: Object exposing ``kernel_size``, an iterable of entries ``ks`` where
            ``ks[0]`` is the layer kind and ``ks[1]`` its kernel size.
        lgt: Sequence lengths; a tensor or anything ``torch.as_tensor`` accepts
            (e.g. a list of ints). The argument itself is never modified.

    Returns:
        A new tensor of updated lengths. Entries of kind ``'P'`` halve the length
        using true division (so the result becomes floating point); every other
        entry subtracts ``int(ks[1]) - 1``.
    """
    # detach().clone() yields an independent copy for tensors and non-tensors alike,
    # without the UserWarning that torch.tensor(<tensor>) raises.
    feat_len = torch.as_tensor(lgt).detach().clone()
    for ks in self.kernel_size:
        if ks[0] == 'P':
            # NOTE(review): true division gives fractional lengths for odd inputs;
            # switch to rounding_mode="floor" if integer lengths are intended.
            feat_len = torch.div(feat_len, 2)
        else:
            # Safe to update in place: feat_len is our own copy.
            feat_len -= int(ks[1]) - 1
    return feat_len

# Example usage
class ExampleModel:
    """Minimal stand-in that only carries the ``kernel_size`` attribute read by ``update_lgt``."""

    def __init__(self):
        # (kind, size) pairs: update_lgt halves the length for 'P' entries and
        # subtracts (size - 1) for every other kind.
        layer_specs = [
            ('P', 2),
            ('C', 3),
        ]
        self.kernel_size = layer_specs

# NOTE(review): this rebinds `model`, which earlier cells use for the loaded transformer;
# cells run after this one will see the ExampleModel instance instead.
model = ExampleModel()

# Python list
lgt_list = [8, 16, 32]

# Convert to tensor
lgt_tensor = torch.tensor(lgt_list, dtype=torch.int32)

# Call the function
# update_lgt is a plain function, so the object providing kernel_size is passed explicitly as `self`.
updated_lgt = update_lgt(model, lgt_tensor)

print(updated_lgt[1].item())  # .item() turns the tensor element into a Python number; the recorded output is 6.0


6.0


  lgt = torch.tensor(lgt)
