# 1. Use a pre-trained google/flan-t5-small as the model.

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from transformers import T5ForConditionalGeneration,T5Config
from transformers.models.t5.modeling_t5 import T5DenseGatedActDense,T5DenseActDense,T5LayerNorm
import torch.nn as nn
import torch



In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

In [2]:
pipe = pipeline("text2text-generation", model=model,tokenizer=tokenizer)


In [3]:
text_summ = "Summerize : Defending Cricket World Cup champions England's campaign in the 2023 edition went from bad to worse as they slumped to their fourth defeat in five matches. On Thursday, England went down by eight wickets against Sri Lanka in ODI Cricket World Cup match in Bengaluru. With the loss, England went down to the ninth place in the 10-team Cricket World Cup. England have 2 points from five matches (NRR -1.634). Their remaining matches are against India, Australia, Netherlands and Pakistan. With hosts India and five-time champions Australia in good form, England's chances to enter semi-finals are hanging by a thread."
text_qa = "question: How much points England have? context:Defending Cricket World Cup champions England's campaign in the 2023 edition went from bad to worse as they slumped to their fourth defeat in five matches. On Thursday, England went down by eight wickets against Sri Lanka in ODI Cricket World Cup match in Bengaluru. With the loss, England went down to the ninth place in the 10-team Cricket World Cup. England have 2 points from five matches (NRR -1.634). Their remaining matches are against India, Australia, Netherlands and Pakistan. With hosts India and five-time champions Australia in good form, England's chances to enter semi-finals are hanging by a thread."
text_translate ="Translate to French:  My name is Prabhat and I lives in Hyderabad"


# 2. Verify if the summariza'on task works.

In [10]:
pipe(text_summ)

[{'generated_text': "England's campaign in the 2023 edition went from bad to worse as they slumped"}]

## 3. Verify if the Q&A task works.

In [11]:
pipe(text_qa)

[{'generated_text': '2'}]

## 4. Verify if English to French transla'on task works.

In [12]:
pipe(text_translate)

[{'generated_text': "M'ai nom est Prabhat et je vive en Hyderabad"}]

In [6]:

#check_point = "google/flan-t5-base"
#model = T5ForConditionalGeneration.from_pretrained(check_point)
#model_config = model.config.to_dict()

#model_config["d_model"] = 128
#config = T5Config(**model_config)

#model = T5ForConditionalGeneration.from_pretrained(model, config = config)


## 5. Programma'cally print the names of all the model layers and their dimensions.

In [13]:
for i,j in model.named_parameters():
    print(f"Name of layer : {i},'---->' Size of the layer {j.shape}")

Name of layer : shared.weight,'---->' Size of the layer torch.Size([32128, 512])
Name of layer : encoder.block.0.layer.0.SelfAttention.q.weight,'---->' Size of the layer torch.Size([384, 512])
Name of layer : encoder.block.0.layer.0.SelfAttention.k.weight,'---->' Size of the layer torch.Size([384, 512])
Name of layer : encoder.block.0.layer.0.SelfAttention.v.weight,'---->' Size of the layer torch.Size([384, 512])
Name of layer : encoder.block.0.layer.0.SelfAttention.o.weight,'---->' Size of the layer torch.Size([512, 384])
Name of layer : encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight,'---->' Size of the layer torch.Size([32, 6])
Name of layer : encoder.block.0.layer.0.layer_norm.weight,'---->' Size of the layer torch.Size([512])
Name of layer : encoder.block.0.layer.1.DenseReluDense.wi_0.weight,'---->' Size of the layer torch.Size([1024, 512])
Name of layer : encoder.block.0.layer.1.DenseReluDense.wi_1.weight,'---->' Size of the layer torch.Size([1024, 512])
Name

## 6. Programma'cally print the total number of parameters/weights in this model.

In [15]:
model.num_parameters()

76961152

## 7. Set the tensor in final layer (decoder.final_layer_norm.weight) to all zeros.

In [17]:
model.decoder.final_layer_norm.weight.data = nn.parameter.Parameter(torch.zeros_like(model.decoder.final_layer_norm.weight))

## 8. Verify if the Q&A task works aWer reseXng the weights of the above layer.

In [19]:
pipe = pipeline("text2text-generation", model=model,tokenizer=tokenizer)
text_qa1 = "question: What is the capital city of India? context: Capital city of India is New Delhi"
pipe(text_qa1)

[{'generated_text': ''}]

## 9. Replace the decoder.final_layer_norm.weight with a layer of smaller dimensions and adjust all the dependent layers to match the dimension

In [20]:
class T5LayerFF_128(nn.Module):
    def __init__(self, config: T5Config):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = T5DenseGatedActDense(config)
        else:
            self.DenseReluDense = T5DenseActDense(config)

        self.layer_norm = T5LayerNorm(128, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        
        hidden_states=hidden_states[:,:,:128]
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states

In [21]:
from transformers.models.t5.modeling_t5 import T5LayerNorm,T5LayerFF


model.decoder.final_layer_norm = T5LayerNorm(hidden_size=128)

## Changed the final_layer_norm to lower dimension of 128 from 512
model.decoder.final_layer_norm.weight.data = model.decoder.final_layer_norm.weight.data[0:128]

# Below are the adjustment of all other layers
model.decoder.block[7].layer[2].layer_norm = T5LayerNorm(hidden_size=128)
model.decoder.block[7].layer[2].layer_norm.weight.data = model.decoder.block[7].layer[2].layer_norm.weight.data[0:128]


model.lm_head.in_features = 128
model.lm_head.in_features = 1024
model.lm_head.weight.data = model.lm_head.weight.data[:1024,:128]
model.decoder.block[7].layer[2] = T5LayerFF_128(model.config)

model.decoder.block[7].layer[2].DenseReluDense.wo.out_features =512
model.decoder.block[7].layer[2].DenseReluDense.wo.in_features =128

model.decoder.block[7].layer[2].DenseReluDense.wo.weight.data = model.decoder.block[7].layer[2].DenseReluDense.wo.weight.data[:128,:]

model.decoder.block[7].layer[2].DenseReluDense.wi_0.out_features =512
model.decoder.block[7].layer[2].DenseReluDense.wi_0.weight.data = model.decoder.block[7].layer[2].DenseReluDense.wi_0.weight.data[:,:128]

model.decoder.block[7].layer[2].DenseReluDense.wi_1.out_features =512
model.decoder.block[7].layer[2].DenseReluDense.wi_1.weight.data = model.decoder.block[7].layer[2].DenseReluDense.wi_1.weight.data[:,:128]


In [22]:
from transformers import pipeline
pipe = pipeline("text2text-generation", model=model,tokenizer=tokenizer)
pipe(text_qa)

[{'generated_text': 'NR 303'}]

In [23]:
pipe(text_qa1)

[{'generated_text': 'New Sieling Stateling community office office office office office office office office office office office office office'}]

In [24]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 