In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install accelerate -U
!pip install evaluate
!pip install bert-score
!pip install spacy

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
Col

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from bert_score import score
import evaluate
import numpy as np
import torch
from torch import nn
import copy

from transformers import (
    pipeline,
    T5Model,
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    default_data_collator,
    Seq2SeqTrainingArguments,
    GenerationConfig,
    AdamW,
    get_scheduler
)

from transformers.models.t5.modeling_t5 import (
    T5Stack,
    T5Block,
    T5LayerNorm,
    T5Config
)

from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [33]:
# load the model
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

## Utility Functions

In [6]:
def calculate_semantic_similarity(text_1, text_2):
  # Load SpaCy model
  try:
      nlp = spacy.load("en_core_web_md")
  except OSError:
      print("Downloading spaCy model...")
      from spacy.cli.download import download
      download("en_core_web_md")
      nlp = spacy.load("en_core_web_md")

  # Example original text and generated summary
  text_1_v = nlp(text_1)
  text_2_v = nlp(text_2)

  # Calculate semantic similarity
  similarity_score = text_1_v.similarity(text_2_v)
  print(f"Spacy based Sementic Similarity: {similarity_score}")
  return ''


In [7]:
def calculate_bert_scores(text_1, text_2):
  # Specify the language or model type
  lang = "en"  # Replace with the appropriate language code if needed

  # Calculate BERTScore
  P, R, F1 = score([text_1], [text_2], lang=lang)
  print(f"F1-Score: {F1.mean().item()}")
  print(f"Precesion: {P.mean().item()}")
  print(f"Recall: {R.mean().item()}")
  return ''


In [12]:
def bert_score_for_en_to_fr(fr_text_pred, fr_text_actual):
  # Compute BERT Score
  P, R, bert_score = score([fr_text_pred], [fr_text_actual], lang="en-fr")

  # Print BERT Score
  print("BERT Scores:")
  print("F1-Score:", bert_score.mean().item())
  print("Precision:", P.mean().item())
  print("Recall:", R.mean().item())
  return ''


# Summarization

In [8]:
def summarization(dataset, model):

    prompt = 'Summarize the following:\n[PASSAGE]\n.'
    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]
    # print(example.keys())

    context, summary = example['document'], example['summary']

    prompt = prompt.replace("[PASSAGE]", context)
    input_ids = tokenizer(prompt, max_length=1024, truncation=True, return_tensors="pt").input_ids.to(device)

    generation_config = GenerationConfig(early_stopping=False, length_penalty= 0.0, early_stop=False, max_length=150)

    outputs = model.generate(input_ids, generation_config=generation_config)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    print("Input:\n", prompt)
    print("Predicted Summary:", result)
    print("Reference Summary:", summary)
    print("\n")
    print(calculate_semantic_similarity(result[0], summary))
    print("Bert Scores:")
    print(calculate_bert_scores(result[0], summary))

dataset = load_dataset("multi_news", "3.0.0", split='validation')
summarization(dataset, model)

Downloading builder script:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.31M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]



Input:
 Summarize the following:
FBI personnel walk through the complex surrounding the apartment, where Ibragim Todashev was shot and killed by FBI, in Orlando, Florida, May 22, 2013. 
 
 GROZNY, Russia (Reuters) - The father of a Chechen immigrant killed during questioning over his links with one of the Boston Marathon bombings suspects said on Thursday he plans to travel to the United States where he thinks his son was tortured and killed. 
 
 Ibragim Todashev, 27, was killed by a federal agent in his apartment complex when he became violent during questioning over his ties to Tamerlan Tsarnaev, the older of two brothers suspected of planting two bombs at the marathon on April 15. 
 
 "I suspect that they tortured my son and that he suffered a painful death," said Abdulbaki Todashev, wiping away tears at the home he shares with one of his wives in the mostly Muslim region of Chechnya in Russia's North Caucasus. 
 
 "I will try to go to (the United States) and get to the truth," he s

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1-Score: 0.8626949191093445
Precesion: 0.8748985528945923
Recall: 0.8508270978927612



In [9]:
def summarization_with_pipeline(dataset):

    summarization_pipeline = pipeline("summarization", model="google/flan-t5-small")

    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]
    # print(example.keys())

    context, summary = example['document'], example['summary']

    predicted_summary = summarization_pipeline(context, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, no_repeat_ngram_size=2)
    result = predicted_summary[0]['summary_text']

    print("Predicted Summary:", result)
    print("Reference Summary:", summary)
    print("\n")
    print(calculate_semantic_similarity(result, summary))
    print("Bert Scores:")
    print(calculate_bert_scores(result, summary))

dataset = load_dataset("multi_news", "3.0.0", split='validation')
summarization_with_pipeline(dataset)

Token indices sequence length is longer than the specified maximum sequence length for this model (2384 > 512). Running this sequence through the model will result in indexing errors


Predicted Summary: – More than 10 percent of US children have been diagnosed with attention deficit hyperactivity disorder (ADHD), reflecting a surge in recent years particularly among girls and minority groups, Reuters reports. The percentage of children who have ADHD, however, is not thought to have changed, it's simply that more are being diagnosed. While the findings, published today in the Journal of Clinical Psychiatry, may be received as more unnecessary medicalization of childhood, experts welcomed it as good news. "Part of the problem with ADHD is that there’s [a lot of misinformation], so some people would look at that and say we’re just diagnosing too many kids," said Dr. Mark Wolra
Reference Summary: – More than one in 10 US children between the ages of 5 and 17 have been diagnosed with ADHD, according to a study published Tuesday in the Journal of Clinical Psychology. That means about 5.8 million kids have the most commonly diagnosed mental disorder for US children. The st

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1-Score: 0.8646886348724365
Precesion: 0.8805652856826782
Recall: 0.8493743538856506



# Q&A

In [10]:
def q_n_a(dataset, model):

    prompt = 'Given the following:\n[PASSAGE].\nAnswer the following:\n[QUESTION].\n'
    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]

    context, question, actual_answer = example['context'],  example['question'], example["answers"]

    prompt = prompt.replace("[PASSAGE]", context)
    prompt = prompt.replace("[QUESTION]", question)
    input_ids = tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt").input_ids.to(device)

    outputs = model.generate(input_ids)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    print("Input:\n", prompt)
    print("Predicted Answer:", result)
    print("Reference Answers:", actual_answer)
    print("\n")
    print(calculate_semantic_similarity(result[0], actual_answer['text'][0]))
    print("Bert Scores:")
    print(calculate_bert_scores(result[0], actual_answer['text'][0]))

dataset_sqaud = load_dataset("squad", split='validation')
q_n_a(dataset_sqaud, model)

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]



Input:
 Given the following:
Luther's rediscovery of "Christ and His salvation" was the first of two points that became the foundation for the Reformation. His railing against the sale of indulgences was based on it..
Answer the following:
What became the foundation of the Reformation?.

Predicted Answer: ['Luther\'s rediscovery of "Christ and His salvation"']
Reference Answers: {'text': ['Christ and His salvation', 'Christ and His salvation', 'Christ and His salvation', 'Christ and His salvation'], 'answer_start': [25, 25, 25, 25]}


Spacy based Sementic Similarity: 0.7530032053440274

Bert Scores:


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1-Score: 0.8997851610183716
Precesion: 0.8820759057998657
Recall: 0.918220043182373



In [11]:
def q_n_a_with_pipeline(dataset):

    qa_pipeline = pipeline("question-answering", model="google/flan-t5-small")

    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]

    context, question, actual_answer = example['context'],  example['question'], example["answers"]

    answer = qa_pipeline(question=question, context=context)
    result = answer["answer"]

    print("Predicted Answer:", result)
    print("Reference Answers:", actual_answer)
    print("Confidence:", answer["score"])
    print("\n")
    print(calculate_semantic_similarity(result, actual_answer['text'][0]))
    print("Bert Scores:")
    print(calculate_bert_scores(result, actual_answer['text'][0]))

dataset_sqaud = load_dataset("squad", split='validation')
q_n_a_with_pipeline(dataset_sqaud)

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Answer:  of the first clues that
Reference Answers: {'text': ['become lighter', 'appear to become lighter', 'appear to become lighter', 'appear to become lighter and seem to lose something in the process', 'lighter'], 'answer_start': [441, 431, 431, 431, 448]}
Confidence: 7.851204281905666e-05


Spacy based Sementic Similarity: 0.4562017562515045

Bert Scores:


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1-Score: 0.8258848190307617
Precesion: 0.8055033683776855
Recall: 0.8473243713378906



# English to French Translation

In [45]:
def english_to_french_translation(dataset, model):

    prompt = 'Translate this sentence from English to French :\n[SENTENCE]\n.'
    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]

    english, french = example['en'], example['fr']

    prompt = prompt.replace("[SENTENCE]", english)
    input_ids = tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt").input_ids.to(device)

    outputs = model.generate(input_ids)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    print("Input:\n", prompt)
    print("Predicted Translation:", result)
    print("Reference Translation:", french)
    print("\n")
    print(bert_score_for_en_to_fr(result[0], french))

translation_dataset = load_dataset("enimai/MuST-C-fr", split='validation')
english_to_french_translation(translation_dataset, model)

Input:
 Translate this sentence from English to French :
In order for us to understand each other, I want to start by showing you a rudimentary, very simple magic trick.
.
Predicted Translation: ["Enfin, à l'esprit, nous en soyez en mesure"]
Reference Translation: Pour qu'on se comprenne, je vais vous montrer quelque chose de rudimentaire, un tour de magie très simple.


BERT Scores:
F1-Score: 0.6583520770072937
Precision: 0.6821809411048889
Recall: 0.636131763458252



In [14]:
def english_to_french_translation_with_pipeline(dataset):

    translation_pipeline = pipeline("translation_en_to_fr", model="google/flan-t5-small")

    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]

    english, french = example['en'], example['fr']

    french_translation = translation_pipeline(english)
    result = french_translation[0]["translation_text"]

    print("Predicted Translation:", result)
    print("Reference Translation:", french)
    print("\n")
    print(bert_score_for_en_to_fr(result[0], french))

translation_dataset = load_dataset("enimai/MuST-C-fr", split='validation')
english_to_french_translation_with_pipeline(translation_dataset)

Predicted Translation: Nous utilisons à l'ouverture du film.
Reference Translation: On l'utilisera pour l'ouverture du film.


BERT Scores:
F1-Score: 0.6093519926071167
Precision: 0.6383781433105469
Recall: 0.5828505754470825



# Names of all the model layers and their dimensions.

In [34]:
for name, param in model.named_parameters():
    if 'weight' in name:  # Print only for weight parameters
        print(f"Layer: {name} - Dimensions: {param.size()}")

Layer: shared.weight - Dimensions: torch.Size([32128, 512])
Layer: encoder.block.0.layer.0.SelfAttention.q.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.k.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.v.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.o.weight - Dimensions: torch.Size([512, 384])
Layer: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight - Dimensions: torch.Size([32, 6])
Layer: encoder.block.0.layer.0.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_0.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_1.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wo.weight - Dimensions: torch.Size([512, 1024])
Layer: encoder.block.0.layer.1.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.blo

# Total number of parameters/weights in this model.

In [35]:
# Get the total number of parameters
total_params = sum(p.numel() for p in model.parameters())

print(f"Total Number of Parameters: {total_params}")


Total Number of Parameters: 76961152


# Final layer (decoder.final_layer_norm.weight) to all zeros and verification of Q&A task.

In [17]:
# Set the tensor in the final layer to all zeros
model.decoder.final_layer_norm.weight.data.fill_(0)

q_n_a(dataset_sqaud, model)

Input:
 Given the following:
Hypersensitivity is an immune response that damages the body's own tissues. They are divided into four classes (Type I – IV) based on the mechanisms involved and the time course of the hypersensitive reaction. Type I hypersensitivity is an immediate or anaphylactic reaction, often associated with allergy. Symptoms can range from mild discomfort to death. Type I hypersensitivity is mediated by IgE, which triggers degranulation of mast cells and basophils when cross-linked by antigen. Type II hypersensitivity occurs when antibodies bind to antigens on the patient's own cells, marking them for destruction. This is also called antibody-dependent (or cytotoxic) hypersensitivity, and is mediated by IgG and IgM antibodies. Immune complexes (aggregations of antigens, complement proteins, and IgG and IgM antibodies) deposited in various tissues trigger Type III hypersensitivity reactions. Type IV hypersensitivity (also known as cell-mediated or delayed type hypersen

  similarity_score = original_text_v.similarity(generated_summary_v)


Spacy based Sementic Similarity: 0.0

Bert Scores:


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1-Score: 0.0
Precesion: 0.0
Recall: 0.0





# Custom layer
Replacing the decoder.final_layer_norm.weight with a layer of smaller dimensions (128) and adjust all the dependent layers to match the dimension.

The Cell below replaces the final layer norm and dependent layers of dimension 512 with a dimension of size 128.

The class CustomLayerNorm adds a linear layer to map the input from 512 to 128 dimension. It then adds a LayerNorm of size 128.

The class CustomDecoder extends T5Stack and adds the CustomLayerNorm at the top. As the output from the CustomLayerNorm is of size 128, it adds another linear to map the input from 128 to the vocab_size of the model.

The class CustomT5 extends T5ForConditionalGeneration and combines the encoder with the CustomDecoder.

The class CustomDecoder only overrides the final layer norm and classCustomT5 only overrides the decoder. The rest of the functions of google/flan-t5-small remains same.

In [39]:
class CustomLayerNorm(nn.Module):
    def __init__(self, hidden_size, input_dim=512, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.projection = nn.Linear(input_dim, hidden_size)
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        hidden_states = self.projection(hidden_states)

        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states

class CustomDecoder(T5Stack):
    def __init__(self, config, embed_tokens=None, old_dim=512, new_dim=128, vocab_size=31522):
        super().__init__(config, embed_tokens=embed_tokens)

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder
        self.new_dim = new_dim

        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
        )
        self.final_layer_norm = CustomLayerNorm(new_dim, input_dim=config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        self.post_init()
        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False
        # self.output_layer = nn.Linear(new_dim, vocab_size)

class CustomT5(T5ForConditionalGeneration):
    def __init__(self, config: T5Config):
        super().__init__(config)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = CustomDecoder(decoder_config, self.shared)

        if hasattr(self.decoder, 'new_dim'):
            new_dim = self.decoder.new_dim
        else:
            new_dim = decoder_config.d_model

        self.lm_head = nn.Linear(new_dim, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

In [40]:
custom_model = CustomT5(model.config)
for name, param in custom_model.named_parameters():
    if 'weight' in name:  # Print only for weight parameters
        print(f"Layer: {name} - Dimensions: {param.size()}")

Layer: shared.weight - Dimensions: torch.Size([32128, 512])
Layer: encoder.block.0.layer.0.SelfAttention.q.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.k.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.v.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.o.weight - Dimensions: torch.Size([512, 384])
Layer: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight - Dimensions: torch.Size([32, 6])
Layer: encoder.block.0.layer.0.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_0.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_1.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wo.weight - Dimensions: torch.Size([512, 1024])
Layer: encoder.block.0.layer.1.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.blo

In [41]:
custom_model.to(device)
q_n_a(dataset_sqaud, custom_model)

Input:
 Given the following:
After World War II, under a Communist regime set up by the conquering Soviets, the "Bricks for Warsaw" campaign was initiated, and large prefabricated housing projects were erected in Warsaw to address the housing shortage, along with other typical buildings of an Eastern Bloc city, such as the Palace of Culture and Science, a gift from the Soviet Union. The city resumed its role as the capital of Poland and the country's centre of political and economic life. Many of the historic streets, buildings, and churches were restored to their original form. In 1980, Warsaw's historic Old Town was inscribed onto UNESCO's World Heritage list..
Answer the following:
What building was a gift from the Soviet Union?.

Predicted Answer: ['Anzeigeatteinte201 Candidates30 détail appliance Package einer Forexsolution/19whoplaîtfolgen Videodescending ieși fee']
Reference Answers: {'text': ['Palace of Culture and Science', 'Palace of Culture and Science', 'Palace of Culture a

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1-Score: 0.7442085146903992
Precesion: 0.725654125213623
Recall: 0.7637367248535156



# Finetuning For SQUAD.

In [18]:
dataset = load_dataset("squad")
print("Context: ", dataset["train"][0]["context"])
print("Question: ", dataset["train"][0]["question"])
print("Answer: ", dataset["train"][0]["answers"])


Context:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer:  {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [19]:
def preprocess_function(examples, padding="max_length", max_input_length=512, max_target_length=32):

    answers = [example['text'][0] for example in examples["answers"]]
    questions = [q.strip() for q in examples["question"]]
    contexts = [context.strip() for context in examples["context"]]

    assert len(questions) == len(contexts)
    prompts = [f'SQUAD Reading Comprehension Task\n\nGiven the following passage:\n{context}\nAnswer the question:\n{question}\n'
               for context, question in list(zip(contexts, questions))]

    inputs = tokenizer(
        prompts,
        max_length=max_input_length,
        truncation=True,
        padding=True,
    )
    labels = tokenizer(answers, max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    inputs["labels"] = labels["input_ids"]
    return inputs

In [20]:
# Metric
metric = evaluate.load("squad")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(preds, labels, valiation_set):
    final_preds = []

    for pred_list in preds:
        cur_pred = []
        for token in pred_list:
            if token == tokenizer.eos_token_id:
                break
            cur_pred.append(token)
        final_preds.append(cur_pred)
    decoded_preds = tokenizer.batch_decode(final_preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    for label_list in labels:
        for i in range(len(label_list)):
            if label_list[i] == -100:
                label_list[i] = tokenizer.pad_token_id
    # labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    assert len(decoded_preds) == len(valiation_set)
    references = []
    final_preds = []

    for i, pred in enumerate(decoded_preds):
        final_preds.append({
            "id": validation_set[i]['id'],
            "prediction_text": pred
        })
        references.append({
            "id": validation_set[i]['id'],
            "answers": validation_set[i]['answers']
        })

    result = metric.compute(predictions=final_preds, references=references)
    print(result)
    return result

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [21]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [22]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [23]:
validation_set = [example for example in tokenized_dataset["validation"]]

In [24]:
tokenized_dataset['train'] = tokenized_dataset['train'].remove_columns(["title","id", "context", "question", "answers"])
tokenized_dataset["validation"] = tokenized_dataset["validation"].remove_columns(["title", "id", "context", "question", "answers"])

In [25]:
train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"], batch_size=8, collate_fn=data_collator
)

In [30]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

10950


In [27]:
def evaluate_model():
    model.eval()
    predictions = []
    labels = []
    print("Running Evaluation")
    for batch in tqdm(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        val_loss = outputs.loss
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.tolist())
        labels.extend(batch['labels'].tolist())
    print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{num_training_steps}], Val_loss: {val_loss.item():.4f}")
    compute_metrics(predictions, labels, validation_set)
    model.train()

In [29]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [31]:
progress_bar = tqdm(range(num_training_steps))
step = 0
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        if step%100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{num_training_steps}], lr: {optimizer.param_groups[0]['lr']}, Loss: {loss.item()}")
        if step > 0 and step%1000 == 0:
            evaluate_model()
        step+=1

    model.save_pretrained(f"checkpoint_{epoch}")


  0%|          | 0/10950 [00:00<?, ?it/s]

Epoch [1/1], Step [1/10950], lr: 4.999543378995434e-05, Loss: 0.8829897046089172
Epoch [1/1], Step [101/10950], lr: 4.9538812785388126e-05, Loss: 0.5722970962524414
Epoch [1/1], Step [201/10950], lr: 4.908219178082192e-05, Loss: 0.7051485180854797
Epoch [1/1], Step [301/10950], lr: 4.862557077625571e-05, Loss: 0.4848741888999939
Epoch [1/1], Step [401/10950], lr: 4.8168949771689495e-05, Loss: 0.5393522381782532
Epoch [1/1], Step [501/10950], lr: 4.771232876712329e-05, Loss: 0.500650942325592
Epoch [1/1], Step [601/10950], lr: 4.725570776255708e-05, Loss: 0.401753306388855
Epoch [1/1], Step [701/10950], lr: 4.679908675799087e-05, Loss: 0.6067529916763306
Epoch [1/1], Step [801/10950], lr: 4.6342465753424656e-05, Loss: 0.8947744369506836
Epoch [1/1], Step [901/10950], lr: 4.588584474885845e-05, Loss: 0.7757418751716614
Epoch [1/1], Step [1001/10950], lr: 4.542922374429224e-05, Loss: 0.7994036674499512
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [1001/10950], Val_loss: 0.9706
{'exact_match': 62.157048249763484, 'f1': 83.30412306065195}
Epoch [1/1], Step [1101/10950], lr: 4.4972602739726025e-05, Loss: 0.3247883915901184
Epoch [1/1], Step [1201/10950], lr: 4.451598173515982e-05, Loss: 0.2604353725910187
Epoch [1/1], Step [1301/10950], lr: 4.405936073059361e-05, Loss: 0.6397036910057068
Epoch [1/1], Step [1401/10950], lr: 4.3602739726027394e-05, Loss: 0.25009840726852417
Epoch [1/1], Step [1501/10950], lr: 4.3146118721461186e-05, Loss: 0.885596513748169
Epoch [1/1], Step [1601/10950], lr: 4.268949771689498e-05, Loss: 0.25569847226142883
Epoch [1/1], Step [1701/10950], lr: 4.223287671232877e-05, Loss: 0.8788554668426514
Epoch [1/1], Step [1801/10950], lr: 4.1776255707762555e-05, Loss: 0.41101524233818054
Epoch [1/1], Step [1901/10950], lr: 4.131963470319635e-05, Loss: 0.5633679628372192
Epoch [1/1], Step [2001/10950], lr: 4.086301369863014e-05, Loss: 0.6801570653915405
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [2001/10950], Val_loss: 1.2741
{'exact_match': 62.34626300851466, 'f1': 83.33969885455527}
Epoch [1/1], Step [2101/10950], lr: 4.0406392694063925e-05, Loss: 0.39851656556129456
Epoch [1/1], Step [2201/10950], lr: 3.9949771689497717e-05, Loss: 0.4204719364643097
Epoch [1/1], Step [2301/10950], lr: 3.949315068493151e-05, Loss: 0.2225559502840042
Epoch [1/1], Step [2401/10950], lr: 3.9036529680365294e-05, Loss: 0.5074851512908936
Epoch [1/1], Step [2501/10950], lr: 3.8579908675799086e-05, Loss: 0.471637487411499
Epoch [1/1], Step [2601/10950], lr: 3.812328767123288e-05, Loss: 0.22809427976608276
Epoch [1/1], Step [2701/10950], lr: 3.766666666666667e-05, Loss: 0.24807162582874298
Epoch [1/1], Step [2801/10950], lr: 3.7210045662100455e-05, Loss: 0.21011224389076233
Epoch [1/1], Step [2901/10950], lr: 3.675342465753425e-05, Loss: 0.27267295122146606
Epoch [1/1], Step [3001/10950], lr: 3.629680365296804e-05, Loss: 0.2662084400653839
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [3001/10950], Val_loss: 1.4444
{'exact_match': 62.61116367076632, 'f1': 83.38961116882741}
Epoch [1/1], Step [3101/10950], lr: 3.5840182648401824e-05, Loss: 0.3526085913181305
Epoch [1/1], Step [3201/10950], lr: 3.5383561643835616e-05, Loss: 0.26515090465545654
Epoch [1/1], Step [3301/10950], lr: 3.492694063926941e-05, Loss: 0.6486822366714478
Epoch [1/1], Step [3401/10950], lr: 3.447031963470319e-05, Loss: 0.13031990826129913
Epoch [1/1], Step [3501/10950], lr: 3.4013698630136985e-05, Loss: 0.6022990942001343
Epoch [1/1], Step [3601/10950], lr: 3.355707762557078e-05, Loss: 0.44931143522262573
Epoch [1/1], Step [3701/10950], lr: 3.310045662100457e-05, Loss: 0.3005525469779968
Epoch [1/1], Step [3801/10950], lr: 3.2643835616438354e-05, Loss: 0.6418405771255493
Epoch [1/1], Step [3901/10950], lr: 3.2187214611872146e-05, Loss: 0.444224089384079
Epoch [1/1], Step [4001/10950], lr: 3.173059360730594e-05, Loss: 0.23545964062213898
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [4001/10950], Val_loss: 1.5378
{'exact_match': 62.847682119205295, 'f1': 83.68457321590006}
Epoch [1/1], Step [4101/10950], lr: 3.1273972602739723e-05, Loss: 1.138485312461853
Epoch [1/1], Step [4201/10950], lr: 3.0817351598173515e-05, Loss: 0.22997736930847168
Epoch [1/1], Step [4301/10950], lr: 3.0360730593607307e-05, Loss: 0.4759815037250519
Epoch [1/1], Step [4401/10950], lr: 2.9904109589041096e-05, Loss: 0.4663701057434082
Epoch [1/1], Step [4501/10950], lr: 2.9447488584474885e-05, Loss: 0.46403422951698303
Epoch [1/1], Step [4601/10950], lr: 2.8990867579908677e-05, Loss: 0.6393237113952637
Epoch [1/1], Step [4701/10950], lr: 2.8534246575342465e-05, Loss: 0.4532080888748169
Epoch [1/1], Step [4801/10950], lr: 2.8077625570776257e-05, Loss: 1.2041321992874146
Epoch [1/1], Step [4901/10950], lr: 2.7621004566210046e-05, Loss: 0.38599342107772827
Epoch [1/1], Step [5001/10950], lr: 2.7164383561643834e-05, Loss: 0.9122545719146729
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [5001/10950], Val_loss: 1.1496
{'exact_match': 62.92336802270577, 'f1': 83.7440191112301}
Epoch [1/1], Step [5101/10950], lr: 2.6707762557077626e-05, Loss: 0.5558270812034607
Epoch [1/1], Step [5201/10950], lr: 2.6251141552511415e-05, Loss: 0.5456465482711792
Epoch [1/1], Step [5301/10950], lr: 2.5794520547945207e-05, Loss: 0.3776480555534363
Epoch [1/1], Step [5401/10950], lr: 2.5337899543378995e-05, Loss: 0.5700304508209229
Epoch [1/1], Step [5501/10950], lr: 2.4881278538812784e-05, Loss: 0.5542991161346436
Epoch [1/1], Step [5601/10950], lr: 2.4424657534246576e-05, Loss: 0.7786081433296204
Epoch [1/1], Step [5701/10950], lr: 2.3968036529680365e-05, Loss: 0.40575501322746277
Epoch [1/1], Step [5801/10950], lr: 2.3511415525114157e-05, Loss: 0.44301989674568176
Epoch [1/1], Step [5901/10950], lr: 2.3054794520547945e-05, Loss: 0.3938032388687134
Epoch [1/1], Step [6001/10950], lr: 2.2598173515981734e-05, Loss: 0.7207559943199158
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [6001/10950], Val_loss: 1.2414
{'exact_match': 63.37748344370861, 'f1': 83.95751326334576}
Epoch [1/1], Step [6101/10950], lr: 2.2141552511415526e-05, Loss: 0.5326229929924011
Epoch [1/1], Step [6201/10950], lr: 2.1684931506849314e-05, Loss: 0.5663157105445862
Epoch [1/1], Step [6301/10950], lr: 2.1228310502283106e-05, Loss: 0.43574583530426025
Epoch [1/1], Step [6401/10950], lr: 2.0771689497716895e-05, Loss: 0.5665603876113892
Epoch [1/1], Step [6501/10950], lr: 2.0315068493150687e-05, Loss: 0.6641477942466736
Epoch [1/1], Step [6601/10950], lr: 1.9858447488584475e-05, Loss: 0.21568375825881958
Epoch [1/1], Step [6701/10950], lr: 1.9401826484018264e-05, Loss: 0.38876163959503174
Epoch [1/1], Step [6801/10950], lr: 1.8945205479452056e-05, Loss: 0.2554287314414978
Epoch [1/1], Step [6901/10950], lr: 1.8488584474885845e-05, Loss: 0.19010747969150543
Epoch [1/1], Step [7001/10950], lr: 1.8031963470319637e-05, Loss: 0.49441808462142944
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [7001/10950], Val_loss: 1.2932
{'exact_match': 63.81267738883633, 'f1': 84.1018796371119}
Epoch [1/1], Step [7101/10950], lr: 1.7575342465753425e-05, Loss: 0.3507464528083801
Epoch [1/1], Step [7201/10950], lr: 1.7118721461187214e-05, Loss: 0.4361632764339447
Epoch [1/1], Step [7301/10950], lr: 1.6662100456621006e-05, Loss: 0.28387007117271423
Epoch [1/1], Step [7401/10950], lr: 1.6205479452054794e-05, Loss: 0.42002034187316895
Epoch [1/1], Step [7501/10950], lr: 1.5748858447488586e-05, Loss: 0.44485798478126526
Epoch [1/1], Step [7601/10950], lr: 1.5292237442922375e-05, Loss: 0.46357572078704834
Epoch [1/1], Step [7701/10950], lr: 1.4835616438356165e-05, Loss: 0.2565733790397644
Epoch [1/1], Step [7801/10950], lr: 1.4378995433789955e-05, Loss: 0.6171671748161316
Epoch [1/1], Step [7901/10950], lr: 1.3922374429223744e-05, Loss: 0.4805673360824585
Epoch [1/1], Step [8001/10950], lr: 1.3465753424657534e-05, Loss: 0.4782234728336334
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [8001/10950], Val_loss: 1.1790
{'exact_match': 63.9356669820246, 'f1': 84.21583618689448}
Epoch [1/1], Step [8101/10950], lr: 1.3009132420091325e-05, Loss: 0.47399070858955383
Epoch [1/1], Step [8201/10950], lr: 1.2552511415525115e-05, Loss: 0.5210881233215332
Epoch [1/1], Step [8301/10950], lr: 1.2095890410958905e-05, Loss: 0.2547300457954407
Epoch [1/1], Step [8401/10950], lr: 1.1639269406392694e-05, Loss: 0.6136345267295837
Epoch [1/1], Step [8501/10950], lr: 1.1182648401826484e-05, Loss: 0.6272262334823608
Epoch [1/1], Step [8601/10950], lr: 1.0726027397260274e-05, Loss: 0.5698612928390503
Epoch [1/1], Step [8701/10950], lr: 1.0269406392694065e-05, Loss: 0.25957202911376953
Epoch [1/1], Step [8801/10950], lr: 9.812785388127855e-06, Loss: 0.5838450193405151
Epoch [1/1], Step [8901/10950], lr: 9.356164383561643e-06, Loss: 0.28319817781448364
Epoch [1/1], Step [9001/10950], lr: 8.899543378995434e-06, Loss: 0.24632155895233154
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [9001/10950], Val_loss: 1.1111
{'exact_match': 63.35856196783349, 'f1': 83.96730084865327}
Epoch [1/1], Step [9101/10950], lr: 8.442922374429224e-06, Loss: 0.15208861231803894
Epoch [1/1], Step [9201/10950], lr: 7.986301369863014e-06, Loss: 0.3540201485157013
Epoch [1/1], Step [9301/10950], lr: 7.529680365296804e-06, Loss: 0.4582085907459259
Epoch [1/1], Step [9401/10950], lr: 7.073059360730594e-06, Loss: 0.49788564443588257
Epoch [1/1], Step [9501/10950], lr: 6.616438356164384e-06, Loss: 0.4667946696281433
Epoch [1/1], Step [9601/10950], lr: 6.159817351598174e-06, Loss: 0.976475715637207
Epoch [1/1], Step [9701/10950], lr: 5.703196347031964e-06, Loss: 1.163201093673706
Epoch [1/1], Step [9801/10950], lr: 5.246575342465753e-06, Loss: 0.2660379111766815
Epoch [1/1], Step [9901/10950], lr: 4.789954337899544e-06, Loss: 1.1620163917541504
Epoch [1/1], Step [10001/10950], lr: 4.333333333333334e-06, Loss: 0.35599175095558167
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [10001/10950], Val_loss: 1.0503
{'exact_match': 63.85998107852412, 'f1': 84.05158351280501}
Epoch [1/1], Step [10101/10950], lr: 3.876712328767123e-06, Loss: 0.3511672914028168
Epoch [1/1], Step [10201/10950], lr: 3.4200913242009136e-06, Loss: 0.4240044355392456
Epoch [1/1], Step [10301/10950], lr: 2.963470319634703e-06, Loss: 0.7607419490814209
Epoch [1/1], Step [10401/10950], lr: 2.5068493150684933e-06, Loss: 0.22961491346359253
Epoch [1/1], Step [10501/10950], lr: 2.050228310502283e-06, Loss: 0.3269726634025574
Epoch [1/1], Step [10601/10950], lr: 1.593607305936073e-06, Loss: 0.43342649936676025
Epoch [1/1], Step [10701/10950], lr: 1.1369863013698631e-06, Loss: 0.3753988742828369
Epoch [1/1], Step [10801/10950], lr: 6.80365296803653e-07, Loss: 0.4571496546268463
Epoch [1/1], Step [10901/10950], lr: 2.2374429223744292e-07, Loss: 0.5795358419418335


In [32]:
q_n_a(dataset_sqaud, model)



Input:
 Given the following:
An apoplectic stroke deprived him of his speech, and he died shortly afterwards at 2:45 a.m. on 18 February 1546, aged 62, in Eisleben, the city of his birth. He was buried in the Castle Church in Wittenberg, beneath the pulpit. The funeral was held by his friends Johannes Bugenhagen and Philipp Melanchthon. A year later, troops of Luther's adversary Charles V, Holy Roman Emperor entered the town, but were ordered by Charles not to disturb the grave..
Answer the following:
Who performed the funeral for Martin Luther?.

Predicted Answer: ['Charles V']
Reference Answers: {'text': ['Johannes Bugenhagen and Philipp Melanchthon', 'Johannes Bugenhagen and Philipp Melanchthon', 'Johannes Bugenhagen and Philipp Melanchthon'], 'answer_start': [265, 265, 265]}


Spacy based Sementic Similarity: 0.2803806050489905

Bert Scores:


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1-Score: 0.7892597913742065
Precesion: 0.8323561549186707
Recall: 0.7504065036773682



In [42]:
# Training using HuggingFace Trainer class.
model_id="google/flan-t5-small"
dataset_id="squad"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./qa_finetuned_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=1,
    max_steps=5000,
    # logging & evaluation strategies
    logging_dir=r"./qa_finetuned_model/logs",
    logging_strategy="steps",
    logging_steps=5,
    evaluation_strategy="no",
    save_strategy="no",
    save_total_limit=2,
    load_best_model_at_end=True,
    # report_to="wandb",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"]
)
trainer.train()

Step,Training Loss
5,0.6447
10,0.6863
15,0.5415
20,0.5087
25,0.536
30,0.4095
35,0.4967
40,0.6423
45,0.5181
50,0.5695


TrainOutput(global_step=5000, training_loss=0.5080767343759537, metrics={'train_runtime': 1668.0729, 'train_samples_per_second': 23.98, 'train_steps_per_second': 2.997, 'total_flos': 7435457533968384.0, 'train_loss': 0.5080767343759537, 'epoch': 0.46})

In [43]:
evaluate_model()

Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [10951/10950], Val_loss: 1.3660
{'exact_match': 63.28287606433302, 'f1': 83.74454674657792}
