This notebook is based on the question-answering example notebook provided by Hugging Face.  

Referenced: https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb


# Load Data and prepare features 

In [50]:
from datasets import load_dataset, load_metric 
from transformers import (AutoTokenizer,AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator)
import numpy as np

max_length = 512 # Maximum number of tokens in one feature (question + context); passed to the tokenizer as `max_length`.
doc_stride = 128 # Passed to the tokenizer as `stride`: the overlap between two consecutive parts of a context when it has to be split.

In [1]:
# uncomment before running on google colab 
!pip install transformers==4.8.1
!pip install datasets 

Collecting transformers==4.8.1
  Downloading transformers-4.8.1-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 5.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.7 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 75.6 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.1
Collecting datasets
  Downloading datasets-1.12.1-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 5.1 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m

In [3]:
# Load the SQuAD v2 dataset; the result is indexed by split name ("train", "validation") in the cells below.
# NOTE: the variable is called `datasets`, the same as the library package; this notebook only uses names imported *from* that package.
datasets = load_dataset("squad_v2")

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Instantiate the tokenizer of the ELECTRA-small discriminator checkpoint.
# prepare_train_features reads this module-level `tokenizer` name when it is called.
tokenizer = AutoTokenizer.from_pretrained('google/electra-small-discriminator')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [34]:
def prepare_train_features(examples):
    """Tokenize a batch of SQuAD examples and label each feature with its answer span.

    Every question/context pair is encoded with the module-level ``tokenizer``.
    Only the context is truncated: a pair that does not fit in ``max_length``
    tokens is split into several overlapping features (``doc_stride`` is passed
    as the stride), and every feature is padded to ``max_length``.

    Args:
        examples: a batch with "question", "context" and "answers" columns, where
            each answers entry holds parallel "answer_start" and "text" lists.

    Returns:
        The tokenizer output, without the offset and overflow mappings, extended
        with "start_positions" and "end_positions". Both hold the token indices of
        the first listed answer inside each feature, or 0 when the example has no
        answer or the answer is not fully contained in that feature.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One example may yield several features; this maps each feature back to its example.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    # Per feature: the (start_char, end_char) of every token in the original text.
    offsets_per_feature = encoded.pop("offset_mapping")

    span_starts = []
    span_ends = []

    for feature_idx, offsets in enumerate(offsets_per_feature):
        answers = examples["answers"][feature_to_example[feature_idx]]

        # Unanswerable example: label both ends with position 0.
        if len(answers["answer_start"]) == 0:
            span_starts.append(0)
            span_ends.append(0)
            continue

        # Character range of the first listed answer inside the context.
        answer_begin = answers["answer_start"][0]
        answer_stop = answer_begin + len(answers["text"][0])

        # Sequence id 1 marks context tokens (the question is the first sequence).
        segment_ids = encoded.sequence_ids(feature_idx)

        # First context token of this feature.
        left = 0
        while segment_ids[left] != 1:
            left += 1

        # Last context token of this feature (skips trailing special/padding tokens).
        right = len(encoded["input_ids"][feature_idx]) - 1
        while segment_ids[right] != 1:
            right -= 1

        # The answer is not fully inside this window: label both ends with position 0.
        if offsets[left][0] > answer_begin or offsets[right][1] < answer_stop:
            span_starts.append(0)
            span_ends.append(0)
            continue

        # Walk right until just past the token where the answer starts, then step back one.
        while left < len(offsets) and offsets[left][0] <= answer_begin:
            left += 1
        span_starts.append(left - 1)

        # Walk left until just before the token where the answer ends, then step forward one.
        while offsets[right][1] >= answer_stop:
            right -= 1
        span_ends.append(right + 1)

    encoded["start_positions"] = span_starts
    encoded["end_positions"] = span_ends
    return encoded

In [35]:
# Build the training features for every split in batches; the columns of the raw
# dataset are removed from the result.
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

# Train 

In [None]:
# Show which GPU (if any) this runtime is connected to.
!nvidia-smi

In [49]:
# Instantiate the pretrained ELECTRA-small discriminator with a question-answering head for fine-tuning.
# The recorded output below reports that the QA output layer is newly initialized, which is expected here.
model = AutoModelForQuestionAnswering.from_pretrained('google/electra-small-discriminator')

Downloading:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['qa_outputs.weight', 'qa_output

In [52]:
# The features are already padded to max_length by the tokenizer, so the
# default collator is used to assemble batches.
data_collator = default_data_collator

# Hyperparameters of the first fine-tuning run; a checkpoint is written under
# drive/MyDrive after every epoch.
args = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run1",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
)


In [None]:
# Bundle the model, the hyperparameters and the tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run 1: fine-tune for the number of epochs set in `args`.
trainer.train()


***** Running training *****
  Num examples = 130503
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 20395


Step,Training Loss
500,3.4521
1000,2.3634
1500,1.982
2000,1.7804
2500,1.6591
3000,1.584
3500,1.5182
4000,1.4498
4500,1.3738
5000,1.3578


Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run1/checkpoint-4079
Configuration saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-4079/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-4079/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-4079/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-4079/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run1/checkpoint-8158
Configuration saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-8158/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-8158/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-8158/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run1/checkpoint-8158/special_tokens_map.json
Sa

TrainOutput(global_step=20395, training_loss=1.2795918428421489, metrics={'train_runtime': 7982.9204, 'train_samples_per_second': 81.739, 'train_steps_per_second': 2.555, 'total_flos': 2.702807149925376e+16, 'train_loss': 1.2795918428421489, 'epoch': 5.0})

In [None]:
# Save the final run-1 model. Despite the ".pt" suffix this path is a directory:
# the config, weights and tokenizer files are written inside it.
trainer.save_model('drive/MyDrive/electra_5e.pt')

Saving model checkpoint to drive/MyDrive/electra_5e.pt
Configuration saved in drive/MyDrive/electra_5e.pt/config.json
Model weights saved in drive/MyDrive/electra_5e.pt/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra_5e.pt/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra_5e.pt/special_tokens_map.json


# Run 2

In [None]:
# Continue from the model and tokenizer saved at the end of run 1.
model_5e = AutoModelForQuestionAnswering.from_pretrained("drive/MyDrive/electra_5e.pt")
tokenizer_5e = AutoTokenizer.from_pretrained("drive/MyDrive/electra_5e.pt")

# Rebuild the training features. Note that prepare_train_features reads the
# module-level `tokenizer`, not `tokenizer_5e`.
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Same hyperparameters as run 1, but 50 epochs and a separate output directory.
args_run_2 = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run2",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=50,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer_run2 = Trainer(
    model=model_5e,
    args=args_run_2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer_5e,
)

loading configuration file drive/MyDrive/electra_5e.pt/config.json
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.8.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/electra_5e.pt/pytorch_model.bin
All model checkpoint weights were used when initializing ElectraForQuestionAnswering.

All the weights of ElectraFo

In [None]:
# Run 2: continue fine-tuning with the settings in `args_run_2`.
trainer_run2.train()

***** Running training *****
  Num examples = 130503
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 203950


Step,Training Loss
500,1.0018
1000,0.9945
1500,0.9822
2000,0.9576
2500,0.9421
3000,0.9417
3500,0.9328
4000,0.9096
4500,0.8416
5000,0.8468


Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run2/checkpoint-4079
Configuration saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-4079/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-4079/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-4079/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-4079/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run2/checkpoint-8158
Configuration saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-8158/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-8158/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-8158/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run2/checkpoint-8158/special_tokens_map.json
Sa

In [None]:
# Save the run-2 model into a directory (the ".pt" suffix does not make it a single file).
# NOTE(review): run 3 starts from 'drive/MyDrive/checkpoint-57106' rather than from this
# directory - confirm whether run 2 actually finished and this cell was executed.
trainer_run2.save_model('drive/MyDrive/electra_55e.pt')

# Run 3

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments,Trainer, default_data_collator
# Continue from a saved checkpoint. Presumably this is the epoch-14 checkpoint of
# run 2 (57106 = 14 x 4079 steps) copied to the Drive root - confirm.
model_r3 = AutoModelForQuestionAnswering.from_pretrained("drive/MyDrive/checkpoint-57106")
# Rebinding `tokenizer` also changes the tokenizer used by prepare_train_features.
tokenizer = AutoTokenizer.from_pretrained("drive/MyDrive/checkpoint-57106")

tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Run 3: 10 more epochs, written to a separate output directory.
args_run_3 = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run3",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer_run3 = Trainer(
    model=model_r3,
    args=args_run_3,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

loading configuration file drive/MyDrive/checkpoint-57106/config.json
Model config ElectraConfig {
  "_name_or_path": "drive/MyDrive/electra_5e.pt",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.8.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/checkpoint-57106/pytorch_model.bin
All model checkpoint weights were used when initializing ElectraForQuestionAnswering.

All the weights of ElectraFor

In [None]:
# Run 3: continue fine-tuning with the settings in `args_run_3`.
trainer_run3.train()

***** Running training *****
  Num examples = 130503
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 40790


Step,Training Loss
500,0.3118
1000,0.313
1500,0.3159
2000,0.3206
2500,0.3153
3000,0.3178
3500,0.3245
4000,0.3207
4500,0.2736
5000,0.2773


Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run3/checkpoint-4079
Configuration saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-4079/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-4079/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-4079/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-4079/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run3/checkpoint-8158
Configuration saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-8158/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-8158/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-8158/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run3/checkpoint-8158/special_tokens_map.json
Sa

TrainOutput(global_step=40790, training_loss=0.2187850300311224, metrics={'train_runtime': 16053.6724, 'train_samples_per_second': 81.292, 'train_steps_per_second': 2.541, 'total_flos': 5.405614299850752e+16, 'train_loss': 0.2187850300311224, 'epoch': 10.0})

# Run 4

In [None]:
# Continue from a saved checkpoint. Presumably this is the final checkpoint of
# run 3 (which ended at global step 40790) copied to the Drive root - confirm.
model_r4 = AutoModelForQuestionAnswering.from_pretrained("drive/MyDrive/run3-checkpoint-40790")
# Rebinding `tokenizer` also changes the tokenizer used by prepare_train_features.
tokenizer = AutoTokenizer.from_pretrained("drive/MyDrive/run3-checkpoint-40790")

tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Run 4: 10 more epochs, written to a separate output directory.
args_run_4 = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run4",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer_run4 = Trainer(
    model=model_r4,
    args=args_run_4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

loading configuration file drive/MyDrive/run3-checkpoint-40790/config.json
Model config ElectraConfig {
  "_name_or_path": "drive/MyDrive/checkpoint-57106",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.8.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/run3-checkpoint-40790/pytorch_model.bin
All model checkpoint weights were used when initializing ElectraForQuestionAnswering.

All the weights 

  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run 4: continue fine-tuning with the settings in `args_run_4`.
trainer_run4.train()

***** Running training *****
  Num examples = 130503
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 40790


Step,Training Loss
500,0.12
1000,0.1206
1500,0.1271
2000,0.1326
2500,0.1268
3000,0.1318
3500,0.1351
4000,0.1422
4500,0.1188
5000,0.1228


Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run4/checkpoint-4079
Configuration saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-4079/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-4079/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-4079/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-4079/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run4/checkpoint-8158
Configuration saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-8158/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-8158/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-8158/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run4/checkpoint-8158/special_tokens_map.json
Sa

TrainOutput(global_step=40790, training_loss=0.11120225214788686, metrics={'train_runtime': 16034.0753, 'train_samples_per_second': 81.391, 'train_steps_per_second': 2.544, 'total_flos': 5.405614299850752e+16, 'train_loss': 0.11120225214788686, 'epoch': 10.0})

# Run 5 


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments,Trainer, default_data_collator
# Continue from the last checkpoint of run 4 (that run ended at global step 40790).
model_r5 = AutoModelForQuestionAnswering.from_pretrained(
    "drive/MyDrive/electra-small-squad_v2_run4/checkpoint-40790"
)
# Rebinding `tokenizer` also changes the tokenizer used by prepare_train_features.
tokenizer = AutoTokenizer.from_pretrained(
    "drive/MyDrive/electra-small-squad_v2_run4/checkpoint-40790"
)

tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Run 5: 10 more epochs, written to a separate output directory.
args_run_5 = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run5",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer_run5 = Trainer(
    model=model_r5,
    args=args_run_5,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

loading configuration file drive/MyDrive/electra-small-squad_v2_run4/checkpoint-40790/config.json
Model config ElectraConfig {
  "_name_or_path": "drive/MyDrive/run3-checkpoint-40790",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.8.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/electra-small-squad_v2_run4/checkpoint-40790/pytorch_model.bin
All model checkpoint weights were used when initiali

  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run 5: continue fine-tuning with the settings in `args_run_5`.
trainer_run5.train()

***** Running training *****
  Num examples = 130503
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 40790


Step,Training Loss
500,0.0366
1000,0.0457
1500,0.0486
2000,0.0551
2500,0.0579
3000,0.0586
3500,0.0589
4000,0.0651
4500,0.0668
5000,0.0571


Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run5/checkpoint-4079
Configuration saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-4079/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-4079/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-4079/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-4079/special_tokens_map.json


Step,Training Loss
500,0.0366
1000,0.0457
1500,0.0486
2000,0.0551
2500,0.0579
3000,0.0586
3500,0.0589
4000,0.0651
4500,0.0668
5000,0.0571


Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run5/checkpoint-8158
Configuration saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-8158/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-8158/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-8158/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-8158/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run5/checkpoint-12237
Configuration saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-12237/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-12237/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-12237/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run5/checkpoint-12237/special_tokens_map.js

TrainOutput(global_step=40790, training_loss=0.06777338325816587, metrics={'train_runtime': 16067.3573, 'train_samples_per_second': 81.222, 'train_steps_per_second': 2.539, 'total_flos': 5.405614299850752e+16, 'train_loss': 0.06777338325816587, 'epoch': 10.0})

# Run 6


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments,Trainer, default_data_collator
# Continue from the last checkpoint of run 5 (that run ended at global step 40790).
model_r6 = AutoModelForQuestionAnswering.from_pretrained(
    "drive/MyDrive/electra-small-squad_v2_run5/checkpoint-40790"
)
# Rebinding `tokenizer` also changes the tokenizer used by prepare_train_features.
tokenizer = AutoTokenizer.from_pretrained(
    "drive/MyDrive/electra-small-squad_v2_run5/checkpoint-40790"
)

tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Run 6: 10 more epochs, written to a separate output directory. No evaluation
# strategy or metric function is configured for this run.
args_run_6 = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run6",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer_run6 = Trainer(
    model=model_r6,
    args=args_run_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

loading configuration file drive/MyDrive/electra-small-squad_v2_run5/checkpoint-40790/config.json
Model config ElectraConfig {
  "_name_or_path": "drive/MyDrive/electra-small-squad_v2_run4/checkpoint-40790",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.8.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/electra-small-squad_v2_run5/checkpoint-40790/pytorch_model.bin
All model checkpoint weights 

In [None]:
# Run 6: continue fine-tuning with the settings in `args_run_6`.
trainer_run6.train()

***** Running training *****
  Num examples = 130503
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 40790


Step,Training Loss
500,0.2447
1000,0.2312
1500,0.236
2000,0.2308
2500,0.2269
3000,0.2389
3500,0.2325
4000,0.2443
4500,0.2039
5000,0.2


Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run6/checkpoint-4079
Configuration saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-4079/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-4079/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-4079/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-4079/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/electra-small-squad_v2_run6/checkpoint-8158
Configuration saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-8158/config.json
Model weights saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-8158/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-8158/tokenizer_config.json
Special tokens file saved in drive/MyDrive/electra-small-squad_v2_run6/checkpoint-8158/special_tokens_map.json
Sa

TrainOutput(global_step=40790, training_loss=0.15624639955797917, metrics={'train_runtime': 25679.3311, 'train_samples_per_second': 50.82, 'train_steps_per_second': 1.588, 'total_flos': 5.405614299850752e+16, 'train_loss': 0.15624639955797917, 'epoch': 10.0})

# Run 7


In [None]:
from transformers import default_data_collator
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments,Trainer, default_data_collator

# The features are padded to a fixed length already, so the default collator is used.
data_collator = default_data_collator

# Continue from the last checkpoint of run 6 (that run ended at global step 40790).
model_r7 = AutoModelForQuestionAnswering.from_pretrained(
    "drive/MyDrive/electra-small-squad_v2_run6/checkpoint-40790"
)
# Rebinding `tokenizer` also changes the tokenizer used by prepare_train_features.
tokenizer = AutoTokenizer.from_pretrained(
    "drive/MyDrive/electra-small-squad_v2_run6/checkpoint-40790"
)

tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Run 7: 5 epochs, with the evaluation loss computed every 500 steps
# (prediction_loss_only=True, so only the loss is reported).
args_run_7 = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run7",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=500,
    prediction_loss_only=True,
)

trainer_run7 = Trainer(
    model=model_r7,
    args=args_run_7,
    train_dataset=tokenized_datasets["train"],
    # Evaluate on a fixed (seeded) 1000-feature sample of the validation split.
    eval_dataset=tokenized_datasets["validation"].shuffle(seed=42).select(range(1000)),
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_run7.train()

# Run 8 



In [None]:
from transformers import default_data_collator
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments,Trainer, default_data_collator

# Checkpoint to continue from. Presumably this is the final checkpoint of run 7
# (5 epochs x 4079 steps = 20395) - confirm.
checkpoint = "drive/MyDrive/electra-small-squad_v2_run7/checkpoint-20395"

# The features are padded to a fixed length already, so the default collator is used.
data_collator = default_data_collator
model_r8 = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
# Rebinding `tokenizer` also changes the tokenizer used by prepare_train_features.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Run 8: 5 more epochs; no evaluation strategy and no evaluation dataset are configured.
args_run_8 = TrainingArguments(
    output_dir="drive/MyDrive/electra-small-squad_v2_run8",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer_run8 = Trainer(
    model=model_r8,
    args=args_run_8,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_run8.train()