<a href="https://colab.research.google.com/github/taxicabno1729/deeplearning/blob/main/Training_a_Question_Answering_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the development build of transformers straight from the GitHub main branch,
# plus the `datasets` library used to load the CSV below.
# NOTE(review): neither install is pinned, so a fresh run may pull different code than the
# saved outputs reflect (the log below resolved to transformers 4.31.0.dev0) — consider
# pinning a release or a commit hash.
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-26h5oabr
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-26h5oabr
  Resolved https://github.com/huggingface/transformers.git to commit ee88ae59940fd4b2c8fc119373143d7a1175c651
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0.dev0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0.dev0)
  

In [None]:
from datasets import load_dataset

In [None]:
# Load the SQuAD-style CSV as one split, then hold out 10% of the rows as a test set.
# The seed is fixed so that the train/test partition is the same after every kernel
# restart; without it the held-out rows change from run to run.
datasets = load_dataset(
    "csv", data_files="train-squad.csv", split="train"
).train_test_split(test_size=0.1, seed=42)



DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'context', 'question', 'id', 'answer_start', 'text'],
        num_rows: 78138
    })
    test: Dataset({
        features: ['Unnamed: 0', 'context', 'question', 'id', 'answer_start', 'text'],
        num_rows: 8683
    })
})

In [22]:
# Show the first training row to check the column layout
# (context / question / id / answer_start / text).
datasets['train'][0]

{'Unnamed: 0': 158,
 'context': 'In early 1942, the governments of smaller powers began to push for an inter-governmental Asia-Pacific war council, based in Washington, D.C.. A council was established in London, with a subsidiary body in Washington. However, the smaller powers continued to push for an American-based body. The Pacific War Council was formed in Washington, on 1 April 1942, with President Franklin D. Roosevelt, his key advisor Harry Hopkins, and representatives from Britain, China, Australia, the Netherlands, New Zealand, and Canada. Representatives from India and the Philippines were later added. The council never had any direct operational control, and any decisions it made were referred to the U.S.-UK Combined Chiefs of Staff, which was also in Washington. Allied resistance, at first symbolic, gradually began to stiffen. Australian and Dutch forces led civilians in a prolonged guerilla campaign in Portuguese Timor.',
 'question': "Who was President Roosevelt's key advi

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same value is reused later to load the model,
# so tokenizer and model always come from the same checkpoint.
model_checkpoint = "distilbert-base-cased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Upper bound on the length of a single feature (question and context together).
max_length = 384

# Overlap allowed between two consecutive parts of a context when a long context
# has to be split into several features.
doc_stride = 128

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA rows and label every resulting feature with its answer span.

    Long contexts are split into several overlapping features (``stride=doc_stride``),
    so one input row can yield more than one output feature.

    Args:
        examples: a batch (mapping of column name -> list) with the columns
            ``question``, ``context``, ``answer_start`` (character offset of the answer
            inside ``context``) and ``text`` (the answer string). The ``question`` and
            ``context`` entries are replaced in place by their left-stripped versions.

    Returns:
        The tokenizer output, without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, extended with ``start_positions`` and ``end_positions``
        (one token index per feature). Features whose span does not fully contain the
        answer, and rows without an answer, are labeled with the index of the CLS token.
    """
    original_contexts = examples["context"]
    stripped_contexts = [c.lstrip() for c in original_contexts]
    # Stripping the context moves every character to the left, so remember how many
    # characters were removed per row and shift `answer_start` by the same amount.
    removed_prefix = [
        len(raw) - len(stripped)
        for raw, stripped in zip(original_contexts, stripped_contexts)
    ]
    examples["question"] = [q.lstrip() for q in examples["question"]]
    examples["context"] = stripped_contexts

    # Tokenize with truncation and padding, but keep the overflows using a stride so
    # that consecutive features of the same row overlap a bit.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Map from each feature back to the row it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Map from each token to its (start, end) character position in the text.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Impossible / out-of-span answers are labeled with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Tells which tokens belong to the question (0) and which to the context (1).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Look up the answer of the row THIS feature came from, not the first row of
        # the batch.
        sample_index = sample_mapping[i]
        answer_start = examples["answer_start"][sample_index]
        answer_text = examples["text"][sample_index]

        if answer_start is None or not answer_text:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Start/end character index of the answer in the (stripped) context.
        start_char = int(answer_start) - removed_prefix[sample_index]
        end_char = start_char + len(answer_text)

        # First and last token of the context inside this feature.
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # The answer is not fully inside this span: label the feature with CLS.
        if not (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        ):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Move both cursors onto the two ends of the answer. The scans are bounded by
        # the context tokens: special and padding tokens carry (0, 0) offsets and would
        # otherwise keep the start scan running past the context when the answer
        # begins on the last context token.
        token_start_index = context_start
        while (
            token_start_index <= context_end
            and offsets[token_start_index][0] <= start_char
        ):
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while (
            token_end_index >= context_start
            and offsets[token_end_index][1] >= end_char
        ):
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples


In [None]:
# Apply prepare_train_features to both splits, one batch at a time.
# The original columns are removed: one row can produce several features, so the
# input columns would no longer line up with the tokenized output.
# NOTE(review): num_proc=3 is hard-coded — adjust to the number of available cores.
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)

Map (num_proc=3):   0%|          | 0/78138 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/8683 [00:00<?, ? examples/s]

In [None]:
# Carve a 20% validation split out of the tokenized training features. The seed is
# fixed so the same features end up in validation after every kernel restart.
train_val_set = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)

# Slicing with [:] after with_format("numpy") loads each split fully into memory
# (presumably as a dict of column name -> NumPy array, which is what model.fit is
# given later — confirm against the datasets documentation).
train_set = train_val_set["train"].with_format("numpy")[:]
validation_set = train_val_set["test"].with_format("numpy")[:]
test_set = tokenized_datasets["test"].with_format("numpy")[:]

In [None]:
from transformers import TFAutoModelForQuestionAnswering

# Load the pretrained checkpoint into a question-answering architecture. As the warning
# below reports, the `qa_outputs` head is newly initialized and must be trained.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForQuestionAnswering: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['dropout_39', 'qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# train_set = model.prepare_tf_dataset(
#     train_set,
#     shuffle=True,
#     batch_size=32,
# )

# validation_set = model.prepare_tf_dataset(
#     train_val_set['test'],
#     shuffle=False,
#     batch_size=32,
# )

In [None]:
import tensorflow as tf
from tensorflow import keras

# Adam with a fine-tuning learning rate of 5e-5.
# NOTE(review): this uses the `legacy` optimizer namespace — presumably for compatibility
# with the installed TensorFlow/transformers combination; confirm it is still required.
optimizer = keras.optimizers.legacy.Adam(learning_rate=5e-5)

In [None]:
# Enable float16 mixed-precision training (comment the next line out to train in float32).
# NOTE(review): the policy is set here, after `model` was already instantiated in an
# earlier cell. Keras layers normally pick up the dtype policy when they are constructed,
# so this may not affect the already-built model — confirm, and if so move this line
# above the from_pretrained call.
keras.mixed_precision.set_global_policy("mixed_float16")

# No loss is passed to compile() — presumably training relies on the loss the model
# computes internally from start_positions / end_positions; confirm.
model.compile(optimizer=optimizer)

In [None]:
# Print the layer list and parameter counts of the compiled model.
model.summary()

Model: "tf_distil_bert_for_question_answering_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 65190912  
 nLayer)                                                         
                                                                 
 qa_outputs (Dense)          multiple                  1538      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 65,192,450
Trainable params: 65,192,450
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fine-tune for one epoch, evaluating on the validation split at the end of the epoch.
# No batch_size is passed, so Keras' default applies.
# NOTE(review): the saved output shows this run was interrupted (KeyboardInterrupt), so
# the notebook contains no completed training result.
model.fit(train_set, validation_data=validation_set, epochs=1)

KeyboardInterrupt: ignored