In [None]:
!pip install transformers tensorflow datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00

In [16]:
import tensorflow as tf
from transformers import BertTokenizerFast, TFBertForQuestionAnswering, create_optimizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

## Data Collection and Preprocessing:

In [3]:
# 1. Load the JSON data
json_data = pd.read_json('/content/dev-v1.1.json')

# 2. Use json_normalize to flatten the data
df = pd.json_normalize(
    json_data['data'],
    record_path=['paragraphs', 'qas'],
    meta=['title', ['paragraphs', 'context']],
    errors='ignore'
)
display(df.head())

# count how many answers for each question by counting the number of answers in the 'answers' column
df['answer_count'] = df['answers'].apply(len)

print(df['answer_count'].value_counts())

Unnamed: 0,answers,question,id,title,paragraphs.context
0,"[{'answer_start': 177, 'text': 'Denver Broncos...",Which NFL team represented the AFC at Super Bo...,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...
1,"[{'answer_start': 249, 'text': 'Carolina Panth...",Which NFL team represented the NFC at Super Bo...,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...
2,"[{'answer_start': 403, 'text': 'Santa Clara, C...",Where did Super Bowl 50 take place?,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...
3,"[{'answer_start': 177, 'text': 'Denver Broncos...",Which NFL team won Super Bowl 50?,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...
4,"[{'answer_start': 488, 'text': 'gold'}, {'answ...",What color was used to emphasize the 50th anni...,56be4db0acb8001400a502f0,Super_Bowl_50,Super Bowl 50 was an American football game to...


answer_count
3    8490
5    1147
4     759
2     136
6      35
1       3
Name: count, dtype: int64


In [4]:
# 3. Expand the dataframe to handle multiple answers
df['answer_count'] = df['answers'].apply(len)
df_expanded = df.explode('answers').reset_index(drop=True)

# Extract the start position of the answer
df_expanded['answer_start'] = df_expanded['answers'].apply(lambda ans: ans['answer_start'])

# Extract the answer text
df_expanded['answer'] = df_expanded['answers'].apply(lambda ans: ans['text'])

# Drop unnecessary columns
df_expanded = df_expanded.drop(columns=['answers', 'answer_count'])

# Drop duplicates and missing values
df_expanded = df_expanded.dropna(subset=['answer', 'question', 'answer_start'])

display(df_expanded.head())

Unnamed: 0,question,id,title,paragraphs.context,answer_start,answer
0,Which NFL team represented the AFC at Super Bo...,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,177,Denver Broncos
1,Which NFL team represented the AFC at Super Bo...,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,177,Denver Broncos
2,Which NFL team represented the AFC at Super Bo...,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,177,Denver Broncos
3,Which NFL team represented the NFC at Super Bo...,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,249,Carolina Panthers
4,Which NFL team represented the NFC at Super Bo...,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,249,Carolina Panthers


In [5]:
print(df_expanded.shape)

(34726, 6)


In [6]:
# 4. Prepare training and validation datasets

# take 2000 rows from df_expanded
df_expanded = df_expanded.head(2000)

# do validation and train split
train_df, val_df = train_test_split(df_expanded, test_size=0.2, random_state=42)

# save train_texts, train_contexts, train_starts, train_answers
train_texts = train_df['question'].tolist()
train_contexts = train_df['paragraphs.context'].tolist()
train_starts = train_df['answer_start'].tolist()
train_answers = train_df['answer'].tolist()

# save val_texts, val_contexts, val_starts, val_answers
val_texts = val_df['question'].tolist()
val_contexts = val_df['paragraphs.context'].tolist()
val_starts = val_df['answer_start'].tolist()
val_answers = val_df['answer'].tolist()

display(train_texts[:5])
display(train_contexts[:5])
display(train_starts[:5])
display(train_answers[:5])


['What was the final score of the game between the Broncos and Steelers?',
 'Which network broadcasted Super Bowl 50 in the U.S.?',
 'Who had the most rushing yards on the Broncos?',
 'How many touchdowns did Jonathan Stewart have in 13 games?',
 'When were the finalists announced?']

["The Broncos defeated the Pittsburgh Steelers in the divisional round, 23–16, by scoring 11 points in the final three minutes of the game. They then beat the defending Super Bowl XLIX champion New England Patriots in the AFC Championship Game, 20–18, by intercepting a pass on New England's 2-point conversion attempt with 17 seconds left on the clock. Despite Manning's problems with interceptions during the season, he didn't throw any in their two playoff games.",
 'CBS broadcast Super Bowl 50 in the U.S., and charged an average of $5 million for a 30-second commercial during the game. The Super Bowl 50 halftime show was headlined by the British rock group Coldplay with special guest performers Beyoncé and Bruno Mars, who headlined the Super Bowl XLVII and Super Bowl XLVIII halftime shows, respectively. It was the third-most watched U.S. broadcast ever.',
 "Manning finished the year with a career-low 67.9 passer rating, throwing for 2,249 yards and nine touchdowns, with 17 interception

[70, 0, 578, 1026, 24]

['23–16', 'CBS', 'Anderson', 'six', 'October 16, 2012']

In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')  # Initialize tokenizer
model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 5. Tokenize the inputs (questions and contexts)
def encode_data(questions, contexts, answers, answer_starts):
    encodings = tokenizer(questions, contexts, truncation=False, padding=True, return_tensors='tf', return_offsets_mapping=True,)
    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        answer_start = answer_starts[i]
        answer_end = answer_start + len(answer)

        # get offset_mapping
        offset_mapping = encodings.offset_mapping[i]

        start_position = None
        end_position = None

        for j, (start, end) in enumerate(offset_mapping):
            if start <= answer_start and end >= answer_start:
                start_position = j
            if start <= answer_end and end >= answer_end:
                end_position = j
            # break if both start and end positions are found
            if start_position is not None and end_position is not None:
                break

        # Debug output for first 5 answers
        if i < 5:
            # Convert token IDs back to the original tokens for better visualization
            tokens = tokenizer.convert_ids_to_tokens(encodings['input_ids'][i])
            print(f"Answer: {answer}")
            print(f"Answer start: {answer_start}")
            print(f"Answer end: {answer_end}")
            print("Tokens:", tokens)
            print("Answer tokens:", tokens[start_position:end_position + 1])
            print()

        if start_position is None or end_position is None:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_positions.append(start_position)
        end_positions.append(end_position)

    encodings.update({'start_positions': tf.convert_to_tensor(start_positions), 'end_positions': tf.convert_to_tensor(end_positions)})
    return encodings


# Encode training and validation datasets
train_encodings = encode_data(train_texts, train_contexts, train_answers, train_starts)
val_encodings = encode_data(val_texts, val_contexts, val_answers, val_starts)

Answer: 23–16
Answer start: 70
Answer end: 75
Tokens: ['[CLS]', 'what', 'was', 'the', 'final', 'score', 'of', 'the', 'game', 'between', 'the', 'broncos', 'and', 'steelers', '?', '[SEP]', 'the', 'broncos', 'defeated', 'the', 'pittsburgh', 'steelers', 'in', 'the', 'divisional', 'round', ',', '23', '–', '16', ',', 'by', 'scoring', '11', 'points', 'in', 'the', 'final', 'three', 'minutes', 'of', 'the', 'game', '.', 'they', 'then', 'beat', 'the', 'defending', 'super', 'bowl', 'xl', '##ix', 'champion', 'new', 'england', 'patriots', 'in', 'the', 'afc', 'championship', 'game', ',', '20', '–', '18', ',', 'by', 'intercept', '##ing', 'a', 'pass', 'on', 'new', 'england', "'", 's', '2', '-', 'point', 'conversion', 'attempt', 'with', '17', 'seconds', 'left', 'on', 'the', 'clock', '.', 'despite', 'manning', "'", 's', 'problems', 'with', 'interceptions', 'during', 'the', 'season', ',', 'he', 'didn', "'", 't', 'throw', 'any', 'in', 'their', 'two', 'playoff', 'games', '.', '[SEP]', '[PAD]', '[PAD]', '[PA

In [10]:
# verify inputs
print(tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0]))
print(train_encodings['start_positions'][0])
print(train_encodings['end_positions'][0])

['[CLS]', 'what', 'was', 'the', 'final', 'score', 'of', 'the', 'game', 'between', 'the', 'broncos', 'and', 'steelers', '?', '[SEP]', 'the', 'broncos', 'defeated', 'the', 'pittsburgh', 'steelers', 'in', 'the', 'divisional', 'round', ',', '23', '–', '16', ',', 'by', 'scoring', '11', 'points', 'in', 'the', 'final', 'three', 'minutes', 'of', 'the', 'game', '.', 'they', 'then', 'beat', 'the', 'defending', 'super', 'bowl', 'xl', '##ix', 'champion', 'new', 'england', 'patriots', 'in', 'the', 'afc', 'championship', 'game', ',', '20', '–', '18', ',', 'by', 'intercept', '##ing', 'a', 'pass', 'on', 'new', 'england', "'", 's', '2', '-', 'point', 'conversion', 'attempt', 'with', '17', 'seconds', 'left', 'on', 'the', 'clock', '.', 'despite', 'manning', "'", 's', 'problems', 'with', 'interceptions', 'during', 'the', 'season', ',', 'he', 'didn', "'", 't', 'throw', 'any', 'in', 'their', 'two', 'playoff', 'games', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [11]:
# 6. Convert to TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask']
    },
    {
        'start_positions': train_encodings['start_positions'],
        'end_positions': train_encodings['end_positions']
    }
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': val_encodings['input_ids'],
        'attention_mask': val_encodings['attention_mask']
    },
    {
        'start_positions': val_encodings['start_positions'],
        'end_positions': val_encodings['end_positions']
    }
))

# 7. Batch the datasets
train_dataset = train_dataset.batch(8)
val_dataset = val_dataset.batch(8)

# Model Selection and Training

In [12]:
# 8. Define optimizer and learning rate schedule
num_train_steps = len(train_dataset) * 5
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps
)

# 9. Compile the model with the optimizer and loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# 10. Train the model
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

# 11. Save the model and tokenizer
model.save_pretrained('./saved_qa_model')
tokenizer.save_pretrained('./saved_qa_model')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


('./saved_qa_model/tokenizer_config.json',
 './saved_qa_model/special_tokens_map.json',
 './saved_qa_model/vocab.txt',
 './saved_qa_model/added_tokens.json',
 './saved_qa_model/tokenizer.json')

## Model Evaluation

In [14]:
# 12. test the model
model = TFBertForQuestionAnswering.from_pretrained('./saved_qa_model')
tokenizer = BertTokenizerFast.from_pretrained('./saved_qa_model')

example_context = "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary."
question = "What is atop the Main Building's gold dome?"

inputs = tokenizer(question, example_context, return_tensors='tf')
outputs = model(inputs)

answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0]
answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0]

answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end+1]))

print(f"Question: {question}")
print(f"Answer: {answer}")

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at ./saved_qa_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


Question: What is atop the Main Building's gold dome?
Answer: golden statue of the virgin mary


In [17]:
# 12. Evaluate the model
def evaluate_model(model, tokenizer, val_texts, val_contexts, val_answers, val_starts):
    total = len(val_texts)
    exact_match = 0
    f1 = 0.0

    for i in range(total):
        question = val_texts[i]
        context = val_contexts[i]
        true_answer = val_answers[i]

        # Tokenize question and context
        inputs = tokenizer(question, context, return_tensors='tf')
        outputs = model(inputs)

        # Get predicted answer start and end positions
        answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0]
        answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0]

        # Extract the predicted answer
        predicted_answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end+1])
        )

        # Remove extra spaces in both true and predicted answers
        true_answer = true_answer.strip()
        predicted_answer = predicted_answer.strip()

        # Calculate exact match
        if true_answer == predicted_answer:
            exact_match += 1

        # Calculate F1 score
        true_tokens = set(true_answer.split())
        predicted_tokens = set(predicted_answer.split())

        common_tokens = true_tokens & predicted_tokens
        if len(common_tokens) == 0:
            f1 += 0
        else:
            precision = len(common_tokens) / len(predicted_tokens)
            recall = len(common_tokens) / len(true_tokens)
            f1 += 2 * (precision * recall) / (precision + recall)

    exact_match_score = exact_match / total * 100
    f1_score_value = f1 / total * 100

    print(f"Exact Match: {exact_match_score:.2f}%")
    print(f"F1 Score: {f1_score_value:.2f}%")
    return exact_match_score, f1_score_value

# Call the evaluate function
exact_match_score, f1_score_value = evaluate_model(model, tokenizer, val_texts, val_contexts, val_answers, val_starts)


Exact Match: 19.00%
F1 Score: 25.22%


## Summary

* We trained a BERT-based model for question answering using a subset of 2,000 data points from the SQuAD dataset.
* The data was preprocessed to extract relevant question-answer pairs:
  * We tokenized both the questions and contexts using the BertTokenizerFast
  * For training, we used TensorFlow's TFBertForQuestionAnswering model and compiled it with an Adam optimizer and a sparse categorical cross-entropy loss function.
* After training for 5 epochs, we evaluated the model on the validation set using Exact Match (EM) and F1 score metrics.

The results on the validation set were as follows:

* **Exact Match (EM)**: 19.00%
* **F1 Score:** 25.22%


### Analysis of Results:
The Exact Match (EM) score of 19.00% and F1 score of 25.22% are relatively low, which suggests that the model struggles to correctly identify the precise span of the answers.

There could be several potential reasons for this:

* **Data Size:** We trained the model on a small subset of 2,000 data points. BERT models typically require larger datasets to capture the nuances of language and provide accurate predictions.

* **Tokenization Misalignment:** The tokenizer might have difficulties precisely mapping token offsets, especially for longer or more complex answers. Small misalignments in token boundaries can significantly affect the EM score.

* **Model Hyperparameters:** The learning rate and the number of training epochs could have been suboptimal. Additionally, using more advanced learning rate schedules or warmup steps might improve performance.

* **Truncation Issues:** We set truncation to False during tokenization, which could lead to the context being too long for the model’s input size. This could result in the model missing relevant parts of the context, leading to lower accuracy.

### Future Improvements:
To improve the performance of the model, we could consider the following steps:

* **Increase the Dataset Size:** Training on a larger dataset would help the model generalize better and capture more varied patterns in the data.

* **Fine-tuning Hyperparameters:** We can further experiment with different learning rates, batch sizes, and epochs. Additionally, introducing learning rate warmup and weight decay could improve optimization.

* **Tokenization Refinement:** Adjusting the tokenization process, including the use of truncation and padding, might help the model handle long contexts more effectively. We could also explore different strategies for handling token offset mapping more accurately.

* **Data Augmentation:** Using data augmentation techniques to increase the diversity and size of the training dataset could help improve the model’s robustness.

With these refinements, we expect the model's ability to predict accurate answer spans and improve its overall performance on the task.

## Google Collab Utils

In [18]:
import shutil

# Create a zip file of the saved model directory
shutil.make_archive('saved_qa_model', 'zip', './saved_qa_model')

'/content/saved_qa_model.zip'

In [19]:
from google.colab import files

# Download the zip file
files.download('saved_qa_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
!apt-get install -y texlive-xetex texlive-fonts-recommended texlive-plain-generic
!pip install nbconvert

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  dvisvgm fonts-droid-fallback fonts-lato fonts-lmodern fonts-noto-mono fonts-texgyre
  fonts-urw-base35 libapache-pom-java libcommons-logging-java libcommons-parent-java
  libfontbox-java libfontenc1 libgs9 libgs9-common libidn12 libijs-0.35 libjbig2dec0 libkpathsea6
  libpdfbox-java libptexenc1 libruby3.0 libsynctex2 libteckit0 libtexlua53 libtexluajit2 libwoff1
  libzzip-0-13 lmodern poppler-data preview-latex-style rake ruby ruby-net-telnet ruby-rubygems
  ruby-webrick ruby-xmlrpc ruby3.0 rubygems-integration t1utils teckit tex-common tex-gyre
  texlive-base texlive-binaries texlive-latex-base texlive-latex-extra texlive-latex-recommended
  texlive-pictures tipa xfonts-encodings xfonts-utils
Suggested packages:
  fonts-noto fonts-freefont-otf | fonts-freefont-ttf libavalon-framework-java
  libcommons-logging-java-doc libexcalibur-logk

In [21]:
!jupyter nbconvert --to pdf standford_qa_bert.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [None]:
from google.colab import files
files.download('standford_qa_bert.pdf')