In [13]:
import json
import os
import pickle
import sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from transformers import BertConfig, BertTokenizer, TFBertModel
from collections import defaultdict, Counter

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf


tf.get_logger().setLevel("INFO")

In [14]:
script_path = Path.cwd()
script_path

PosixPath('/tf/notebooks/w266/Final_Project/266_final_project_summer_2023/models')

In [15]:
if "__file__" in globals():
    script_path = Path(__file__).parent.absolute()
else:
    script_path = Path.cwd()

# setup for multi-gpu training
mirrored_strategy = tf.distribute.MirroredStrategy()
checkpoint_dir = script_path.joinpath("training_checkpoints")
checkpoint_fullpath = checkpoint_dir.joinpath("ckpt_{epoch:04d}.ckpt")

# load pkl file
# print("Loading dev_examples.pkl")
# dev_example_path = script_path.joinpath("dev_examples.pkl")
# dev_examples = joblib.load(dev_example_path, pickle.HIGHEST_PROTOCOL

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [16]:
# Load dataset from cache
print("Loading squadv2_train_tf")
tf_dataset_path = script_path.joinpath(
    "squadv2_train_tf"
)
ds_train = tf.data.Dataset.load(str(tf_dataset_path))
ds_train = ds_train.cache()
ds_train = ds_train.prefetch(tf.data.AUTOTUNE)

max_seq_length = 386

bert_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

Loading squadv2_train_tf


In [17]:
def create_bert_qa_model(
    MODEL_NAME="bert-large-uncased",
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
):
    with mirrored_strategy.scope():
        bert_config = BertConfig.from_pretrained(
            MODEL_NAME,
            output_hidden_states=True,
        )

        bert_model = TFBertModel.from_pretrained(MODEL_NAME, config=bert_config)

        input_ids = tf.keras.layers.Input(
            shape=(max_seq_length,), dtype=tf.int64, name="input_ids"
        )
        attention_mask = tf.keras.layers.Input(
            shape=(max_seq_length,), dtype=tf.int64, name="input_masks"
        )
        token_type_ids = tf.keras.layers.Input(
            shape=(max_seq_length,), dtype=tf.int64, name="token_type_ids"
        )

        bert_inputs = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask,
        }

        sequence_embeddings = bert_model(bert_inputs).last_hidden_state

        logits = tf.keras.layers.Dense(2, name="logits")(sequence_embeddings)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        softmax_start_logits = tf.keras.layers.Softmax()(start_logits)
        softmax_end_logits = tf.keras.layers.Softmax()(end_logits)

        # Need to do argmax after softmax to get most likely index
        bert_qa_model = tf.keras.Model(
            inputs=[input_ids, token_type_ids, attention_mask],
            outputs=[softmax_start_logits, softmax_end_logits],
        )

        bert_qa_model.trainable = True

        bert_qa_model.compile(
            optimizer=optimizer,
            loss=[
                tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            ],
            metrics=[
                tf.keras.metrics.SparseCategoricalAccuracy(name="start_accuracy"),
                tf.keras.metrics.SparseCategoricalAccuracy(name="end_accuracy"),
            ],
        )
    return bert_qa_model

def return_prediction_string(bert_tokenizer, input_ids, predictions):
    pass

def combine_bert_subwords(bert_tokenizer, input_ids, predictions):
    all_predictions = []
    for x in range(len(predictions[0])):
        answer = ""
        token_list = bert_tokenizer.convert_ids_to_tokens(
            input_ids[x][
                np.argmax(predictions[0][x]) : np.argmax(predictions[1][x]) + 1
            ]
        )
        if len(token_list) == 0:
            answer = ""
        elif token_list[0] == "[CLS]":
            answer = ""
        else:
            for i, token in enumerate(token_list):
                if token.startswith("##"):
                    answer += token[2:]
                else:
                    if i != 0:
                        answer += " "
                    answer += token
        all_predictions.append(answer)
    return all_predictions

In [18]:

print("Prepare data...")
# sample dataset for predictions
# samples = ds_train.take(ds_train.cardinality().numpy())
samples = ds_train.take(1000)
input_ids = []
input_ids = []
token_type_ids = []
attention_mask = []
impossible = []
qas_id = []
start_positions = []
end_positions = []

for sample in samples:
    input_ids.append(sample[0]["input_ids"])
    token_type_ids.append(sample[0]["token_type_ids"])
    attention_mask.append(sample[0]["attention_mask"])
    impossible.append(sample[1]["is_impossible"].numpy())
    qas_id.append(sample[0]["qas_id"].numpy().decode("utf-8"))
    start_positions.append(sample[1]["start_positions"])
    end_positions.append(sample[1]["end_positions"])

input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int64)
token_type_ids = tf.convert_to_tensor(token_type_ids, dtype=tf.int64)
attention_mask = tf.convert_to_tensor(attention_mask, dtype=tf.int64)
start_positions = tf.convert_to_tensor(start_positions, dtype=tf.int64)
end_positions = tf.convert_to_tensor(end_positions, dtype=tf.int64)

Prepare data...


2023-07-08 01:57:29.336826: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [19]:
input_ids[0]

<tf.Tensor: shape=(386,), dtype=int64, numpy=
array([  101,  2043,  2106, 20773,  2707,  3352,  2759,  1029,   102,
       20773, 21025, 19358, 22815,  1011,  5708,  1006,  1013, 12170,
       23432, 29715,  3501, 29678, 12325, 29685,  1013, 10506,  1011,
       10930,  2078,  1011,  2360,  1007,  1006,  2141,  2244,  1018,
        1010,  3261,  1007,  2003,  2019,  2137,  3220,  1010,  6009,
        1010,  2501,  3135,  1998,  3883,  1012,  2141,  1998,  2992,
        1999,  5395,  1010,  3146,  1010,  2016,  2864,  1999,  2536,
        4823,  1998,  5613,  6479,  2004,  1037,  2775,  1010,  1998,
        3123,  2000,  4476,  1999,  1996,  2397,  4134,  2004,  2599,
        3220,  1997,  1054,  1004,  1038,  2611,  1011,  2177, 10461,
        1005,  1055,  2775,  1012,  3266,  2011,  2014,  2269,  1010,
       25436, 22815,  1010,  1996,  2177,  2150,  2028,  1997,  1996,
        2088,  1005,  1055,  2190,  1011,  4855,  2611,  2967,  1997,
        2035,  2051,  1012,  2037, 14221,  2

In [None]:
# Change optimizer based on
# https://www.tensorflow.org/tfmodels/nlp/fine_tune_bert
# https://arxiv.org/pdf/1810.04805.pdf
epochs = 1
batch_size = 48
steps_per_epoch = len(input_ids) // batch_size
num_train_steps = steps_per_epoch * epochs
warmup_steps = num_train_steps // 10
initial_learning_rate = 5e-5

optimizer = tf.keras.optimizers.experimental.AdamW(learning_rate=initial_learning_rate)

bert_qa_model = create_bert_qa_model(optimizer=optimizer)
# tf.keras.utils.plot_model(bert_qa_model, show_shapes=True)
# bert_qa_model.summary()

history = bert_qa_model.fit(
    [input_ids, token_type_ids, attention_mask],
    [start_positions, end_positions],
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_fullpath,
            verbose=1,
            save_weights_only=True,
            save_freq="epoch",
        ),
    ],
)

joblib.dump(
    history,
    "bert-model-train-history.pkl",
    compress=False,
    protocol=pickle.HIGHEST_PROTOCOL,
)

# bert_qa_model.save_weights("backupsaveend.h5")

exit()

In [None]:
print("Execute predictions...")
new_predictions = bert_qa_model.predict([input_ids, token_type_ids, attention_mask])


print("Done with Predictions...")
new_answers = combine_bert_subwords(bert_tokenizer, input_ids, new_predictions)

print("Calculate probabilities for split answers...")
probabilities = []
for i, prediction in enumerate(new_predictions[0]):
    probabilities.append(
        np.amax(new_predictions[0][i]) * np.amax(new_predictions[1][i])
    )

print("Choose best answer for split answers...")


# duplicate_ids = [ x for x,  count in collections.Counter(qas_id).items() if count > 1]
def list_duplicates(seq):
    tally = defaultdict(list)
    for i, item in enumerate(seq):
        tally[item].append(i)
    return ((key, locs) for key, locs in tally.items() if len(locs) > 1)


duplicate_ids = sorted(list_duplicates(qas_id))

scoring_dict = {}
for d in duplicate_ids:
    maxp = None
    for i in d[1]:
        if maxp == None or probabilities[i] > maxp:
            maxp = probabilities[i]
            maxindex = i
    scoring_dict[qas_id[maxindex]] = new_answers[maxindex]
    print(f"{scoring_dict[qas_id[maxindex]]} {maxp}")
for i, q in enumerate(new_answers):
    if qas_id[i] not in scoring_dict:
        scoring_dict[qas_id[i]] = q

# diagnose impossible questions Highly inefficient
for i, q in enumerate(qas_id):
    answer = ""
    question = ""
    for t in train_examples:
        if t.qas_id == qas_id[i]:
            answer = t.answer_text
            question = t.question_text
            break
    if impossible[i] == 1:
        print(f"Index: {i}")
        print(f"QAS_ID: {qas_id[i]}")
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        print(f"Prediction: {new_answers[i]}")
        print(80 * "-")

with open("scoring_dict.json", "w", encoding="utf-8") as f:
    json.dump(scoring_dict, f, ensure_ascii=False, indent=4)
print("Wrote scoring_dict.json")
