## Adding a hardware accelerator

`Edit > Notebook Settings > Hardware accelerator > (GPU)`


Run the following cell to confirm that the GPU is detected.

In [50]:
import numpy as np

In [1]:
import torch

# Abort early when the runtime has no CUDA device attached.
assert torch.cuda.is_available()

# Collect the number of visible GPUs and the name of the current one, then report both.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print("Found device: {}, n_gpu: {}".format(device_name, n_gpu))

Found device: Tesla T4, n_gpu: 1


Setup

In [55]:
#!pip install -q transformers==4.17.0  rich[jupyter]

In [9]:
from google.colab import drive
# Expose Google Drive inside the Colab filesystem.
drive.mount("/content/drive")

Mounted at /content/drive


In [54]:
#!pip install datasets

## Dataset

In [4]:
from datasets import load_dataset

In [6]:
# Fetch the "v1.1.0" configuration of the squad_es dataset.
raw_datasets = load_dataset("squad_es", name="v1.1.0")


Downloading and preparing dataset squad_es/v1.1.0 to /root/.cache/huggingface/datasets/squad_es/v1.1.0/1.1.0/bcada4f600192451443b95e24f609325705c5185b8aad97bffa8bc3784a867ad...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87595 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad_es downloaded and prepared to /root/.cache/huggingface/datasets/squad_es/v1.1.0/1.1.0/bcada4f600192451443b95e24f609325705c5185b8aad97bffa8bc3784a867ad. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Save data to Google Drive

In [10]:
# Write each split to its own CSV file under the project folder on Drive.
# NOTE(review): CSV has no nested types, so the `answers` column presumably
# round-trips as a plain string - verify before relying on the reloaded data.
for split in raw_datasets:
  dataset = raw_datasets[split]
  csv_path = f"drive/MyDrive/ColabData/cse256FinalProject/squad-{split}.csv"
  dataset.to_csv(csv_path, index=None)

Creating CSV from Arrow format:   0%|          | 0/88 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

In [26]:
# Map each split name to the CSV file that holds it on Drive.
data_files = {
    split_name: f"drive/MyDrive/ColabData/cse256FinalProject/squad-{split_name}.csv"
    for split_name in ("train", "validation")
}

# Rebuild a DatasetDict from the CSV files and display its summary.
csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
csv_datasets_reloaded

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ea94f3aa6398cb77/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ea94f3aa6398cb77/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87595
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [27]:
# Training split of the CSV-backed dataset; show (rows, columns).
train_ds = csv_datasets_reloaded["train"]
train_ds.shape

(87595, 5)

In [28]:
# Validation split of the CSV-backed dataset; show (rows, columns).
validation_ds = csv_datasets_reloaded["validation"]
validation_ds.shape

(10570, 5)

Get helper functions from A2 located in Google Drive

We will use `validation_ds`, which contains about 10.5k records, as the source for our train/validation/test split.

In [30]:
# from helpers import tokenize_and_format, flat_accuracy
import pandas as pd

# Load the validation split that was exported to Google Drive as CSV.
df = pd.read_csv('drive/MyDrive/ColabData/cse256FinalProject/squad-validation.csv')

# Keep a 10% subsample (about 1k of the ~10.5k validation rows). The fixed
# random_state makes the draw repeatable, so a restarted kernel works on the
# same rows instead of a fresh random subset each run.
df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)
print("df.shape: ", df.shape)

df.shape:  (1057, 5)


In [18]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor, SquadExample
from torch.utils.data import DataLoader
# from transformers import squad_metrics

# Tokenizer and question-answering model are loaded from the same checkpoint.
qa_checkpoint = "IIC/roberta-base-spanish-sqac"
tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)


In [53]:
#!pip install evaluate

-------------------------

In [21]:
import evaluate

# Load the "squad" evaluation metric.
metric = evaluate.load(path="squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [38]:
#raw_datasets = load_dataset("squad")
#raw_datasets = load_dataset('squad_es', 'v1.1.0')

# Evaluate on the first 100 validation examples only.
validation_split = raw_datasets["validation"]
small_eval_set = validation_split.select(range(100))

In [39]:
# Gold references: one {"id", "answers"} record per evaluation example.
theoretical_answers = []
for ex in small_eval_set:
    theoretical_answers.append({"id": ex["id"], "answers": ex["answers"]})

In [40]:
max_length = 384
stride = 128


def preprocess_validation_examples(examples):
    """Tokenize a batch of question/context pairs into evaluation features.

    Questions are stripped of surrounding whitespace and tokenized together
    with their contexts. Overflowing tokens are returned, so one example can
    yield several features.

    Args:
        examples: Batch mapping with "question", "context" and "id" columns.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed and two
        adjustments: "example_id" names the source example of each feature, and
        "offset_mapping" keeps an offset only at positions whose sequence id
        is 1 (the context, which is passed second); all others become None.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    encoded = tokenizer(
        stripped_questions,
        examples["context"],
        padding="max_length",
        max_length=max_length,
        stride=stride,
        truncation="only_second",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
    )

    # Entry i is the index of the example that feature i was produced from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")

    feature_example_ids = []
    for feature_idx in range(len(encoded["input_ids"])):
        source_idx = feature_to_example[feature_idx]
        feature_example_ids.append(examples["id"][source_idx])

        # Blank out offsets that do not belong to the second sequence so that
        # later span selection can recognise non-context positions by None.
        seq_ids = encoded.sequence_ids(feature_idx)
        masked_offsets = []
        for position, span in enumerate(encoded["offset_mapping"][feature_idx]):
            masked_offsets.append(span if seq_ids[position] == 1 else None)
        encoded["offset_mapping"][feature_idx] = masked_offsets

    encoded["example_id"] = feature_example_ids
    return encoded

In [41]:
# Turn the raw examples into model-ready features. The original columns are
# dropped because one example can map to several features.
columns_to_drop = raw_datasets["validation"].column_names
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    remove_columns=columns_to_drop,
    batched=True,
)
eval_set

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 100
})

In [42]:
# example_to_features
import collections

# Index: example id -> positions of all features produced from that example.
example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    owner_id = feature["example_id"]
    example_to_features[owner_id].append(idx)

In [47]:
import torch
from transformers import AutoModelForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering


# Drop the bookkeeping columns before the forward pass; they are used by the
# post-processing cells, not by the model.
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Every remaining column becomes one tensor on the target device, so the whole
# evaluation subset goes through the model as a single batch.
batch = {}
for column_name in eval_set_for_model.column_names:
    batch[column_name] = eval_set_for_model[column_name].to(device)

model = RobertaForQuestionAnswering.from_pretrained("IIC/roberta-base-spanish-sqac").to(device)

# Inference only: no gradient tracking.
with torch.no_grad():
    outputs = model(**batch)

In [48]:
# Bring both logit tensors back to the CPU as NumPy arrays for post-processing.
start_logits, end_logits = (
    logits.cpu().numpy()
    for logits in (outputs.start_logits, outputs.end_logits)
)

In [51]:
# Get predicted answers: for every example, pick the highest-scoring valid
# answer span across all of its features.
n_best = 20
max_answer_length = 30
predicted_answers = []

# Fetch the offsets column once. Accessing eval_set["offset_mapping"] inside
# the loop repeats the column lookup for every feature, which (depending on the
# installed `datasets` version) can decode the entire column each time.
all_offsets = eval_set["offset_mapping"]

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = all_offsets[feature_index]

        # Token positions of the n_best largest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                # (offsets outside the context were replaced by None).
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        # Character span from the start token's first char to
                        # the end token's last char.
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    if answers:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
    else:
        # No candidate span survived the filters. Emit an empty prediction
        # instead of letting max() raise on an empty list, so every example
        # still has exactly one prediction.
        predicted_answers.append({"id": example_id, "prediction_text": ""})

In [52]:
# Score the predictions against the gold answers with the loaded "squad" metric.
metric.compute(references=theoretical_answers, predictions=predicted_answers)

{'exact_match': 41.0, 'f1': 61.1333333333333}

-----------------------------------


## Model:

In [None]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch

In [None]:

# Tokenizer and QA model come from the same checkpoint.
checkpoint = "IIC/roberta-base-spanish-sqac"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForQuestionAnswering.from_pretrained(checkpoint)

# One hand-written question/context pair, encoded as PyTorch tensors.
question = "Quién es el padre de Luke Skywalker?"
text = "En la famosa película, Darth Veider le dice a Luke Skywalker aquella frase que todos recordamos: yo soy tu padre."
inputs = tokenizer(question, text, return_tensors="pt")

# Target span given as token indices; supplying it makes the model return a loss.
# NOTE(review): 1 and 3 look like placeholder labels rather than the true
# answer span for this pair - confirm.
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])

outputs = model(
    start_positions=start_positions,
    end_positions=end_positions,
    **inputs,
)
loss = outputs.loss
start_scores, end_scores = outputs.start_logits, outputs.end_logits

In [None]:
from transformers import AdamW, BertConfig

# Training hyper-parameters.
epochs = 5
batch_size = 40  # 99 (presumably an alternative value the author tried)

# NOTE(review): the AdamW exported by `transformers` is deprecated in favour of
# torch.optim.AdamW - confirm against the installed transformers version.
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,            # args.learning_rate - default is 5e-5
    eps=1e-8,           # args.adam_epsilon  - default is 1e-8
    weight_decay=1e-3,
)

Evaluate Base model on test set

In [None]:
import numpy as np
# function to get validation accuracy
def get_validation_performance(val_set):
    """Return the model's accuracy on ``val_set``.

    Uses the module-level ``model`` and ``batch_size``. The model is switched
    to evaluation mode and left in that mode.

    Args:
        val_set: Sliceable sequence of ``(input_ids, attention_mask, label)``
            tensor triples; consecutive slices of ``batch_size`` examples are
            stacked into batches.

    Returns:
        Fraction of examples whose arg-max logit equals the label, or ``0.0``
        when ``val_set`` is empty.

    NOTE(review): the forward call passes ``labels=`` and reads ``.logits``,
    i.e. it expects a classification-style model. The question-answering model
    used elsewhere in this notebook is called with ``start_positions`` /
    ``end_positions`` and exposes ``start_logits`` / ``end_logits``, so it
    presumably cannot be scored with this function as-is - confirm.
    """
    # Put the model in evaluation mode
    model.eval()

    num_examples = len(val_set)
    if num_examples == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    # Run on whichever device currently holds the model's weights, so the same
    # code works whether or not the model has been moved to the GPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    total_correct = 0

    # Stepping by batch_size never yields an empty trailing batch.
    for batch_start in range(0, num_examples, batch_size):
        batch = val_set[batch_start:batch_start + batch_size]

        b_input_ids = torch.stack([data[0] for data in batch]).to(target_device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(target_device)
        b_labels = torch.stack([data[2] for data in batch]).to(target_device)

        # No compute graph is needed for the forward pass during evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        # Move logits and labels to CPU for the NumPy comparison.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Count the correctly labelled examples in this batch.
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        total_correct += np.sum(pred_flat == labels_flat)

    # Report the final accuracy for this validation run.
    return total_correct / num_examples



In [None]:
# NOTE(review): `val_set` is not assigned in any cell visible above - presumably
# it came from a deleted or out-of-order cell; confirm it exists on a fresh kernel.
val_set.shape

# Fine-tune model


In [None]:
import random

# Training loop: fine-tune `model` for `epochs` passes over `train_set`, printing
# the summed training loss and the validation accuracy after every pass.
# NOTE(review): `train_set` and `val_set` are not assigned in any cell visible
# above; each item is indexed as (input_ids, attention_mask, label) - confirm.
# NOTE(review): the forward call passes `labels=`, which presumably does not
# match the question-answering model used elsewhere in this notebook (called
# with start_positions / end_positions) - confirm.

# Run on whichever device currently holds the model's weights, so the loop
# works whether or not the model has been moved to the GPU.
first_param = next(model.parameters(), None)
train_device = first_param.device if first_param is not None else torch.device("cpu")

# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode (validation below switches it to eval mode).
    model.train()

    # For each batch of training data; stepping by batch_size never yields an
    # empty trailing batch.
    for batch_start in range(0, len(train_set), batch_size):
        batch = train_set[batch_start:batch_start + batch_size]

        b_input_ids = torch.stack([data[0] for data in batch]).to(train_device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(train_device)
        b_labels = torch.stack([data[2] for data in batch]).to(train_device)

        # Clear the previously calculated gradient
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")

print("")
print("Training complete!")
