## Adding a hardware accelerator

`Edit > Notebook Settings > Hardware accelerator > (GPU)`


Run the following cell to confirm that the GPU is detected.

In [111]:
import numpy as np

In [2]:
import torch

# Confirm that the GPU is detected
assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: Tesla T4, n_gpu: 1


Setup

In [3]:
!pip install -q transformers==4.17.0  rich[jupyter]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m93.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collec

## Dataset

In [6]:
from datasets import load_dataset

In [7]:
raw_datasets = load_dataset('squad_es', 'v1.1.0')


Downloading builder script:   0%|          | 0.00/5.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.92k [00:00<?, ?B/s]

Downloading and preparing dataset squad_es/v1.1.0 to /root/.cache/huggingface/datasets/squad_es/v1.1.0/1.1.0/bcada4f600192451443b95e24f609325705c5185b8aad97bffa8bc3784a867ad...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87595 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad_es downloaded and prepared to /root/.cache/huggingface/datasets/squad_es/v1.1.0/1.1.0/bcada4f600192451443b95e24f609325705c5185b8aad97bffa8bc3784a867ad. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
raw_datasets["train"]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87595
})

In [9]:
raw_datasets["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'Universidad _ de _ Notre _ Dame',
 'context': 'Arquitectónicamente, la escuela tiene un carácter católico. Encima de la cúpula de oro del edificio principal hay una estatua dorada de la Virgen María. Inmediatamente delante del edificio principal y frente a él, hay una estatua de cobre de Cristo con los brazos levantados con la leyenda "Venite Ad Me Omnes". Junto al edificio principal está la Basílica del Sagrado Corazón. Inmediatamente detrás de la basílica está la Gruta, un lugar mariano de oración y reflexión. Es una réplica de la gruta de Lourdes, Francia, donde la Virgen María supuestamente se le apareció a Santa Bernadette Soubirous en 1858. Al final de la unidad principal (y en una línea directa que se conecta a través de 3 estatuas y la Cúpula de Oro), hay una simple y moderna estatua de piedra de María.',
 'question': '¿A quién acudió la Virgen María supuestamente en 1858 en Lourdes France?',
 'answers': {'text': ['Santa Bernadette 

In [5]:
raw_datasets_squad = load_dataset('squad')
raw_datasets_squad["train"]

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [None]:
raw_datasets_squad["train"][0]

Save data to Google Drive

In [None]:
for split, dataset in raw_datasets.items():
  dataset.to_csv(f"drive/MyDrive/ColabData/cse256FinalProject/squad-{split}.csv", index=None)

Creating CSV from Arrow format:   0%|          | 0/88 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
data_files = {
    "train": "drive/MyDrive/ColabData/cse256FinalProject/squad-train.csv",
    "validation": "drive/MyDrive/ColabData/cse256FinalProject/squad-validation.csv",
}

csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
csv_datasets_reloaded

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-fb10bcc7ece87f5b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-fb10bcc7ece87f5b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87595
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
train_ds = csv_datasets_reloaded['train']
train_ds.shape

(87595, 5)

In [None]:
validation_ds = csv_datasets_reloaded['validation']
validation_ds.shape

(10570, 5)

Get helper functions from A2 located in Google Drive

We will use the validation_ds which contains 10.5k records for our train,val, test split.

In [None]:
# from helpers import tokenize_and_format, flat_accuracy
import pandas as pd

df = pd.read_csv('drive/MyDrive/ColabData/cse256FinalProject/squad-validation.csv')

df = df.sample(frac=0.1).reset_index(drop=True) #10% ~ 3k
print("df.shape: ", df.shape)

df.shape:  (1057, 5)


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor, SquadExample
from torch.utils.data import DataLoader
# from transformers import squad_metrics

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("IIC/roberta-base-spanish-sqac")
model = AutoModelForQuestionAnswering.from_pretrained("IIC/roberta-base-spanish-sqac")


In [None]:
#!pip install evaluate

-------------------------

In [None]:
import evaluate

metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [None]:
#raw_datasets = load_dataset("squad")
#raw_datasets = load_dataset('squad_es', 'v1.1.0')

small_eval_set = raw_datasets["validation"].select(range(100))

In [None]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]

In [None]:
max_length = 384
stride = 128

def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [None]:
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
eval_set

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 100
})

In [None]:
# example_to_features
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering


eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

model = RobertaForQuestionAnswering.from_pretrained("IIC/roberta-base-spanish-sqac").to(device)


with torch.no_grad():
    outputs = model(**batch)

In [None]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [None]:

with torch.no_grad():
    outputs = model(**batch)

start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [None]:
# get predicted answers
n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [None]:
metric.compute(predictions=predicted_answers, references=theoretical_answers)

{'exact_match': 41.0, 'f1': 61.1333333333333}

-----------------------------------


## Model:

In [None]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch

In [None]:

tokenizer = RobertaTokenizer.from_pretrained("IIC/roberta-base-spanish-sqac")
model = RobertaForQuestionAnswering.from_pretrained("IIC/roberta-base-spanish-sqac")

question, text = "Quién es el padre de Luke Skywalker?", "En la famosa película, Darth Veider le dice a Luke Skywalker aquella frase que todos recordamos: yo soy tu padre."
inputs = tokenizer(question, text, return_tensors="pt")
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])

outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Fine-tune model


In [2]:
!pip install -q transformers==4.17.0  rich[jupyter]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collec

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import torch

# Confirm that the GPU is detected
assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: Tesla T4, n_gpu: 1


Load annotated data into dataset

In [6]:
from datasets import load_dataset

In [59]:

# data_files = {"retrain": "drive/MyDrive/ColabData/cse256FinalProject/answers.json"}
# train_set = load_dataset("json", data_files=data_files, split="retrain")

In [60]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [61]:
df = pd.read_csv('drive/MyDrive/ColabData/cse256FinalProject/answers.csv')
# annotated_data = Dataset.from_pandas(df)


In [62]:

train_percentage = 0.8
val_percentage = 0.1
test_percentage = 0.1

# Split the dataset
train_dataset, temp_dataset = train_test_split(df, train_size=train_percentage, random_state=1)
val_dataset, test_dataset = train_test_split(temp_dataset, train_size=val_percentage/(val_percentage+test_percentage), random_state=1)


In [63]:
val_dataset = temp_dataset

In [64]:
print(train_dataset.shape, val_dataset.shape, test_dataset.shape)

(6, 10) (2, 10) (1, 10)


In [65]:
train_dataset

Unnamed: 0,answer_id,document_id,question_id,text,answer_start,answer_end,answer_category,question,file_name,context
1,916254,1521709,1031492,veintitrés (23) de mayo de dos mil veintitrés ...,13.0,65.0,,En que ciudad se radico la sentencia?,,"Bogotá D.C., veintitrés (23) de mayo de dos mi..."
6,916262,1521711,1031494,El Juzgado Treinta Laboral del Circuito de Bogotá,44.0,93.0,,Que juzgado resolvio la sentencia de primera i...,,I. SENTENCIA DE PRIMERA INSTANCIA El...
0,916327,1521714,1031511,"CONDENAR a la FIDUCIARIA LA PREVISORA, en su ...",157.0,372.0,,Como fue condenada la FIDUCIARIA LA PREVISORA?,,I. SENTENCIA DE PRIMERA INSTANCIA El...
4,916258,1521710,1031489,"JOSÉ DANIEL BUITRAGO VEGA, y la FEDERACIÓN NAC...",57.0,133.0,,Quien interpuso la sentencia?,,Decide la Sala los recursos de casación interp...
3,916257,1521710,1031492,Bogotá,255.0,261.0,,En que ciudad se radico la sentencia?,,Decide la Sala los recursos de casación interp...
5,916259,1521710,1031493,,,,NOT_GIVEN,Cuando se radico la sentencia?,,Decide la Sala los recursos de casación interp...


In [66]:
#train_dataset = Dataset.from_pandas(train_dataset)
#val_dataset = Dataset.from_pandas(val_dataset)
#test_dataset = Dataset.from_pandas(test_dataset)


In [67]:
train_dataset

Unnamed: 0,answer_id,document_id,question_id,text,answer_start,answer_end,answer_category,question,file_name,context
1,916254,1521709,1031492,veintitrés (23) de mayo de dos mil veintitrés ...,13.0,65.0,,En que ciudad se radico la sentencia?,,"Bogotá D.C., veintitrés (23) de mayo de dos mi..."
6,916262,1521711,1031494,El Juzgado Treinta Laboral del Circuito de Bogotá,44.0,93.0,,Que juzgado resolvio la sentencia de primera i...,,I. SENTENCIA DE PRIMERA INSTANCIA El...
0,916327,1521714,1031511,"CONDENAR a la FIDUCIARIA LA PREVISORA, en su ...",157.0,372.0,,Como fue condenada la FIDUCIARIA LA PREVISORA?,,I. SENTENCIA DE PRIMERA INSTANCIA El...
4,916258,1521710,1031489,"JOSÉ DANIEL BUITRAGO VEGA, y la FEDERACIÓN NAC...",57.0,133.0,,Quien interpuso la sentencia?,,Decide la Sala los recursos de casación interp...
3,916257,1521710,1031492,Bogotá,255.0,261.0,,En que ciudad se radico la sentencia?,,Decide la Sala los recursos de casación interp...
5,916259,1521710,1031493,,,,NOT_GIVEN,Cuando se radico la sentencia?,,Decide la Sala los recursos de casación interp...


In [68]:
import math
data = []
# assuming train_dataset is a dataframe
for _, record in train_dataset.iterrows():
    # example = {
    #     'text': row['text'],
    #     'label': row['label']
    # }

    record_output = dict()
    if record['text'] and not record['text'] != record['text'] : # not nan
      # print('text: ', record['text'])
      # for x in ['question_id','document_id', 'context', 'question','text']:
      #   assert x in record.keys() #, "{} not found in record keys".format(x)
      record_output['id'] = record['question_id']
      record_output['title'] = "doc"+ str(record['document_id'])
      record_output['context'] = record['context']
      record_output['question'] = record['question']
      print(record['text'])
      record_output['answers'] = {'text': [record['text']],\
                                  'answer_start' : [record['answer_start']]}
    data.append(record_output)


veintitrés (23) de mayo de dos mil veintitrés (2023)
El Juzgado Treinta Laboral del Circuito de Bogotá
 CONDENAR a la FIDUCIARIA LA PREVISORA, en su calidad de vocera y administradora del PATRIMONIO AUTÓNOMO PANFLOTA a pagar el cálculo actuarial correspondiente a la ADMINISTRADORA COLOMBIANA DE PENSIONES COLPENSIONES
JOSÉ DANIEL BUITRAGO VEGA, y la FEDERACIÓN NACIONAL DE CAFETEROS DE COLOMBIA
Bogotá


In [69]:
df_train = pd.DataFrame(data).dropna()
df_train


Unnamed: 0,id,title,context,question,answers
0,1031492.0,doc1521709,"Bogotá D.C., veintitrés (23) de mayo de dos mi...",En que ciudad se radico la sentencia?,{'text': ['veintitrés (23) de mayo de dos mil ...
1,1031494.0,doc1521711,I. SENTENCIA DE PRIMERA INSTANCIA El...,Que juzgado resolvio la sentencia de primera i...,{'text': ['El Juzgado Treinta Laboral del Circ...
2,1031511.0,doc1521714,I. SENTENCIA DE PRIMERA INSTANCIA El...,Como fue condenada la FIDUCIARIA LA PREVISORA?,{'text': [' CONDENAR a la FIDUCIARIA LA PREVIS...
3,1031489.0,doc1521710,Decide la Sala los recursos de casación interp...,Quien interpuso la sentencia?,"{'text': ['JOSÉ DANIEL BUITRAGO VEGA, y la FED..."
4,1031492.0,doc1521710,Decide la Sala los recursos de casación interp...,En que ciudad se radico la sentencia?,"{'text': ['Bogotá'], 'answer_start': [255.0]}"


In [70]:
# augmenting data
df_train = pd.concat([df_train] * 10, ignore_index=True)


In [71]:
train_dataset = Dataset.from_pandas(df_train)
train_dataset


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 50
})

In [72]:

# train_dataset = train_dataset.remove_columns('__index_level_0__')


In [73]:
train_dataset[0]

{'id': 1031492.0,
 'title': 'doc1521709',
 'context': 'Bogotá D.C., veintitrés (23) de mayo de dos mil veintitrés (2023).',
 'question': 'En que ciudad se radico la sentencia?',
 'answers': {'answer_start': [13.0],
  'text': ['veintitrés (23) de mayo de dos mil veintitrés (2023)']}}

In [74]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import AutoTokenizer

# tokenizer = RobertaTokenizer.from_pretrained("IIC/roberta-base-spanish-sqac")
# what if I change this?
tokenizer = AutoTokenizer.from_pretrained("IIC/roberta-base-spanish-sqac")

loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/b99f698d7d0bf5d6ebab5986b84276eb5fd79a3b0ba9fe2dff55de225d9f47e5.0b968504b15b2f5e6e9e491723f31987782cb976064aff9a7c6188cc2d5ce8bc
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/4bd923af799a50dd8bc05121278224bb5ff12c6e25d75f2c2537af7e83bfa0dd.0d24ae8bd5fabb1f5020f91bc602cefeb5a2938ab77e21769d28776345634b23
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/cd63054da37a8685cdc12e3530b0a2e35ed7c42cfe67c459b907f6461e7aba17.d58d4af36cc8b1c92a9d7628dc98319ac11a13c89b5f525a1e46443e72cea61b
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/IIC/roberta-base-spanish-sqa

In [75]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [76]:
train_dataset = train_dataset.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_dataset.column_names,
)
train_dataset

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 50
})

------------------

Get Validation

In [78]:
val_dataset

Unnamed: 0,answer_id,document_id,question_id,text,answer_start,answer_end,answer_category,question,file_name,context
7,916266,1521712,1031498,la ADMINISTRADORA DE PENSIONES Y CESANTÍAS POR...,170.0,383.0,,Cual fue la condena en la primera instancia?,,I. SENTENCIA DE PRIMERA INSTANCIA El...
2,916255,1521709,1031489,,,,NOT_GIVEN,Quien interpuso la sentencia?,,"Bogotá D.C., veintitrés (23) de mayo de dos mi..."


Get Train and Validation datasets

In [79]:
import math
data_val = []
# assuming train_dataset is a dataframe
for _, record in val_dataset.iterrows():
    # example = {
    #     'text': row['text'],
    #     'label': row['label']
    # }

    record_output = dict()
    if record['text'] and not record['text'] != record['text'] : # not nan
      # print('text: ', record['text'])
      # for x in ['question_id','document_id', 'context', 'question','text']:
      #   assert x in record.keys() #, "{} not found in record keys".format(x)
      record_output['id'] = record['question_id']
      record_output['title'] = "doc"+ str(record['document_id'])
      record_output['context'] = record['context']
      record_output['question'] = record['question']
      print(record['text'])
      record_output['answers'] = {'text': [record['text']],\
                                  'answer_start' : [int(record['answer_start'])]}
    data_val.append(record_output)


la ADMINISTRADORA DE PENSIONES Y CESANTÍAS PORVENIR S.A. a devolver a la ADMINISTRADORA COLOMBIANA DE PENSIONES COLPENSIONES todos los valores de la cuenta de ahorro individual del señor JOSÉ DANIEL BUITRAGO VEGA,


In [80]:
df_val = pd.DataFrame(data_val).dropna()
df_val

Unnamed: 0,id,title,context,question,answers
0,1031498.0,doc1521712,I. SENTENCIA DE PRIMERA INSTANCIA El...,Cual fue la condena en la primera instancia?,{'text': ['la ADMINISTRADORA DE PENSIONES Y CE...


In [81]:
# augmenting data
df_val = pd.concat([df_val] * 10, ignore_index=True)


In [82]:
val_dataset = Dataset.from_pandas(df_val)
val_dataset


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10
})

In [83]:
# val_dataset = val_dataset.remove_columns('__index_level_0__')
val_dataset[0]

{'id': 1031498.0,
 'title': 'doc1521712',
 'context': 'I.\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 SENTENCIA DE PRIMERA INSTANCIA El Juzgado Treinta Laboral del Circuito de Bogotá, mediante sentencia del 14 de marzo de 2017, resolvió: SEGUNDO: Condénese a la ADMINISTRADORA DE PENSIONES Y CESANTÍAS PORVENIR S.A. a devolver a la ADMINISTRADORA COLOMBIANA DE PENSIONES COLPENSIONES todos los valores de la cuenta de ahorro individual del señor JOSÉ DANIEL BUITRAGO VEGA, junto con sus rendimientos y los costos cobrados por concepto de administración durante todo el tiempo que permaneció en el régimen de ahorro individual con solidaridad, conforme a lo expuesto.',
 'question': 'Cual fue la condena en la primera instancia?',
 'answers': {'answer_start': [170],
  'text': ['la ADMINISTRADORA DE PENSIONES Y CESANTÍAS PORVENIR S.A. a devolver a la ADMINISTRADORA COLOMBIANA DE PENSIONES COLPENSIONES todos los valores de la cuenta de ahorro individual del señor JOSÉ DANIEL BUITRAGO VEGA,']}}

In [84]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [85]:
validation_dataset = val_dataset.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=val_dataset.column_names,
)
validation_dataset

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 10
})

----

Load model

In [86]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [87]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch

tokenizer = RobertaTokenizer.from_pretrained("IIC/roberta-base-spanish-sqac")
model = RobertaForQuestionAnswering.from_pretrained("IIC/roberta-base-spanish-sqac")
#.to(device)
# model = AutoModelForQuestionAnswering.from_pretrained("IIC/roberta-base-spanish-sqac").to(device)



loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/b99f698d7d0bf5d6ebab5986b84276eb5fd79a3b0ba9fe2dff55de225d9f47e5.0b968504b15b2f5e6e9e491723f31987782cb976064aff9a7c6188cc2d5ce8bc
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/4bd923af799a50dd8bc05121278224bb5ff12c6e25d75f2c2537af7e83bfa0dd.0d24ae8bd5fabb1f5020f91bc602cefeb5a2938ab77e21769d28776345634b23
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/f8badbd2366ff035b4b91c687c951b745bee175c63c3a13dc681da51089b284a.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0
loading file https://huggingface.co/IIC/roberta-base-sp

In [88]:
from transformers import TrainingArguments

args = TrainingArguments(
    "roberta-finetuned-squad-v1",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=1e-3,
    fp16=True,
    push_to_hub=False,
    remove_unused_columns=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [89]:
from transformers import Trainer

tokenizer = AutoTokenizer.from_pretrained("IIC/roberta-base-spanish-sqac",truncation=True, padding=True)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/b99f698d7d0bf5d6ebab5986b84276eb5fd79a3b0ba9fe2dff55de225d9f47e5.0b968504b15b2f5e6e9e491723f31987782cb976064aff9a7c6188cc2d5ce8bc
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/4bd923af799a50dd8bc05121278224bb5ff12c6e25d75f2c2537af7e83bfa0dd.0d24ae8bd5fabb1f5020f91bc602cefeb5a2938ab77e21769d28776345634b23
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/cd63054da37a8685cdc12e3530b0a2e35ed7c42cfe67c459b907f6461e7aba17.d58d4af36cc8b1c92a9d7628dc98319ac11a13c89b5f525a1e46443e72cea61b
loading file https://huggingface.co/IIC/roberta-base-spanish-sqac/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/IIC/roberta-base-spanish-sqa

Step,Training Loss


Saving model checkpoint to roberta-finetuned-squad-v1/checkpoint-7
Configuration saved in roberta-finetuned-squad-v1/checkpoint-7/config.json
Model weights saved in roberta-finetuned-squad-v1/checkpoint-7/pytorch_model.bin
tokenizer config file saved in roberta-finetuned-squad-v1/checkpoint-7/tokenizer_config.json
Special tokens file saved in roberta-finetuned-squad-v1/checkpoint-7/special_tokens_map.json
Saving model checkpoint to roberta-finetuned-squad-v1/checkpoint-14
Configuration saved in roberta-finetuned-squad-v1/checkpoint-14/config.json
Model weights saved in roberta-finetuned-squad-v1/checkpoint-14/pytorch_model.bin
tokenizer config file saved in roberta-finetuned-squad-v1/checkpoint-14/tokenizer_config.json
Special tokens file saved in roberta-finetuned-squad-v1/checkpoint-14/special_tokens_map.json
Saving model checkpoint to roberta-finetuned-squad-v1/checkpoint-21
Configuration saved in roberta-finetuned-squad-v1/checkpoint-21/config.json
Model weights saved in roberta-fi

TrainOutput(global_step=21, training_loss=0.9452008746919178, metrics={'train_runtime': 40.5411, 'train_samples_per_second': 3.7, 'train_steps_per_second': 0.518, 'total_flos': 29395885132800.0, 'train_loss': 0.9452008746919178, 'epoch': 3.0})

In [90]:
predictions, _, _ = trainer.predict(validation_dataset)
# start_logits, end_logits = predicted_answers #predictions

The following columns in the test set  don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10
  Batch size = 8


In [91]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [92]:
import evaluate

metric = evaluate.load("squad")

In [93]:
val_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10
})

Get Predictions:

In [103]:
validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 10
})

In [107]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(validation_dataset):
    example_to_features[feature["example_id"]].append(idx)

In [108]:
eval_set_for_model = validation_dataset.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

In [112]:
with torch.no_grad():
    outputs = model(**batch)

start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

# get predicted answers
n_best = 20
max_answer_length = 30
predicted_answers = []

for example in val_dataset:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = validation_dataset["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [113]:
predicted_answers

[{'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'},
 {'id': 1031498.0,
  'prediction_text': 'El Juzgado Treinta Laboral del Circuito de Bogotá'}]

In [118]:
#compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])
theoretical_answers = [
    {"id": str(ex["id"]), "answers": [ex["answers"]]} for ex in val_dataset
]

In [119]:
theoretical_answers[:2]

[{'id': '1031498.0',
  'answers': [{'answer_start': [170],
    'text': ['la ADMINISTRADORA DE PENSIONES Y CESANTÍAS PORVENIR S.A. a devolver a la ADMINISTRADORA COLOMBIANA DE PENSIONES COLPENSIONES todos los valores de la cuenta de ahorro individual del señor JOSÉ DANIEL BUITRAGO VEGA,']}]},
 {'id': '1031498.0',
  'answers': [{'answer_start': [170],
    'text': ['la ADMINISTRADORA DE PENSIONES Y CESANTÍAS PORVENIR S.A. a devolver a la ADMINISTRADORA COLOMBIANA DE PENSIONES COLPENSIONES todos los valores de la cuenta de ahorro individual del señor JOSÉ DANIEL BUITRAGO VEGA,']}]}]

In [120]:
metric.compute(predictions=predicted_answers, references=theoretical_answers)

ValueError: ignored

-------------