In [1]:
from datasets import load_dataset

dataset = load_dataset('hotpot_qa', 'fullwiki')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 90447
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
})

In [3]:
dataset['train']['supporting_facts'][500]

{'title': ['Swannanoa River', 'Hurricane Ivan'], 'sent_id': [5, 2]}

In [4]:
dataset = dataset.remove_columns(['id', 'type', 'level'])

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'supporting_facts', 'context'],
        num_rows: 90447
    })
    validation: Dataset({
        features: ['question', 'answer', 'supporting_facts', 'context'],
        num_rows: 7405
    })
    test: Dataset({
        features: ['question', 'answer', 'supporting_facts', 'context'],
        num_rows: 7405
    })
})

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-cased-distilled-squad')

In [7]:
def preprocess(batch):
    questions = batch["question"]

    contexts = []
    answers_text = []
    for ctx, ans in zip(batch["context"], batch["answer"]):
        # Flatten list of sentences into a single string
        sentences = [s for group in ctx["sentences"] for s in group]
        context = " ".join(sentences)
        contexts.append(context)

        # If answer is None, replace with empty string
        if ans is None:
            answers_text.append("")
        else:
            answers_text.append(ans)

    # Tokenize
    tokenized = tokenizer(
        questions,
        contexts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

    # Compute start/end positions
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized["offset_mapping"]):
        context = contexts[i]
        answer = answers_text[i]

        if not answer:
            # No answer, set to CLS token
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Find character span of answer in context
        start_char = context.lower().find(answer.lower())
        if start_char == -1:
            # Answer not found, fallback to CLS token
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_char = start_char + len(answer)

        # Locate token start/end
        token_start = 0
        token_end = 0
        for idx, (s, e) in enumerate(offsets):
            if s <= start_char < e:
                token_start = idx
            if s < end_char <= e:
                token_end = idx

        start_positions.append(token_start)
        end_positions.append(token_end)

    # Add to tokenized output
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    # Remove offsets to save memory
    tokenized.pop("offset_mapping")

    return tokenized


In [8]:
tokenized = dataset.map(preprocess, batched=True ,remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/90447 [00:00<?, ? examples/s]

Map:   0%|          | 0/7405 [00:00<?, ? examples/s]

Map:   0%|          | 0/7405 [00:00<?, ? examples/s]

In [9]:
tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 90447
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 7405
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 7405
    })
})

In [10]:
max(tokenized['validation']['start_positions'])

510

In [11]:
max(tokenized['train']['start_positions'])

510

In [12]:
max(tokenized['test']['start_positions'])

0

In [13]:
import numpy as np

labels = np.array(tokenized['train']["start_positions"])
invalid = np.sum(labels == 0)
print(f"Invalid spans: {invalid}/{len(labels)}")

Invalid spans: 37360/90447


In [14]:
import numpy as np

labels = np.array(tokenized['validation']["start_positions"])
invalid = np.sum(labels == 0)
print(f"Invalid spans: {invalid}/{len(labels)}")

Invalid spans: 4839/7405


In [15]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [16]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained('distilbert/distilbert-base-cased-distilled-squad')

In [16]:
training_args = TrainingArguments(
    output_dir="QA_model",
    learning_rate=2e-7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,2.3427,1.880345
2,2.2477,1.783583
3,2.185,1.727906
4,2.1097,1.71524
5,2.1193,1.713713


TrainOutput(global_step=28265, training_loss=2.2552411006358626, metrics={'train_runtime': 4697.6381, 'train_samples_per_second': 96.269, 'train_steps_per_second': 6.017, 'total_flos': 5.908590469297152e+16, 'train_loss': 2.2552411006358626, 'epoch': 5.0})

In [17]:
#import shutil
from google.colab import files

# Path to the checkpoint directory
checkpoint_dir = "QA_model/checkpoint-28265"

# Create a zip archive of the checkpoint directory
shutil.make_archive("checkpoint-28265", 'zip', checkpoint_dir)

# Download the zip file
files.download("checkpoint-28265.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
# Save the trained model separately
output_model_dir = "my_trained_qa_model"
model.save_pretrained(output_model_dir)

print(f"Trained model saved to: {output_model_dir}")

Trained model saved to: my_trained_qa_model
