## Testing Question Answering Distilbert Model
- https://huggingface.co/distilbert-base-cased-distilled-squad
- https://www.kaggle.com/code/jamesmcguigan/coleridge-huggingface-question-answering

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [2]:
context = r"""Chapter 17.5. Lead and Copper, 
Article 7. Public Education Program for Lead Action Level Exceedances, 
§ 64690.80. Recordkeeping. Any system subject to the requirements of this 
chapter shall retain on its premises original records of all sampling data 
and analyses, reports, surveys, letters, evaluations, schedules, Department 
determinations, and any other information required by this chapter. Each water 
system shall retain the records required by this section for no fewer than 12 
years or two compliance cycles (as defined in Section 64400.20), whichever is longer."""

In [25]:
result = question_answerer(question="what water system?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'Each water 
system', score: 0.0354, start: 400, end: 418


### Finetuning Model

#### Prepare the data

In [19]:
# Prepare the data
questions = ["What are California water regulations?", "How do California water regulations impact agriculture?"]
passages = ["California water regulations are laws that govern how water is used and managed in California.",            "California water regulations can have a significant impact on agriculture, which is a major water user in the state."]
answers = [("laws that govern how water is used and managed", "California water regulations"),           ("have a significant impact on agriculture", "California water regulations")]


#### Train the Model

In [20]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
import torch

# Load the pre-trained model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

inputs = tokenizer(questions, passages, padding=True, truncation=True, return_tensors="pt")
start_positions = torch.tensor([tokenizer.encode(answers[i][0], add_special_tokens=False)[0] for i in range(len(answers))])
end_positions = torch.tensor([tokenizer.encode(answers[i][1], add_special_tokens=False)[-1] for i in range(len(answers))])
inputs.update({'start_positions': start_positions, 'end_positions': end_positions})

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
model.train()
for epoch in range(3):
    optimizer.zero_grad()
    outputs = model(**inputs)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# Save the model
torch.save(model.state_dict(), 'distilbert_model.pt')


In [None]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments, default_data_collator
from transformers.data.processors.squad import SquadExample, SquadV1Processor
import pandas as pd

# Load the dataset into a pandas dataframe
df = pd.read_csv("your_dataset.csv")

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

# Tokenize the data
def tokenize_data(context, question, answer):
    encoded_dict = tokenizer.encode_plus(question, context, return_offsets_mapping=True, max_length=512, padding="max_length", truncation=True)
    start_char = encoded_dict["offset_mapping"][encoded_dict["input_ids"].index(tokenizer.sep_token_id)][0]
    end_char = start_char + len(answer)
    return SquadExample(
        qas_id="",
        question_text=question,
        context_text=context,
        answer_text=answer,
        start_position_character=start_char,
        end_position_character=end_char,
        is_impossible=False
    )

# Convert the data to features
train_examples = []
for row in df.itertuples():
    train_examples.append(tokenize_data(row.context, row.question, row.answer))

processor = SquadV1Processor()
train_features = processor.convert_examples_to_features(train_examples, tokenizer, max_length=512, doc_stride=128, padding="max_length", truncation=True)

# Load the pre-trained model and fine-tune on the data
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_features,
    data_collator=default_data_collator,
    tokenizer=tokenizer
)

trainer.train()

#### Load the Model

In [21]:
# Load the model
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
model.load_state_dict(torch.load('distilbert_model.pt'))

<All keys matched successfully>

#### Evaluate the Model

In [22]:
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    start_preds = torch.argmax(start_logits, dim=1)
    end_preds = torch.argmax(end_logits, dim=1)
    start_acc = (start_preds == start_positions).float().mean()
    end_acc = (end_preds == end_positions).float().mean()
    f1_score = 2 * ((start_acc * end_acc) / (start_acc + end_acc))

print("Start Accuracy:", start_acc.item())
print("End Accuracy:", end_acc.item())
print("F1 Score:", f1_score.item())

Start Accuracy: 0.0
End Accuracy: 0.0
F1 Score: nan


### AWS SageMaker Training Job
This section requires training-job/ml.p3.2xlarge GPU instance, which is not included in our free 2 months subscription. When training on the finalized large dataset, we need to use this instance.

In [23]:
import sagemaker
import boto3

iam_client = boto3.client('iam')
role = iam_client.get_role(RoleName='AmazonSageMaker-ExecutionRole-20230307T225001')['Role']['Arn']
sess = sagemaker.Session()

In [24]:
from sagemaker.huggingface import HuggingFace

hyperparameters = {
	'model_name_or_path':'gpt2',
	'output_dir':'/opt/ml/model'
	# add your remaining hyperparameters
	# more info here https://github.com/huggingface/transformers/tree/v4.17.0/examples/pytorch/language-modeling
}

# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.17.0'}

# creates Hugging Face estimator
huggingface_estimator = HuggingFace(
	entry_point='run_clm.py',
	source_dir='./examples/pytorch/language-modeling',
	instance_type='ml.p3.2xlarge',
	instance_count=1,
	role=role,
	git_config=git_config,
	transformers_version='4.17.0',
	pytorch_version='1.10.2',
	py_version='py38',
	hyperparameters = hyperparameters
)

# starting the train job
huggingface_estimator.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-03-14-23-15-25-396


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The requested resource training-job/ml.p3.2xlarge is not available in this region