
# This notebook is being used to train the fine-tuned SQuAD model

In [None]:
# importing libs
!pip install datasets
!pip install pandas
!pip install numpy
!pip instal torch

ERROR: unknown command "instal" - maybe you meant "install"


In [None]:
# Load the FAQ dataset; the result is kept in `ds` for the cells below.
from datasets import load_dataset

DATASET_ID = "tootooba/SMU_faq_dataset"
ds = load_dataset(DATASET_ID)

In [None]:
# Load the tokenizer and the question-answering model directly
# (more flexible and customizable than a higher-level wrapper).
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Both objects are created from the same checkpoint.
MODEL_CHECKPOINT = "distilbert/distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)




In [None]:
# Prepare the dataset for training:
#   1) pull the question/answer pairs out of the dataset
#   2) tokenize those pairs with the tokenizer loaded in the previous cell

# 1) Only a "train" split is used because the dataset has no test data
#    (visible in the output of the dataset-loading cell).
train_split = ds['train']
questions = train_split['question']
answers = train_split['answer']

# 2) The tokenizer parameters are documented at:
#    https://huggingface.co/docs/transformers/en/main_classes/tokenizer
tokenizer_options = {
    # pad every pair up to the longest one so all rows have the same length
    "padding": "longest",
    # truncation keeps inputs from growing beyond max_length
    "truncation": True,
    "max_length": 512,
    # "pt" -> PyTorch tensors (tensors are a generalisation of matrices)
    "return_tensors": "pt",
}
tokenized_data = tokenizer(questions, answers, **tokenizer_options)

In [None]:
# Fine-tune the question-answering model on the FAQ pairs.
#
# The data only provides `questions` and `answers` (no separate context
# column), so each answer text is used as the context of its question -- the
# same (question, answer) pairing the prediction cell feeds to the model -- and
# the gold span is the whole context.
#
# Previously the labels came from `question.find(answer)`: a *character* offset
# searched in the *question*. Whenever the answer text is not a substring of
# the question this produced the fallback label (0, 0), i.e. the [CLS] token,
# so the model was taught to always answer "[CLS]".
#
# NOTE(review): `training_args`, `data_collator` and `compute_metrics` are not
# defined anywhere in the visible notebook. They must be created in an earlier
# cell, otherwise this cell fails on a freshly restarted kernel.
import torch
from datasets import Dataset
from transformers import Trainer

# Same limit as the earlier tokenization cell; the previous value of 1024 is
# longer than the inputs a DistilBERT checkpoint accepts.
MAX_SEQ_LENGTH = 512

# Step 1: Tokenize (question, context) pairs
# Requires a fast tokenizer, which is what provides `sequence_ids()` below.
tokenized_data = tokenizer(
    questions,
    answers,
    padding=True,
    truncation=True,
    max_length=MAX_SEQ_LENGTH,
    return_tensors="pt"
)

# Step 2: Prepare Labels for Question Answering
# The model expects start/end *token* indices into `input_ids`
# (the end index is inclusive).
start_positions = []
end_positions = []

for row in range(len(questions)):
    # sequence_ids(): None for special/padding tokens, 0 for question tokens,
    # 1 for context tokens.
    context_token_idx = [
        pos
        for pos, seq_id in enumerate(tokenized_data.sequence_ids(row))
        if seq_id == 1
    ]
    if context_token_idx:
        start_positions.append(context_token_idx[0])
        end_positions.append(context_token_idx[-1])
    else:
        # No context token survived (e.g. an empty answer): fall back to
        # position 0, the [CLS] token.
        start_positions.append(0)
        end_positions.append(0)

# Add start and end positions to the tokenized data
tokenized_data['start_positions'] = torch.tensor(start_positions)
tokenized_data['end_positions'] = torch.tensor(end_positions)

# Step 3: Create Dataset from Tokenized Data
print("Preparing dataset for training...")
qa_dataset = Dataset.from_dict({
    "input_ids": tokenized_data['input_ids'],
    "attention_mask": tokenized_data['attention_mask'],
    "start_positions": tokenized_data['start_positions'],
    "end_positions": tokenized_data['end_positions']
})

# Step 4: Set Up Trainer
# NOTE: evaluation reuses the training data because there is no separate test
# set, so the reported metrics measure fit, not generalisation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=qa_dataset,
    eval_dataset=qa_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 5: Train the Model
print("Training the model...")
trainer.train()

# Step 6: Evaluate once after training and print the metrics
print("Evaluating the model...")
metrics = trainer.evaluate()  # runs on `eval_dataset`, i.e. the training data
print(f"Metrics after training: {metrics}")


Preparing dataset for training...
Training the model...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0307,0.0,1.0


Evaluating the model...


Metrics after training: {'eval_loss': 0.0, 'eval_accuracy': 1.0, 'eval_runtime': 28.2434, 'eval_samples_per_second': 8.498, 'eval_steps_per_second': 1.062, 'epoch': 3.0}


In [None]:
# Sanity check: report how many examples `qa_dataset` holds.
dataset_size = len(qa_dataset)
print(f"Length of training dataset: {dataset_size}")


Length of training dataset: 240


In [None]:
# Qualitative check: run the model on the first few dataset questions and print
# the span it extracts from the paired answer text (used here as the context).
from itertools import islice

import torch

NUM_EXAMPLES_TO_SHOW = 5   # keep the printed output short
MAX_INPUT_TOKENS = 512     # same limit as the tokenization cell above

# Extract original questions and answers from the dataset
original_questions = ds['train']['question']
original_contexts = ds['train']['answer']

# Inference mode: disables dropout so predictions are deterministic.
model.eval()

preview_pairs = islice(zip(original_questions, original_contexts), NUM_EXAMPLES_TO_SHOW)
for i, (question, context) in enumerate(preview_pairs):
    # Tokenize the question and context. Truncation keeps long pairs within the
    # model's input limit; `.to(model.device)` puts the tensors on the same
    # device as the model (it may be on a GPU after training).
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
        return_tensors="pt"
    ).to(model.device)

    # Get predictions from the model (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Most likely start and end token of the answer, as plain ints for slicing
    start_index = int(torch.argmax(start_logits))
    end_index = int(torch.argmax(end_logits))

    # Convert tokens to the answer text (the end index is inclusive).
    # If end_index < start_index the slice is empty and so is the answer.
    all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    answer_tokens = all_tokens[start_index:end_index + 1]
    predicted_answer = tokenizer.convert_tokens_to_string(answer_tokens)

    # Print the original question and model prediction
    print(f"Original Question {i+1}: {question}")
    print(f"Predicted Answer: {predicted_answer}\n")


Original Question 1: ACADEMIC CALENDAR: That is it? Why do I need it? Now do I get one?
Predicted Answer: [CLS]

Original Question 2: ATHLETIC EVENTS: There can I get tickets? Now much do they cost? Are discount available? Now do I find more information?
Predicted Answer: [CLS]

Original Question 3: BUS PASS: There do I get a pass for Retro Transit? Now much does it cost? Are there special arrangements for students with disabilities? There can I find transit schedules and route maps?
Predicted Answer: [CLS]

Original Question 4: CAMPUS TOURS: Now do I arrange a tour of the camps? Now far in advance do I need to book?
Predicted Answer: [CLS]

Original Question 5: COMPUTER ACCOUNTS: I have received an 'A' number. That is it? Now do I use it? That is the Self-Service Manner? Now do I access it? Now do I get an 's' number? There can I find more information?
Predicted Answer: [CLS]

