# CMT122 Generative Question Answering Training Using Language Models

## READ ME

This file requires an intact environment before running. Please run the first section of the code to verify the integrity of the entire environment before running it for the first time.

In [5]:
try:
    import pandas as pd
    print("Pandas loaded successfully.")
except Exception as e:
    print(f"Issue: {e}")

try:
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    print("Transformers loaded successfully")
except Exception as e:
    print(f"Issue: {e}")

try:
    from datasets import Dataset
    print("Datasets loaded successfully.")
except Exception as e:
    print(f"Issue: {e}")

try:
    import evaluate
    print("Evaluate module loaded and metric initialized successfully.")
except Exception as e:
    print(f"Evaluate module issue: {e}")

print("\nEnvironment check completed.")

✅ pandas loaded successfully.
✅ Transformers loaded successfully
✅ Datasets loaded successfully.
✅ evaluate module loaded and metric initialized successfully.

Environment check completed.


In [1]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import evaluate

# Data preprocessing

Read the data. This time the data will be fixed in UTF-8 encoding to avoid encoding problems that may affect the training input when reading the data later.

In [6]:
#dataset value in utf-8 formate
train_set = pd.read_csv('tweet_qa-train.csv',encoding = "utf-8")
test_set = pd.read_csv('tweet_qa-test.csv',encoding = "utf-8")
validation_set = pd.read_csv('tweet_qa-validation.csv',encoding = "utf-8")

Perform the first step of preprocessing on the overall training values, lowercase all English words to reduce noise

In [7]:
#lower case all letters
columns_to_lowercase = ['text', 'context', 'gold_label_str']

def lowercase(dataset, columns):
    for column in columns:
        if column in dataset.columns:
            dataset[column] = dataset[column].str.lower()
    return dataset

train_set = lowercase(train_set, columns_to_lowercase)
test_set = lowercase(test_set, columns_to_lowercase)
validation_set = lowercase(validation_set, columns_to_lowercase)

Replace the incomprehensible names in the original list with questions and answers

In [4]:
#rename columns according to column_mapping, for easy understanding

def rename_columns(dataset, column_mapping):
    return dataset.rename(columns=column_mapping)

column_mapping = {
    'text': 'text',
    'context': 'question',
    'gold_label_str': 'answer'
}

train_set = rename_columns(train_set, column_mapping)
test_set = rename_columns(test_set, column_mapping)
validation_set = rename_columns(validation_set, column_mapping)

Remove duplicates and missing items to avoid noise contamination of the model

In [5]:
#Clean up columns: remove duplicates, drop missing rows and ensure answers are found in text
def dataset_cleaning(dataset, cleaning_columns):
    dataset = dataset.drop_duplicates(subset=cleaning_columns)
    dataset = dataset.dropna(subset=cleaning_columns)
    return dataset

cleaning_columns = ['text', 'question', 'answer']
train_set_cleaned = dataset_cleaning(train_set, cleaning_columns)
test_set_cleaned = dataset_cleaning(test_set, cleaning_columns)
validation_set_cleaned = dataset_cleaning(validation_set, cleaning_columns)

Function to normalize column lengths to the average length of each column

In [14]:
# Function to normalize column lengths to the average length of each column
def normalization_question_text(dataset, columns, padding_char=" "):
    def adjust_length(value, target_length):
        if len(value) > target_length:
            return value[:target_length]
        else:
            return value + padding_char * (target_length - len(value))
    for column in columns:
        if column in dataset.columns:
            avg_length = int(dataset[column].dropna().apply(len).mean())
            dataset[column] = dataset[column].apply(
                lambda x: adjust_length(x, avg_length) if isinstance(x, str) else x
            )
    return dataset
train_set_normalized = normalization_question_text(train_set_cleaned, ['context', 'text'])
test_set_normalized = normalization_question_text(test_set_cleaned, ['context', 'text'])
validation_set_normalized = normalization_question_text(validation_set_cleaned, ['context', 'text'])

Convert the dataset into Huggingface Dataset to facilitate subsequent model training

In [15]:
# Turn to Huggingface Dataset
train_dataset = Dataset.from_pandas(train_set_normalized)
validation_dataset = Dataset.from_pandas(test_set_normalized)
test_dataset = Dataset.from_pandas(validation_set_normalized)

{'text': '"so much of the post is ben," mrs. graham said in 1994, three years after bradlee retired as editor. "he created it as we know it today."— ed o\'keefe (', 'question': 'what did bradlee retire as?', 'answer': 'editor', '__index_level_0__': 0}
{'text': '5 years in 5 seconds. darren booth (@darbooth) january 25, 2013                                                                                          ', 'question': 'what site does the link take you to?', 'answer': 'vine', '__index_level_0__': 0}
{'text': '"@reid2962: @realdonaldtrump@foxnews i expected better from @megynkelly, wondering what is her hidden agenda.— donald j. trump (@realdonaldtrump) augus', 'question': 'who do you expect better from?', 'answer': '@megynkelly', '__index_level_0__': 0}


Process the data into the data format used by the T5 model

In [17]:
def preprocess_function(examples):
    inputs = [f"question: {question} context: {text}" for question, text in zip(examples['question'], examples['text'])]
    targets = examples['answer']
    return {'input_text': inputs, 'target_text': targets}

train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/9449 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

Map:   0%|          | 0/1085 [00:00<?, ? examples/s]

This project uses the T5-small model for training. Due to the poor performance of the developer's computer, it is not possible to use the more professional base or large model for development.

In [18]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    model_inputs = tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/9449 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

Map:   0%|          | 0/1085 [00:00<?, ? examples/s]

The parameters of the model are set here, with the batch size set to 16 and the learning rate set to 2e-4, which are the highest values that the developer's host can run after adjustment.

In [19]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
#     fp16=True  
)



In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,         
    eval_dataset=validation_dataset,    
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2644,0.05114
2,0.0547,0.049074
3,0.0494,0.048641
4,0.046,0.048855
5,0.0427,0.048979


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2955, training_loss=0.08377779978382607, metrics={'train_runtime': 31599.3707, 'train_samples_per_second': 1.495, 'train_steps_per_second': 0.094, 'total_flos': 6394223410544640.0, 'train_loss': 0.08377779978382607, 'epoch': 5.0})

Test the model first in the test set to avoid excessive deviation in the data

In [21]:
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

{'eval_loss': 0.04575951769948006, 'eval_runtime': 180.3769, 'eval_samples_per_second': 6.015, 'eval_steps_per_second': 0.377, 'epoch': 5.0}


Set the generation function to a fixed length of 128 and write the transcoded text to a file to facilitate subsequent developers to check the deviation of the generated content

In [22]:
def generate_answer(batch):
    inputs = tokenizer(batch['input_text'], return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.generate(inputs.input_ids, max_length=128)
    generated_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    outputs = model.generate(
        input_ids=inputs.input_ids,
        max_length=50,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    batch['generated_answer'] = generated_answers
    return batch
test_results = test_dataset.map(generate_answer, batched=True, batch_size=8)
test_results.to_csv("test_results_with_generated_answers.csv", index=False)

Map:   0%|          | 0/1085 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

5718969

Use bleu and rouge scores to evaluate the entire model and generation

In [24]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

references = [[item['answer']] for item in test_results] 
predictions = [item['generated_answer'] for item in test_results] 

bleu_score = bleu_metric.compute(
    predictions=predictions, 
    references=references     
)
print(f"BLEU Score: {bleu_score['bleu']:.2f}")


rouge_score = rouge_metric.compute(
    predictions=predictions, 
    references=[r[0] for r in references]
)
print(f"ROUGE-1: {rouge_score['rouge1']:.2f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.2f}")

BLEU Score: 0.31
ROUGE-1: 0.61
ROUGE-L: 0.61
