In [1]:
import re
import json
import yaml
from IPython.display import display, Markdown, HTML

In [2]:
def load_yaml_file(filename):
    with open(filename, 'r') as f:
        return yaml.safe_load(f)

def clean_text_single_line(text):
    return re.sub(r'\s+', ' ', text).strip()

def clean_text_multiple_line(text):
    new_text = re.sub(r'\n+', '\n', text)
    # Clean traiiling white space at the end
    new_text = "\n".join([line.strip() for line in new_text.split('\n')])
    return new_text

def clean_text_paragraph(text):
    return re.sub(r'\n+', '\n', text)

def list_AZ(start_char,end_char):
    # List the alphabets from start_char to end_char. 
    # Ex: list_AZ('A','C') -> ['A', 'B', 'C']
    start = ord(start_char)
    end = ord(end_char)
    return [chr(i) for i in range(start, end+1)]
    
def list_numbers(start_num, end_num):
    # List the numbers from start_num to end_num
    # Ex: list_numbers(1,3) -> [1, 2, 3]
    return list(range(start_num, end_num+1))

def parse_task_question_number(text):
    range_pattern = re.compile(r'Questions (\d+)-(\d+)')    # Case 1: "Questions 1-5" (Range)
    multiple_pattern = re.compile(r'Questions (\d+) and (\d+)')    # Case 2: "Questions 1 and 2" (Multiple)
    single_pattern = re.compile(r'Question (\d+)')    # Case 3: "Question 1" (Single)
    # Return whichever case matches
    if range_pattern.match(text):
        start_num, end_num = range_pattern.match(text).groups()
        return list_numbers(int(start_num), int(end_num))
    elif multiple_pattern.match(text):
        num1, num2 = multiple_pattern.match(text).groups()
        return [int(num1), int(num2)]
    elif single_pattern.match(text):
        return [int(single_pattern.match(text).group(1))]
    else:
        raise Exception("Invalid question number format")
    
print(parse_task_question_number("Questions 1-5"))
print(parse_task_question_number("Questions 1 and 2"))
print(parse_task_question_number("Question 1"))

[1, 2, 3, 4, 5]
[1, 2]
[1]


In [3]:
#  Completion questions
def parse_diagram-completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    question_img_path = questionTask['question_img_path'].strip()  # Most important
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionImgPath": question_img_path, # Most important
        "questionItems": question_items,
    }

def parse_flow-chart-completion(questionTask):
    return parse_diagram-completion(questionTask)

def parse_sentence_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()
    
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_summary_completion(questionTask):
    return parse_sentence_completion(questionTask)

def parse_note_completion(questionTask):
    return parse_sentence_completion(questionTask)

def parse_summary_completion_word_list(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    
    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_table_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    table_data_rows = [tuple(r.strip() for r in re.split(r'\|',row)) for row in question_main_text_lines]


    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": table_data_rows,
        "questionItems": question_items,
    }

# Matching questions
def parse_matching_features(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch


    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, # option is a tuple but correct answer might be a string
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_matching_headings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([ixv]+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch

    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        # "questionMainText": question_main_text, # Not needed
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_matching-sentence-endings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }


def parse_matching_paragraphs(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    start_char, end_char = re.split('-', question_list_of_options)
    question_option_items = list_AZ(start_char, end_char)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_true_false_not_given(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["TRUE", "FALSE", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_yes_no_not_given(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["YES", "NO", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }



# Choice questions
def parse_multiple_choice_select_one(questionTask):
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()

    # Not applicable but included for consistency
    question_list_of_options = questionTask['question_list_of_options'].strip()
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    mcq_question_content_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    mcq_question_option_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers

    # Split items
    multiple_choice_question_item_lines = re.split(r'\n(?=\d+\s)', question_main_text) # ['27    In the secon...\nA   the subject...\nB   the subject...', '28    The author...\nA   the subject...\nB   the subject...']
    correct_answer_lines = re.split(r'\n+', correct_answer) # ['27. A', '28. B']

    question_items = []
    for mcq_question_item_line, answer_line in zip(multiple_choice_question_item_lines, correct_answer_lines):
        mcq_question_item_line, answer_line = mcq_question_item_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = mcq_question_content_pattern.match(mcq_question_item_line)
        question_option_items = mcq_question_option_pattern.findall(mcq_question_item_line)
        answer_item_match = correct_answer_pattern.match(answer_line)

        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionItems": question_items,
    }

def parse_multiple_choice_select_many(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()


    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    task_question_number_list = parse_task_question_number(task_question_number)

    # matching patterns
    correct_answer_pattern = re.compile(r'\d+[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    question_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_lines = re.split(r'\n+', correct_answer)
    question_list_of_options_lines = re.split(r'\n+', question_list_of_options)


    question_items =  question_item_pattern.findall(question_list_of_options)
    correct_answer = correct_answer_pattern.findall(correct_answer)

    return {
    "taskType": task_type,
    "taskQuestionNumberList": task_question_number_list,
    "taskQuestionNumberText": task_question_number,
    "taskDescription": task_description,
    "questionMainTitle": question_main_title,
    "questionMainText": question_main_text,
    "questionItems": question_items,
    "correctAnswer": correct_answer
    }




### Parse All

In [4]:
def parse_reading_from_yaml(filename):
    with open(filename, 'r') as f:
        raw_data = yaml.safe_load(f)
    
    reading_info = parse_reading_info(raw_data["reading_info"])
    passage_content = parse_passage_content(raw_data["passage_content"])
    question_content = parse_question_content(raw_data["question_content"])
    return {
        "readingInfo": reading_info,
        "passageContent": passage_content,
        "questionContent": question_content
    }

def parse_reading_info(reading_info_data):
    return {
        "raedingTitle": clean_text_single_line(reading_info_data["reading_title"]).strip(),
        "readingSubtitle": clean_text_single_line(reading_info_data["reading_subtitle"]).strip(),
    }

def parse_passage_content(passage_data):
    paragraph_markers = passage_data['paragraph_markers']
    passage_context = clean_text_single_line(passage_data["passage_context"])
    passage_title = clean_text_single_line(passage_data["passage_title"])
    passage_subtitle = clean_text_single_line(passage_data["passage_subtitle"])
    passage_main_text = clean_text_paragraph(passage_data["passage_main_text"])

    if paragraph_markers:
        split_pattern = re.compile(r'\n+(?=[A-Z]\n+[A-Z])')
        match_pattern = re.compile(r'([A-Z])\n+([A-Z].*)')
        passsage_paragraphs = re.split(split_pattern, passage_main_text) # ['A\nEaster Island, or ', 'B\nWhen the Europeans', 'C\nThe moai, he think',...]
        passsage_paragraphs = [match_pattern.match(a).groups() for a in passsage_paragraphs]     # For each paragraph, match into header and content groups
    else:
        passsage_paragraphs = re.split(r'\n+(?=[A-Z])', passage_main_text)

    return {
        "hasParagraphMarkers": paragraph_markers,
        "passageContext": passage_context,
        "passageTitle": passage_title,
        "passageSubtitle": passage_subtitle,
        "passageMainText": passage_main_text,
        "passageParagraphs": passsage_paragraphs
    }

def parse_question_content(question_data):
    questionTasks = []
    for question in question_data:
        questionTasks.append(parse_question_task(question))
    return questionTasks

def parse_question_task(questionTask):
    task_type = questionTask["task_type"].strip()
    parser_functions = {
        "multiple-choice-select-one": parse_multiple_choice_select_one,
        "multiple-choice-select-many": parse_multiple_choice_select_many,
        "diagram-completion": parse_diagram-completion,
        "flow-chart-completion": parse_flow-chart-completion,
        "summary-completion": parse_summary_completion,
        "summary-completion-word-list": parse_summary_completion_word_list,
        "table-completion": parse_table_completion,
        "note-completion": parse_note_completion,
        "matching-features": parse_matching_features,
        "matching-headings": parse_matching_headings,
        "matching-sentence-endings": parse_matching-sentence-endings,
        "matching-paragraphs": parse_matching_paragraphs,
        "true_false_not_given": parse_true_false_not_given,
        "yes_no_not_given": parse_yes_no_not_given
    }
    if task_type not in parser_functions.keys():
        raise Exception(f"{task_type} is an invalid question type")
    return parser_functions[task_type](questionTask)



In [5]:
cam11_test1_1 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-1.yaml"
cam11_test1_2 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-2.yaml"
cam11_test1_3 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-3.yaml"
cam11_test2_1 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-1.yaml"
cam11_test2_2 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-2.yaml"
cam11_test2_3 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-3.yaml"
cam13_test2_1 = "../components/assets/yaml/cam-13-test-2/cam-13-test-2-1.yaml"
cam13_test2_2 = "../components/assets/yaml/cam-13-test-2/cam-13-test-2-2.yaml"
cam13_test2_3 = "../components/assets/yaml/cam-13-test-2/cam-13-test-2-3.yaml"

raw_data = load_yaml_file(cam11_test1_2)
raw_data["question_content"]

[{'task_type': 'true-false-notgiven\n',
  'task_question_number': 'Questions 14-19\n',
  'task_description': 'Do the following statements agree with the information given in Reading Passage 2?\n\nIn boxes 14-19 on your answer sheet, write\n\nTRUE                if the statement agrees with the information\n\nFALSE               if the statement contradicts the information\n\nNOT GIVEN     if there is no information on this\n',
  'question_main_title': '',
  'question_main_text': '14   The Falkirk Wheel has linked the Forth & Clyde Canal with the Union Canal for the first time in their history.\n\n15   There was some opposition to the design of the Falkirk Wheel at first.\n\n16   The Falkirk Wheel was initially put together at the location where its components were manufactured.\n\n17   The Falkirk Wheel is the only boat lift in the world which has steel sections bolted together by hand.\n\n18   The weight of the gondolas varies according to the size of boat being carried.\n\n19   The c

## Parse Passage & Reading Info

In [6]:
raw_data = load_yaml_file(cam11_test2_2)
reading_data = raw_data["reading_info"]

display(reading_data)
parse_reading_info(reading_data)

{'reading_title': 'READING PASSAGE 2\n',
 'reading_subtitle': 'You should spend about 20 minutes on Questions 14-26 which are based on Reading Passage 2 below. \n'}

{'raedingTitle': 'READING PASSAGE 2',
 'readingSubtitle': 'You should spend about 20 minutes on Questions 14-26 which are based on Reading Passage 2 below.'}

In [7]:
raw_data = load_yaml_file(cam11_test2_2)
# raw_data = load_yaml_file(cam11_test1_1)
passage_data = raw_data["passage_content"]
display(passage_data)
# parse_passage_content(passage_data)

def parse_passage_content(passage_data):
    paragraph_markers = passage_data['paragraph_markers']
    passage_context = clean_text_single_line(passage_data["passage_context"])
    passage_title = clean_text_single_line(passage_data["passage_title"])
    passage_subtitle = clean_text_single_line(passage_data["passage_subtitle"])
    passage_main_text = clean_text_paragraph(passage_data["passage_main_text"])

    if paragraph_markers:
        split_pattern = re.compile(r'\n+(?=[A-Z]\n+[A-Z])')
        match_pattern = re.compile(r'([A-Z])\n+([A-Z].*)')
        passsage_paragraphs = re.split(split_pattern, passage_main_text) # ['A\nEaster Island, or ', 'B\nWhen the Europeans', 'C\nThe moai, he think',...]
        passsage_paragraphs = [match_pattern.match(a).groups() for a in passsage_paragraphs]     # For each paragraph, match into header and content groups
    else:
        passsage_paragraphs = re.split(r'\n+(?=[A-Z])', passage_main_text)

    return {
        "hasParagraphMarkers": paragraph_markers,
        "passageContext": passage_context,
        "passageTitle": passage_title,
        "passageSubtitle": passage_subtitle,
        "passageMainText": passage_main_text,
        "passageParagraphs": passsage_paragraphs
    }

parse_passage_content(passage_data)

{'paragraph_markers': True,
 'passage_context': '',
 'passage_title': 'What destroyed the civilisation of Easter Island?\n',
 'passage_subtitle': 'How a sixteenth-century warship was recovered from the seabed\n',
 'passage_main_text': 'A\n\nEaster Island, or Rapu Nui as it is known locally, is home to several hundred ancient human statues – the moai. After this remote Pacific island was settled by the Polynesians, it remained isolated for centuries. All the energy and resources that went into the moai – some of which are ten metres tall and weigh over 7,000 kilos – came from the island itself. Yet when Dutch explorers landed in 1722, they met a Stone Age culture. The moai were carved with stone tools, then transported for many kilometres, without the use of animals or wheels, to massive stone platforms. The identity of the moai builders was in doubt until well into the twentieth century. Thor Heyerdahl, the Norwegian ethnographer and adventurer, thought the statues had been created by 

{'hasParagraphMarkers': True,
 'passageContext': '',
 'passageTitle': 'What destroyed the civilisation of Easter Island?',
 'passageSubtitle': 'How a sixteenth-century warship was recovered from the seabed',
 'passageMainText': 'A\nEaster Island, or Rapu Nui as it is known locally, is home to several hundred ancient human statues – the moai. After this remote Pacific island was settled by the Polynesians, it remained isolated for centuries. All the energy and resources that went into the moai – some of which are ten metres tall and weigh over 7,000 kilos – came from the island itself. Yet when Dutch explorers landed in 1722, they met a Stone Age culture. The moai were carved with stone tools, then transported for many kilometres, without the use of animals or wheels, to massive stone platforms. The identity of the moai builders was in doubt until well into the twentieth century. Thor Heyerdahl, the Norwegian ethnographer and adventurer, thought the statues had been created by pre-Inca 

In [8]:
display(load_yaml_file(cam11_test2_2))
parse_reading_from_yaml(cam11_test2_2)

{'reading_info': {'reading_title': 'READING PASSAGE 2\n',
  'reading_subtitle': 'You should spend about 20 minutes on Questions 14-26 which are based on Reading Passage 2 below. \n'},
 'passage_content': {'paragraph_markers': True,
  'passage_context': '',
  'passage_title': 'What destroyed the civilisation of Easter Island?\n',
  'passage_subtitle': 'How a sixteenth-century warship was recovered from the seabed\n',
  'passage_main_text': 'A\n\nEaster Island, or Rapu Nui as it is known locally, is home to several hundred ancient human statues – the moai. After this remote Pacific island was settled by the Polynesians, it remained isolated for centuries. All the energy and resources that went into the moai – some of which are ten metres tall and weigh over 7,000 kilos – came from the island itself. Yet when Dutch explorers landed in 1722, they met a Stone Age culture. The moai were carved with stone tools, then transported for many kilometres, without the use of animals or wheels, to ma

{'readingInfo': {'raedingTitle': 'READING PASSAGE 2',
  'readingSubtitle': 'You should spend about 20 minutes on Questions 14-26 which are based on Reading Passage 2 below.'},
 'passageContent': {'hasParagraphMarkers': True,
  'passageContext': '',
  'passageTitle': 'What destroyed the civilisation of Easter Island?',
  'passageSubtitle': 'How a sixteenth-century warship was recovered from the seabed',
  'passageMainText': 'A\nEaster Island, or Rapu Nui as it is known locally, is home to several hundred ancient human statues – the moai. After this remote Pacific island was settled by the Polynesians, it remained isolated for centuries. All the energy and resources that went into the moai – some of which are ten metres tall and weigh over 7,000 kilos – came from the island itself. Yet when Dutch explorers landed in 1722, they met a Stone Age culture. The moai were carved with stone tools, then transported for many kilometres, without the use of animals or wheels, to massive stone platfo

## Parse Questions

### Diagram & Flow Chart Completion

In [9]:
#  Completion questions
def parse_diagram-completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_question_number = questionTask['task_question_number'].strip()
    task_description = questionTask['task_description'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    question_img_path = questionTask['question_img_path'].strip()  # Most important
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionImgPath": question_img_path, # Most important
        "questionItems": question_items,
    }

def parse_flow-chart-completion(questionTask):
    return parse_diagram-completion(questionTask)

questionTask = load_yaml_file(cam11_test1_2)["question_content"][1]
# questionTask = load_yaml_file(cam11_test2_1)["question_content"][2]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_diagram-completion(questionTask)

Raw version


{'task_type': 'diagram-completion\n',
 'task_question_number': 'Questions 20-26\n',
 'task_description': 'Label the diagram below.\n\nChoose ONE WORD from the passage for each answer.\n\nWrite your answers in boxes 20-26 on your answer sheet.\n',
 'question_main_title': 'How a boat is lifted on the Falkirk Wheel\n',
 'question_main_text': '',
 'question_img_path': 'https://ieltstrainingonline.com/wp-content/uploads/2020/07/11-1-2-IELTS-Reading-q14-26-980x663.jpg\n',
 'question_list_title': '',
 'question_list_of_options': '',
 'example_answer': '',
 'correct_answer': '20. gates\n\n21. clamp\n\n22. axle\n\n23. cogs\n\n24. aqueduct\n\n25. wall\n\n26. locks'}

Parsed version


{'taskType': 'diagram-completion',
 'taskQuestionNumberList': [20, 21, 22, 23, 24, 25, 26],
 'taskQuestionNumberText': 'Questions 20-26',
 'taskDescription': 'Label the diagram below.\nChoose ONE WORD from the passage for each answer.\nWrite your answers in boxes 20-26 on your answer sheet.',
 'questionMainTitle': 'How a boat is lifted on the Falkirk Wheel',
 'questionMainText': '',
 'questionImgPath': 'https://ieltstrainingonline.com/wp-content/uploads/2020/07/11-1-2-IELTS-Reading-q14-26-980x663.jpg',
 'questionItems': [{'questionNumber': 20, 'correctAnswer': 'gates'},
  {'questionNumber': 21, 'correctAnswer': 'clamp'},
  {'questionNumber': 22, 'correctAnswer': 'axle'},
  {'questionNumber': 23, 'correctAnswer': 'cogs'},
  {'questionNumber': 24, 'correctAnswer': 'aqueduct'},
  {'questionNumber': 25, 'correctAnswer': 'wall'},
  {'questionNumber': 26, 'correctAnswer': 'locks'}]}

### Sentence, Summary and Note Completion

In [10]:
def parse_sentence_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()
    
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_summary_completion(questionTask):
    return parse_sentence_completion(questionTask)

def parse_note_completion(questionTask):
    return parse_sentence_completion(questionTask)
    
questionTask = load_yaml_file(cam13_test2_1)["question_content"][0] # Note completion
# questionTask = load_yaml_file(cam11_test2_2)["question_content"][1] # Summary completion
# questionTask = load_yaml_file(cam11_test1_1)["question_content"][0] # Sentence completion
print('Raw version')
display(questionTask)
print('Parsed version')
parse_sentence_completion(questionTask)



Raw version


{'task_type': 'note-completion\n',
 'task_question_number': 'Questions 1-9\n',
 'task_description': 'Complete the notes below.\nChoose ONE WORD ONLY from the passage for each answer.\n\nWrite your answers in boxes 1-9 on your answer sheet.\n',
 'question_main_title': 'The Early History of Cinnamon\n',
 'question_main_text': 'Biblical times:\n\nadded to 1………………………..\n\nused to show 2…………………………. Between people\n\nAncient Rome:\n\nused for its sweet smell at 3………………………..\n\nMiddle Ages:\n\nadded to food, especially meat\n\nwas an indication of a person’s 4………………………..\n\nknown as a treatment for 5……………………….. and other health problems\n\ngrown in 6……………………….\n\nmerchants used 7……………………… to bring it to the Mediterranean\n\narrived in the Mediterranean at 8……………………………\n\ntraders took it to 9……………………………. and sold it to destinations around Europe.\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': '',
 'example_answer': '',
 'correct_answer': '1. oils\n\n2. f

Parsed version


{'taskType': 'note-completion',
 'taskQuestionNumberList': [1, 2, 3, 4, 5, 6, 7, 8, 9],
 'taskQuestionNumberText': 'Questions 1-9',
 'taskDescription': 'Complete the notes below.\nChoose ONE WORD ONLY from the passage for each answer.\nWrite your answers in boxes 1-9 on your answer sheet.',
 'questionMainTitle': 'The Early History of Cinnamon',
 'questionMainText': 'Biblical times:\nadded to 1………………………..\nused to show 2…………………………. Between people\nAncient Rome:\nused for its sweet smell at 3………………………..\nMiddle Ages:\nadded to food, especially meat\nwas an indication of a person’s 4………………………..\nknown as a treatment for 5……………………….. and other health problems\ngrown in 6……………………….\nmerchants used 7……………………… to bring it to the Mediterranean\narrived in the Mediterranean at 8……………………………\ntraders took it to 9……………………………. and sold it to destinations around Europe.',
 'questionItems': [{'questionNumber': 1, 'correctAnswer': 'oils'},
  {'questionNumber': 2, 'correctAnswer': 'friendship'},
  {'qu

### Table Completion

In [11]:
def parse_table_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    table_data_rows = [tuple(r.strip() for r in re.split(r'\|',row)) for row in question_main_text_lines]


    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": table_data_rows,
        "questionItems": question_items,
    }

questionTask = load_yaml_file(cam11_test1_3)["question_content"][1]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_table_completion(questionTask)


Raw version


{'task_type': 'table-completion\n',
 'task_question_number': 'Questions 30-36\n',
 'task_description': 'Complete the table below.\n\nChoose ONE WORD from the passage for each answer.\n\nWrite your answers in boxes 30-36 on your answer sheet.\n',
 'question_main_title': 'GEO-ENGINEERING PROJECTS\n',
 'question_main_text': 'Procedure | Aim\n\nput a large number of tiny spacecraft into orbit far above Earth | to create a 30………….. that would reduce the amount of light reaching Earth\n\nplace 31…………… in the sea | to encourage 32…………… to form\n\nrelease aerosol sprays into the stratosphere | to create 33……………. that would reduce the amount of light reaching Earth\n\nfix strong 34…………… to Greenland ice sheets | to prevent icebergs moving into the sea\n\nplant trees in Russian Arctic that would lose their leaves in winter | to allow the 35…………… to reflect radiation\n\nchange the direction of 36…………… | to bring more cold water into ice-forming areas\n',
 'question_img_path': '',
 'question_list_

Parsed version


{'taskType': 'table-completion',
 'taskQuestionNumberList': [30, 31, 32, 33, 34, 35, 36],
 'taskQuestionNumberText': 'Questions 30-36',
 'taskDescription': 'Complete the table below.\nChoose ONE WORD from the passage for each answer.\nWrite your answers in boxes 30-36 on your answer sheet.',
 'questionMainTitle': 'GEO-ENGINEERING PROJECTS',
 'questionMainText': [('Procedure', 'Aim'),
  ('put a large number of tiny spacecraft into orbit far above Earth',
   'to create a 30………….. that would reduce the amount of light reaching Earth'),
  ('place 31…………… in the sea', 'to encourage 32…………… to form'),
  ('release aerosol sprays into the stratosphere',
   'to create 33……………. that would reduce the amount of light reaching Earth'),
  ('fix strong 34…………… to Greenland ice sheets',
   'to prevent icebergs moving into the sea'),
  ('plant trees in Russian Arctic that would lose their leaves in winter',
   'to allow the 35…………… to reflect radiation'),
  ('change the direction of 36……………',
   'to br

### Matching Features

In [12]:
# Matching questions
def parse_matching_features(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch


    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, # option is a tuple but correct answer might be a string
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

questionTask = load_yaml_file(cam11_test1_3)["question_content"][2]
# questionTask = load_yaml_file(cam11_test2_1)["question_content"][1]

display(questionTask)
parse_matching_features(questionTask)


{'task_type': 'matching-features\n',
 'task_question_number': 'Questions 37-40\n',
 'task_description': 'Look at the following statements (Questions 37-40) and the list of scientists below.\n\nMatch each statement with the correct scientist, A-D.\n\nWrite the correct letter, A-D, in boxes 37-40 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '37   The effects of geo-engineering may not be long-lasting.\n\n38   Geo-engineering is a topic worth exploring.\n\n39   It may be necessary to limit the effectiveness of geo-engineering projects.\n\n40   Research into non-fossil-based fuels cannot be replaced by geo-engineering.\n',
 'question_img_path': '',
 'question_list_title': 'List of Scientists\n',
 'question_list_of_options': 'A    Roger Angel\n\nB    Phil Rasch\n\nC    Dan Lunt\n\nD    Martin Sommerkorn\n',
 'example_answer': '',
 'correct_answer': '37. B\n\n38. D\n\n39. C\n\n40. A'}

{'taskType': 'matching-features',
 'taskQuestionNumberList': [37, 38, 39, 40],
 'taskQuestionNumberText': 'Questions 37-40',
 'taskDescription': 'Look at the following statements (Questions 37-40) and the list of scientists below.\nMatch each statement with the correct scientist, A-D.\nWrite the correct letter, A-D, in boxes 37-40 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': '37   The effects of geo-engineering may not be long-lasting.\n38   Geo-engineering is a topic worth exploring.\n39   It may be necessary to limit the effectiveness of geo-engineering projects.\n40   Research into non-fossil-based fuels cannot be replaced by geo-engineering.',
 'questionListTitle': 'List of Scientists',
 'questionListOptions': [('A', 'Roger Angel'),
  ('B', 'Phil Rasch'),
  ('C', 'Dan Lunt'),
  ('D', 'Martin Sommerkorn')],
 'questionItems': [{'questionNumber': 37,
   'questionText': 'The effects of geo-engineering may not be long-lasting.',
   'questionOptions': [('A', 'Ro

### Matching Headings

In [13]:
questionTask = load_yaml_file(cam11_test2_2)["question_content"][0]
display(questionTask)

def parse_matching_headings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([ixv]+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch

    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        # "questionMainText": question_main_text, # Not needed
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }


parse_matching_headings(questionTask)


{'task_type': 'matching-headings\n',
 'task_question_number': 'Questions 14-20\n',
 'task_description': 'Reading Passage 2 has seven paragraphs, A-G.\n\nChoose the correct heading for each paragraph from the list of headings below.\n\nWrite the correct number, i-ix, in boxes 14-20 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '14    Paragraph A\n\n15    Paragraph B\n\n16    Paragraph C\n\n17    Paragraph D\n\n18    Paragraph E\n\n19    Paragraph F\n\n20    Paragraph G\n',
 'question_img_path': '',
 'question_list_title': 'List of Headings\n',
 'question_list_of_options': 'i    Evidence of innovative environment management practices\n\nii    An undisputed answer to a question about the moai\n\niii    The future of the moai statues\n\niv    A theory which supports a local belief\n\nv    The future of Easter Island\n\nvi    Two opposing views about the Rapanui people\n\nvii    Destruction outside the inhabitants’ control\n\nviii    How the statues made a sit

{'taskType': 'matching-headings',
 'taskQuestionNumberList': [14, 15, 16, 17, 18, 19, 20],
 'taskQuestionNumberText': 'Questions 14-20',
 'taskDescription': 'Reading Passage 2 has seven paragraphs, A-G.\nChoose the correct heading for each paragraph from the list of headings below.\nWrite the correct number, i-ix, in boxes 14-20 on your answer sheet.',
 'questionMainTitle': '',
 'questionListTitle': 'List of Headings',
 'questionListOptions': [('i',
   'Evidence of innovative environment management practices'),
  ('ii', 'An undisputed answer to a question about the moai'),
  ('iii', 'The future of the moai statues'),
  ('iv', 'A theory which supports a local belief'),
  ('v', 'The future of Easter Island'),
  ('vi', 'Two opposing views about the Rapanui people'),
  ('vii', 'Destruction outside the inhabitants’ control'),
  ('viii', 'How the statues made a situation worse'),
  ('ix', 'Diminishing food resources')],
 'questionItems': [{'questionNumber': 14,
   'questionText': 'Paragraph 

### Matching Paragraphs

In [14]:
questionTask = load_yaml_file(cam13_test2_2)["question_content"][0]
display(questionTask)

def parse_matching_paragraphs(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    start_char, end_char = re.split('-', question_list_of_options)
    question_option_items = list_AZ(start_char, end_char)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

parse_matching_paragraphs(questionTask)

{'task_type': 'matching-paragraphs\n',
 'task_question_number': 'Questions 14-17\n',
 'task_description': 'Reading Passage 2 has six section, A-F.\n\nWhich paragraph contains the following information?\n\nWrite the correct letter, A-F, in boxes 14-17 on your answer sheet.\n\nNB  You may use any letter more than once.\n',
 'question_main_title': '',
 'question_main_text': '14   reference to research showing the beneficial effects of oxytocin on people\n\n15   reasons why the effects of oxytocin are complex\n\n16   mention of a period in which oxytocin attracted little scientific attention\n\n17   reference to people ignoring certain aspects of their research data\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A-F\n',
 'example_answer': '',
 'correct_answer': '14. B\n\n15. F\n\n16. B\n\n17. E\n'}

{'taskType': 'matching-paragraphs',
 'taskQuestionNumberList': [14, 15, 16, 17],
 'taskQuestionNumberText': 'Questions 14-17',
 'taskDescription': 'Reading Passage 2 has six section, A-F.\nWhich paragraph contains the following information?\nWrite the correct letter, A-F, in boxes 14-17 on your answer sheet.\nNB  You may use any letter more than once.',
 'questionMainTitle': '',
 'questionMainText': '14   reference to research showing the beneficial effects of oxytocin on people\n15   reasons why the effects of oxytocin are complex\n16   mention of a period in which oxytocin attracted little scientific attention\n17   reference to people ignoring certain aspects of their research data',
 'questionListTitle': '',
 'questionListOptions': ['A', 'B', 'C', 'D', 'E', 'F'],
 'questionItems': [{'questionNumber': 14,
   'questionOptions': ['A', 'B', 'C', 'D', 'E', 'F'],
   'correctAnswer': 'B'},
  {'questionNumber': 15,
   'questionOptions': ['A', 'B', 'C', 'D', 'E', 'F'],
   'correctAnswer': '

### Matching Sentence Endings

In [15]:
questionTask = load_yaml_file(cam13_test2_3)["question_content"][2]
display(questionTask)

def parse_matching-sentence-endings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

parse_matching-sentence-endings(questionTask)

{'task_type': 'matching-sentence-endings\n',
 'task_question_number': 'Questions 38-40\n',
 'task_description': 'Complete each sentence with the correct ending, A, B, C or D below.\n\nWrite the correct letter, A, B, C or D, in boxes 38-40 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '38   If there are any trend-related changes impacting on your category, you should\n\n39   If a current trend highlights a negative aspect of your category, you should\n\n40   If the consumers’ new focus has an increasing lack of connection with your offering you should\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A   employ a combination of strategies to maintain your consumer base.\n\nB   identify the most appropriate innovation strategy to use.\n\nC   emphasise your brand’s traditional values with the counteract-and-affirm strategy.\n\nD   use the combine-and-transcend strategy to integrate the two worlds.\n',
 'example_answer': 

{'taskType': 'matching-sentence-endings',
 'taskQuestionNumberList': [38, 39, 40],
 'taskQuestionNumberText': 'Questions 38-40',
 'taskDescription': 'Complete each sentence with the correct ending, A, B, C or D below.\nWrite the correct letter, A, B, C or D, in boxes 38-40 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': '38   If there are any trend-related changes impacting on your category, you should\n39   If a current trend highlights a negative aspect of your category, you should\n40   If the consumers’ new focus has an increasing lack of connection with your offering you should',
 'questionListTitle': '',
 'questionListOptions': [('A',
   'employ a combination of strategies to maintain your consumer base.'),
  ('B', 'identify the most appropriate innovation strategy to use.'),
  ('C',
   'emphasise your brand’s traditional values with the counteract-and-affirm strategy.'),
  ('D',
   'use the combine-and-transcend strategy to integrate the two worlds.')],
 '

### Summary Completion Word List

In [16]:
questionTask = load_yaml_file(cam11_test2_3)["question_content"][1]
display(questionTask)

def parse_summary_completion_word_list(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    
    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

parse_summary_completion_word_list(questionTask)

{'task_type': 'summary-completion-word-list\n',
 'task_question_number': 'Questions 31-33\n',
 'task_description': 'Complete the summary using the list of words, A-H, below.\n\nWrite the correct letters, A-H, in boxes 31-33 on your answer sheet.\n',
 'question_main_title': 'Art and the Brain\n',
 'question_main_text': 'The discipline of neuroaesthetics aims to bring scientific objectivity to the study of art. Neurological studies of the brain, for example, demonstrate the impact which Impressionist paintings have on our 31……………. Alex Forsythe of the University of Liverpool believes many artists give their works the precise degree of 32……………… which most appeals to the viewer’s brain. She also observes that pleasing works of art often contain certain repeated 33……………… which occur frequently in the natural world.\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A     interpretation      \n\nB     complexity            \n\nC     emotions\n\nD     move

{'taskType': 'summary-completion-word-list',
 'taskQuestionNumberList': [31, 32, 33],
 'taskQuestionNumberText': 'Questions 31-33',
 'taskDescription': 'Complete the summary using the list of words, A-H, below.\nWrite the correct letters, A-H, in boxes 31-33 on your answer sheet.',
 'questionMainTitle': 'Art and the Brain',
 'questionMainText': 'The discipline of neuroaesthetics aims to bring scientific objectivity to the study of art. Neurological studies of the brain, for example, demonstrate the impact which Impressionist paintings have on our 31……………. Alex Forsythe of the University of Liverpool believes many artists give their works the precise degree of 32……………… which most appeals to the viewer’s brain. She also observes that pleasing works of art often contain certain repeated 33……………… which occur frequently in the natural world.',
 'questionListTitle': '',
 'questionListOptions': [('A', 'interpretation'),
  ('B', 'complexity'),
  ('C', 'emotions'),
  ('D', 'movements'),
  ('E',

### True False Not Given & Yes No Not Given

In [17]:
def parse_true_false_not_given(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["TRUE", "FALSE", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_yes_no_not_given(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["YES", "NO", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

questionTask = load_yaml_file(cam11_test1_1)["question_content"][1] # TFNG
# questionTask = load_yaml_file(cam11_test2_3)["question_content"][2] # Yes no not given
display(questionTask)
parse_true_false_not_given(questionTask)
# parse_yes_no_not_given(questionTask)
    

{'task_type': 'true-false-notgiven\n',
 'task_question_number': 'Questions 8-13\n',
 'task_description': 'Do the following statements agree with the information given in Reading Passage?\n\nIn boxes 8-13 on your answer sheet, write\n\nTRUE               if the statement agrees with the information\n\nFALSE              if the statement contradicts the information\n\nNOT GIVEN    if there is no information on this\n',
 'question_main_title': '',
 'question_main_text': '8   Methods for predicting the Earth’s population have recently changed.\n\n9   Human beings are responsible for some of the destruction to food-producing land.\n\n10   The crops produced in vertical farms will depend on the season.\n\n11   Some damage to food crops is caused by climate change.\n\n12   Fertilisers will be needed for certain crops in vertical farms.\n\n13   Vertical farming will make plants less likely to be affected by infectious diseases.\n',
 'question_img_path': '',
 'question_list_title': '',
 'questi

{'taskType': 'true-false-notgiven',
 'taskQuestionNumberList': [8, 9, 10, 11, 12, 13],
 'taskQuestionNumberText': 'Questions 8-13',
 'taskDescription': 'Do the following statements agree with the information given in Reading Passage?\nIn boxes 8-13 on your answer sheet, write\nTRUE               if the statement agrees with the information\nFALSE              if the statement contradicts the information\nNOT GIVEN    if there is no information on this',
 'questionMainTitle': '',
 'questionMainText': '8   Methods for predicting the Earth’s population have recently changed.\n9   Human beings are responsible for some of the destruction to food-producing land.\n10   The crops produced in vertical farms will depend on the season.\n11   Some damage to food crops is caused by climate change.\n12   Fertilisers will be needed for certain crops in vertical farms.\n13   Vertical farming will make plants less likely to be affected by infectious diseases.',
 'questionItems': [{'questionNumber': 8,


### Multiple Choice Select Many

In [18]:


def parse_multiple_choice_select_many(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()


    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    task_question_number_list = parse_task_question_number(task_question_number)

    # matching patterns
    correct_answer_pattern = re.compile(r'\d+[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    question_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_lines = re.split(r'\n+', correct_answer)
    question_list_of_options_lines = re.split(r'\n+', question_list_of_options)


    question_items =  question_item_pattern.findall(question_list_of_options)
    correct_answer = correct_answer_pattern.findall(correct_answer)

    return {
    "taskType": task_type,
    "taskQuestionNumberList": task_question_number_list,
    "taskQuestionNumberText": task_question_number,
    "taskDescription": task_description,
    "questionMainTitle": question_main_title,
    "questionMainText": question_main_text,
    "questionItems": question_items,
    "correctAnswer": correct_answer
    }

questionTask = load_yaml_file(cam11_test2_2)["question_content"][2]
display(questionTask)
parse_multiple_choice_select_many(questionTask)

{'task_type': 'multiple-choice-select-many\n',
 'task_question_number': 'Questions 25 and 26\n',
 'task_description': 'Choose TWO letters, A-E.\n\nWrite the correct letters in boxes 25 and 26 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': 'On what points do Hunt and Lipo disagree with Diamond?\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A   the period when the moai were created\n\nB   how the moai were transported\n\nC   the impact of the moai on Rapanui society\n\nD   how the moai were carved\n\nE   the origins of the people who made the moai\n',
 'example_answer': '',
 'correct_answer': '25. B\n\n26. C\n'}

{'taskType': 'multiple-choice-select-many',
 'taskQuestionNumberList': [25, 26],
 'taskQuestionNumberText': 'Questions 25 and 26',
 'taskDescription': 'Choose TWO letters, A-E.\nWrite the correct letters in boxes 25 and 26 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': 'On what points do Hunt and Lipo disagree with Diamond?',
 'questionItems': [('A', 'the period when the moai were created'),
  ('B', 'how the moai were transported'),
  ('C', 'the impact of the moai on Rapanui society'),
  ('D', 'how the moai were carved'),
  ('E', 'the origins of the people who made the moai')],
 'correctAnswer': ['B', 'C']}

### Multiple Choice Select One

In [19]:
questionTask = load_yaml_file(cam11_test2_3)["question_content"][0]
display(questionTask)

def parse_multiple_choice_select_one(questionTask):
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()

    # Not applicable but included for consistency
    question_list_of_options = questionTask['question_list_of_options'].strip()
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    mcq_question_content_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    mcq_question_option_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers

    # Split items
    multiple_choice_question_item_lines = re.split(r'\n(?=\d+\s)', question_main_text) # ['27    In the secon...\nA   the subject...\nB   the subject...', '28    The author...\nA   the subject...\nB   the subject...']
    correct_answer_lines = re.split(r'\n+', correct_answer) # ['27. A', '28. B']

    question_items = []
    for mcq_question_item_line, answer_line in zip(multiple_choice_question_item_lines, correct_answer_lines):
        mcq_question_item_line, answer_line = mcq_question_item_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = mcq_question_content_pattern.match(mcq_question_item_line)
        question_option_items = mcq_question_option_pattern.findall(mcq_question_item_line)
        answer_item_match = correct_answer_pattern.match(answer_line)

        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionItems": question_items,
    }

parse_multiple_choice_select_one(questionTask)

{'task_type': 'multiple-choice-select-one\n',
 'task_question_number': 'Questions 27-30\n',
 'task_description': 'Choose the correct letter, A, B, C or D.\n\nWrite the correct letter in boxes 27-30 on your answer sheet\n',
 'question_main_title': '',
 'question_main_text': '27    In the second paragraph, the writer refers to a shape-matching test in order to illustrate\n\nA   the subjective nature of art appreciation.\n\nB   the reliance of modern art on abstract forms.\n\nC   our tendency to be influenced by the opinions of others.\n\nD   a common problem encountered when processing visual data.\n\n\n\n28   Angelina Hawley-Dolan’s findings indicate that people\n\nA   mostly favour works of art which they know well.\n\nB   hold fixed ideas about what makes a good work of art.\n\nC   are often misled by their initial expectations of a work of art.\n\nD   have the ability to perceive the intention behind works of art.\n\n\n29    Results of studies involving Robert Pepperell’s pieces sugg

{'taskType': 'multiple-choice-select-one',
 'taskQuestionNumberList': [27, 28, 29, 30],
 'taskQuestionNumberText': 'Questions 27-30',
 'taskDescription': 'Choose the correct letter, A, B, C or D.\nWrite the correct letter in boxes 27-30 on your answer sheet',
 'questionItems': [{'questionNumber': 27,
   'questionText': 'In the second paragraph, the writer refers to a shape-matching test in order to illustrate',
   'questionOptions': [('A', 'the subjective nature of art appreciation.'),
    ('B', 'the reliance of modern art on abstract forms.'),
    ('C', 'our tendency to be influenced by the opinions of others.'),
    ('D', 'a common problem encountered when processing visual data.')],
   'correctAnswer': 'C'},
  {'questionNumber': 28,
   'questionText': 'Angelina Hawley-Dolan’s findings indicate that people',
   'questionOptions': [('A',
     'mostly favour works of art which they know well.'),
    ('B', 'hold fixed ideas about what makes a good work of art.'),
    ('C', 'are often mi