In [2]:
import errno
from pathlib import Path
import re
import json
import yaml
import argparse
import os
import unicodedata

from IPython.display import display, Markdown, HTML

In [3]:
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def get_files(directory, ending=""):
    files = []
    for root, dirs, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(ending):
                files.append(os.path.join(root, filename))
    return files

def load_yaml_file(filename):
    with open(filename, 'r') as f:
        return yaml.safe_load(f)


def save_json_file(string, filename):
    mkdir_p(os.path.dirname(filename))
    with open(filename, 'w') as f:
        f.write(string)

def clean_text_single_line(text):
    return re.sub(r'\s+', ' ', text).strip()

def clean_text_multiple_line(text):
    new_text = re.sub(r'\n+', '\n', text)
    # Clean traiiling white space at the end
    new_text = "\n".join([line.strip() for line in new_text.split('\n')])
    return new_text

def clean_floating_linebreaks(text):
    # Pattern to match any \n not preceded by a period OR not followed by an uppercase letter
    pattern = re.compile(r'(?<![A-Z\.])\n|(?<=\n)(?![A-Z])')
    return pattern.sub('', text)


def list_AZ(start_char,end_char):
    # List the alphabets from start_char to end_char. 
    # Ex: list_AZ('A','C') -> ['A', 'B', 'C']
    start = ord(start_char)
    end = ord(end_char)
    return [chr(i) for i in range(start, end+1)]
    
def list_numbers(start_num, end_num):
    # List the numbers from start_num to end_num
    # Ex: list_numbers(1,3) -> [1, 2, 3]
    return list(range(start_num, end_num+1))

def parse_task_question_number(text):
    # Take into account the unicode for '-'
    # Case 1: "Questions 1-5" or "Questions 1 - 5" (Range). Take into account multiple spaces
    range_pattern = re.compile(r'Questions\s*(\d+)[\-\–\s]+(\d+)')
        # range_pattern = re.compile(r'Questions (\d+)\s*-\s*(\d+)')
    multiple_pattern = re.compile(r'Questions\s*(\d+)\s*and\s*(\d+)')    # Case 2: "Questions 1 and 2" (Multiple)
    single_pattern = re.compile(r'Question\s*(\d+)')    # Case 3: "Question 1" (Single)
    # Return whichever case matches
    if range_pattern.match(text):
        start_num, end_num = range_pattern.match(text).groups()
        return list_numbers(int(start_num), int(end_num))
    elif multiple_pattern.match(text):
        num1, num2 = multiple_pattern.match(text).groups()
        return [int(num1), int(num2)]
    elif single_pattern.match(text):
        return [int(single_pattern.match(text).group(1))]
    else:
        raise Exception(f"Invalid question number format {text}")
    
print(parse_task_question_number("Questions 1 – 10"))
print(parse_task_question_number("Questions   1 and 2"))
print(parse_task_question_number("Question  1  "))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[1, 2]
[1]


In [4]:
#  Completion questions
def parse_diagram_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    question_img_path = questionTask['question_img_path'].strip()  # Most important
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionImgPath": question_img_path, # Most important
        "questionItems": question_items,
    }

def parse_flow_chart_completion(questionTask):
    return parse_diagram_completion(questionTask)

def parse_sentence_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()
    
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)
    question_main_text_lines = re.split(r'\n+', question_main_text)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text_lines,
        "questionItems": question_items,
    }

def parse_summary_completion(questionTask):
    return parse_sentence_completion(questionTask)

def parse_note_completion(questionTask):
    return parse_sentence_completion(questionTask)

def parse_summary_completion_word_list(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    
    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+([A-Z])') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_table_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    table_data_rows = [tuple(r.strip() for r in re.split(r'\|',row)) for row in question_main_text_lines]


    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": table_data_rows,
        "questionItems": question_items,
    }

# Matching questions
def parse_matching_features(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch


    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, # option is a tuple but correct answer might be a string
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_matching_headings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+([ixv]+)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([ixv]+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch

    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        # "questionMainText": question_main_text, # Not needed
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_matching_sentence_endings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_floating_linebreaks(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }


def parse_matching_paragraphs(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    # question_main_text = clean_text_multiple_line(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')
    start_char, end_char = re.split('-', question_list_of_options)
    question_option_items = list_AZ(start_char, end_char)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_item_match = question_item_pattern.match(question_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_true_false_notgiven(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["TRUE", "FALSE", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_yes_no_notgiven(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["YES", "NO", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

# Choice questions
def parse_multiple_choice_select_one(questionTask):
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()

    # Not applicable but included for consistency
    question_list_of_options = questionTask['question_list_of_options'].strip()
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    mcq_question_content_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    mcq_question_option_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers

    # Split items
    multiple_choice_question_item_lines = re.split(r'\n(?=\d+\s)', question_main_text) # ['27    In the secon...\nA   the subject...\nB   the subject...', '28    The author...\nA   the subject...\nB   the subject...']
    correct_answer_lines = re.split(r'\n+', correct_answer) # ['27. A', '28. B']

    question_items = []
    for mcq_question_item_line, answer_line in zip(multiple_choice_question_item_lines, correct_answer_lines):
        mcq_question_item_line, answer_line = mcq_question_item_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = mcq_question_content_pattern.match(mcq_question_item_line)
        question_option_items = mcq_question_option_pattern.findall(mcq_question_item_line)
        answer_item_match = correct_answer_pattern.match(answer_line)

        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionItems": question_items,
    }

def parse_multiple_choice_select_many(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()


    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_floating_linebreaks(question_main_text)
    task_question_number_list = parse_task_question_number(task_question_number)

    # matching patterns
    correct_answer_pattern_1 = re.compile(r'\d+[^a-zA-Z\d\(\)\-\+:]+([A-Z])') # 25. B\n26. D\n27. E'
    correct_answer_pattern_2 = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+.*') # 'B ■ They can predict...\nG ■ They are more...'
    question_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_lines = re.split(r'\n+', correct_answer)
    question_list_of_options_lines = re.split(r'\n+', question_list_of_options)


    question_items =  question_item_pattern.findall(question_list_of_options)
    if correct_answer_pattern_1.match(correct_answer):
        correct_answer_items = correct_answer_pattern_1.findall(correct_answer)
    elif correct_answer_pattern_2.match(correct_answer):
        correct_answer_items = correct_answer_pattern_2.findall(correct_answer)
    # correct_answer = correct_answer_pattern.findall(correct_answer)

    return {
    "taskType": task_type,
    "taskQuestionNumberList": task_question_number_list,
    "taskQuestionNumberText": task_question_number,
    "taskDescription": task_description,
    "questionMainTitle": question_main_title,
    "questionMainText": question_main_text,
    "questionItems": question_items,
    "correctAnswer": correct_answer_items
    }

### Parse All

In [35]:
def populate_metadata(raw_data):
    if raw_data["metadata"]:
        metadata = raw_data["metadata"]
        raw_data["reading_info"]["metadata"] = metadata
        raw_data["passage_content"]["metadata"] = metadata
        for question in raw_data["question_content"]:
            question["metadata"] = metadata

def parse_reading_from_yaml(filename):
    raw_data = load_yaml_file(filename)
    populate_metadata(raw_data)
    reading_info = parse_reading_info(raw_data["reading_info"])
    passage_content = parse_passage_content(raw_data["passage_content"])
    question_content = parse_question_content(raw_data["question_content"])
    return {
        "readingInfo": reading_info,
        "passageContent": passage_content,
        "questionContent": question_content
    }

def parse_reading_info(reading_info_data):
    return {
        "readingTitle": clean_text_single_line(reading_info_data["reading_title"]).strip(),
        "readingSubtitle": clean_text_single_line(reading_info_data["reading_subtitle"]).strip(),
    }

def parse_passage_content(passage_data):
    metadata = passage_data['metadata']
    paragraph_markers = passage_data['paragraph_markers']
    passage_context = clean_text_single_line(passage_data["passage_context"])
    passage_title = clean_text_single_line(passage_data["passage_title"])
    passage_subtitle = clean_text_single_line(passage_data["passage_subtitle"])
    if metadata['contains_floating_breaks']:
        passage_main_text = clean_floating_linebreaks(passage_data["passage_main_text"])
    else:
        passage_main_text = clean_text_multiple_line(passage_data["passage_main_text"])

    if paragraph_markers:
        split_pattern = re.compile(r'\n+(?=[A-Z]\n+[^A-Za-z]?[A-Z])')
        match_pattern = re.compile(r'([A-Z])\n+([^A-Za-z]?[A-Z].*)')
        passsage_paragraphs = re.split(split_pattern, passage_main_text) # ['A\nEaster Island, or ', 'B\nWhen the Europeans', 'C\nThe moai, he think',...]
        passsage_paragraphs = [match_pattern.match(a).groups() for a in passsage_paragraphs]     # For each paragraph, match into header and content groups
    else:
        passsage_paragraphs = re.split(r'\n+(?=[A-Z])', passage_main_text)

    return {
        "hasParagraphMarkers": paragraph_markers,
        "passageContext": passage_context,
        "passageTitle": passage_title,
        "passageSubtitle": passage_subtitle,
        "passageMainText": passage_main_text,
        "passageParagraphs": passsage_paragraphs
    }

def parse_question_content(question_data):
    questionTasks = []
    for question in question_data:
        questionTasks.append(parse_question_task(question))
    return questionTasks

def parse_question_task(questionTask):
    task_type = questionTask["task_type"].strip()
    parser_functions = {
        "multiple-choice-select-one": parse_multiple_choice_select_one,
        "multiple-choice-select-many": parse_multiple_choice_select_many,
        "diagram-completion": parse_diagram_completion,
        "flow-chart-completion": parse_flow_chart_completion,
        "summary-completion": parse_summary_completion,
        "summary-completion-word-list": parse_summary_completion_word_list,
        "sentence-completion": parse_sentence_completion,
        "table-completion": parse_table_completion,
        "note-completion": parse_note_completion,
        "matching-features": parse_matching_features,
        "matching-headings": parse_matching_headings,
        "matching-sentence-endings": parse_matching_sentence_endings,
        "matching-paragraphs": parse_matching_paragraphs,
        "true-false-notgiven": parse_true_false_notgiven,
        "yes-no-notgiven": parse_yes_no_notgiven
    }
    if task_type not in parser_functions.keys():
        raise Exception(f"{task_type} is an invalid question type")
    return parser_functions[task_type](questionTask)


### Declare Paths

In [6]:
cam11_test1_1 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-1.yaml"
cam11_test1_2 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-2.yaml"
cam11_test1_3 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-3.yaml"
cam11_test2_1 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-1.yaml"
cam11_test2_2 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-2.yaml"
cam11_test2_3 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-3.yaml"
cam13_test2_1 = "../components/assets/yaml/cam-13-test-2/cam-13-test-2-1.yaml"
cam13_test2_2 = "../components/assets/yaml/cam-13-test-2/cam-13-test-2-2.yaml"
cam13_test2_3 = "../components/assets/yaml/cam-13-test-2/cam-13-test-2-3.yaml"

diagram_completion = "../components/assets/yaml/reading-practices/diagram-completion.yaml"
flow_chart_completion = "../components/assets/yaml/reading-practices/flow-chart-completion.yaml"
sentence_completion = "../components/assets/yaml/reading-practices/sentence-completion.yaml"
summary_completion = "../components/assets/yaml/reading-practices/summary-completion.yaml"
table_completion = "../components/assets/yaml/reading-practices/table-completion.yaml"
note_completion = "../components/assets/yaml/reading-practices/note-completion.yaml"
summary_completion_word_list = "../components/assets/yaml/reading-practices/summary-completion-word-list.yaml"
matching_features = "../components/assets/yaml/reading-practices/matching-features.yaml"
matching_headings = "../components/assets/yaml/reading-practices/matching-headings.yaml"
matching_sentence_endings = "../components/assets/yaml/reading-practices/matching-sentence-endings.yaml"
matching_paragraphs = "../components/assets/yaml/reading-practices/matching-paragraphs.yaml"
true_false_notgiven = "../components/assets/yaml/reading-practices/true-false-notgiven.yaml"
yes_no_notgiven = "../components/assets/yaml/reading-practices/yes-no-notgiven.yaml"
multiple_choice_select_one = "../components/assets/yaml/reading-practices/multiple-choice-select-one.yaml"
multiple_choice_select_many = "../components/assets/yaml/reading-practices/multiple-choice-select-many.yaml"

raw_data = load_yaml_file(cam11_test1_2)
raw_data["question_content"]

[{'task_type': 'true-false-notgiven\n',
  'task_question_number': 'Questions 14-19\n',
  'task_description': 'Do the following statements agree with the information given in Reading Passage 2?\n\nIn boxes 14-19 on your answer sheet, write\n\nTRUE                if the statement agrees with the information\n\nFALSE               if the statement contradicts the information\n\nNOT GIVEN     if there is no information on this\n',
  'question_main_title': '',
  'question_main_text': '14   The Falkirk Wheel has linked the Forth & Clyde Canal with the Union Canal for the first time in their history.\n\n15   There was some opposition to the design of the Falkirk Wheel at first.\n\n16   The Falkirk Wheel was initially put together at the location where its components were manufactured.\n\n17   The Falkirk Wheel is the only boat lift in the world which has steel sections bolted together by hand.\n\n18   The weight of the gondolas varies according to the size of boat being carried.\n\n19   The c

## Parse Passage & Reading Info

In [27]:
raw_data = load_yaml_file(cam11_test1_1)
reading_data = raw_data["reading_info"]

display(reading_data)
parse_reading_info(reading_data)

{'reading_title': 'READING PASSAGE 1\n',
 'reading_subtitle': 'You should spend about 20 minutes on Questions 1-13 which are based on Reading Passage 1 below.\n'}

{'readingTitle': 'READING PASSAGE 1',
 'readingSubtitle': 'You should spend about 20 minutes on Questions 1-13 which are based on Reading Passage 1 below.'}

In [44]:
# raw_data = load_yaml_file(cam11_test2_2)
raw_data = load_yaml_file(cam11_test1_3)
populate_metadata(raw_data)
passage_data = raw_data["passage_content"]
display(passage_data)
# parse_passage_content(passage_data)
def clean_floating_linebreaks(text):
    # Pattern to match any \n not preceded by a period OR not followed by an uppercase letter
    pattern = re.compile(r'(?<![A-Z\?\!\.\'\"\‘\’\`])\n|(?<=\n)(?![A-Z])')
    return pattern.sub('', text)

def parse_passage_content(passage_data):
    metadata = passage_data['metadata']
    paragraph_markers = passage_data['paragraph_markers']
    passage_context = clean_text_single_line(passage_data["passage_context"])
    passage_title = clean_text_single_line(passage_data["passage_title"])
    passage_subtitle = clean_text_single_line(passage_data["passage_subtitle"])
    if metadata['contains_floating_breaks']:
        passage_main_text = clean_floating_linebreaks(passage_data["passage_main_text"])
    else:
        passage_main_text = clean_text_multiple_line(passage_data["passage_main_text"])

    if paragraph_markers:
        split_pattern = re.compile(r'\n+(?=[A-Z]\n+[^A-Za-z]?[A-Z])')
        match_pattern = re.compile(r'([A-Z])\n+([^A-Za-z]?[A-Z].*)')
        passsage_paragraphs = re.split(split_pattern, passage_main_text) # ['A\nEaster Island, or ', 'B\nWhen the Europeans', 'C\nThe moai, he think',...]
        passsage_paragraphs = [match_pattern.match(a).groups() for a in passsage_paragraphs]     # For each paragraph, match into header and content groups
    else:
        passsage_paragraphs = re.split(r'\n+(?=[A-Z])', passage_main_text)

    return {
        "hasParagraphMarkers": paragraph_markers,
        "passageContext": passage_context,
        "passageTitle": passage_title,
        "passageSubtitle": passage_subtitle,
        "passageMainText": passage_main_text,
        "passageParagraphs": passsage_paragraphs
    }

parse_passage_content(passage_data)

{'paragraph_markers': True,
 'passage_context': '',
 'passage_title': 'Reducing the Effects of Climate Change\n',
 'passage_subtitle': 'Mark Rowe reports on the increasingly ambitious geo-engineering projects being explored by scientists\n',
 'passage_main_text': 'A\n\nSuch is our dependence on fossil fuels, and such is the volume of carbon dioxide already released into the atmosphere, that many experts agree that significant global warming is now inevitable. They believe that the best we can do is keep it at a reasonable level, and at present the only serious option for doing this is cutting back on our carbon emissions. But while a few countries are making major strides in this regard, the majority are having great difficulty even stemming the rate of increase, let alone reversing it. Consequently, an increasing number of scientists are beginning to explore the alternative of geo-engineering — a term which generally refers to the intentional large-scale manipulation of the environmen

{'hasParagraphMarkers': True,
 'passageContext': '',
 'passageTitle': 'Reducing the Effects of Climate Change',
 'passageSubtitle': 'Mark Rowe reports on the increasingly ambitious geo-engineering projects being explored by scientists',
 'passageMainText': 'A\nSuch is our dependence on fossil fuels, and such is the volume of carbon dioxide already released into the atmosphere, that many experts agree that significant global warming is now inevitable. They believe that the best we can do is keep it at a reasonable level, and at present the only serious option for doing this is cutting back on our carbon emissions. But while a few countries are making major strides in this regard, the majority are having great difficulty even stemming the rate of increase, let alone reversing it. Consequently, an increasing number of scientists are beginning to explore the alternative of geo-engineering — a term which generally refers to the intentional large-scale manipulation of the environment. Accord

In [54]:
metadata = passage_data['metadata']
paragraph_markers = passage_data['paragraph_markers']
passage_context = clean_text_single_line(passage_data["passage_context"])
passage_title = clean_text_single_line(passage_data["passage_title"])
passage_subtitle = clean_text_single_line(passage_data["passage_subtitle"])
if metadata['contains_floating_breaks']:
    passage_main_text = clean_floating_linebreaks(passage_data["passage_main_text"])
else:
    passage_main_text = clean_text_multiple_line(passage_data["passage_main_text"])

if paragraph_markers:
    split_pattern = re.compile(r'\n+(?=[A-Z]\n+[^A-Za-z]?[A-Z])')
    match_pattern = re.compile(r'([A-Z])\n+([^A-Za-z]?[A-Z].*)')
    passsage_paragraphs = re.split(split_pattern, passage_main_text) # ['A\nEaster Island, or ', 'B\nWhen the Europeans', 'C\nThe moai, he think',...]
    passsage_paragraphs = [match_pattern.match(a).groups() for a in passsage_paragraphs]     # For each paragraph, match into header and content groups
else:
    passsage_paragraphs = re.split(r'\n+(?=[A-Z])', passage_main_text)

passsage_paragraphs

[('A',
  'Such is our dependence on fossil fuels, and such is the volume of carbon dioxide already released into the atmosphere, that many experts agree that significant global warming is now inevitable. They believe that the best we can do is keep it at a reasonable level, and at present the only serious option for doing this is cutting back on our carbon emissions. But while a few countries are making major strides in this regard, the majority are having great difficulty even stemming the rate of increase, let alone reversing it. Consequently, an increasing number of scientists are beginning to explore the alternative of geo-engineering — a term which generally refers to the intentional large-scale manipulation of the environment. According to its proponents, geo-engineering is the equivalent of a backup generator: if Plan A – reducing our dependency on fossil fuels – fails, we require a Plan B, employing grand schemes to slow down or reverse the process of global warming.'),
 ('B',


In [9]:
display(load_yaml_file(cam11_test2_2))
parse_reading_from_yaml(cam11_test2_2)

{'reading_info': {'reading_title': 'READING PASSAGE 2\n',
  'reading_subtitle': 'You should spend about 20 minutes on Questions 14-26 which are based on Reading Passage 2 below. \n'},
 'passage_content': {'paragraph_markers': True,
  'passage_context': '',
  'passage_title': 'What destroyed the civilisation of Easter Island?\n',
  'passage_subtitle': 'How a sixteenth-century warship was recovered from the seabed\n',
  'passage_main_text': 'A\n\nEaster Island, or Rapu Nui as it is known locally, is home to several hundred ancient human statues – the moai. After this remote Pacific island was settled by the Polynesians, it remained isolated for centuries. All the energy and resources that went into the moai – some of which are ten metres tall and weigh over 7,000 kilos – came from the island itself. Yet when Dutch explorers landed in 1722, they met a Stone Age culture. The moai were carved with stone tools, then transported for many kilometres, without the use of animals or wheels, to ma

{'readingInfo': {'readingTitle': 'READING PASSAGE 2',
  'readingSubtitle': 'You should spend about 20 minutes on Questions 14-26 which are based on Reading Passage 2 below.'},
 'passageContent': {'hasParagraphMarkers': True,
  'passageContext': '',
  'passageTitle': 'What destroyed the civilisation of Easter Island?',
  'passageSubtitle': 'How a sixteenth-century warship was recovered from the seabed',
  'passageMainText': 'A\nEaster Island, or Rapu Nui as it is known locally, is home to several hundred ancient human statues – the moai. After this remote Pacific island was settled by the Polynesians, it remained isolated for centuries. All the energy and resources that went into the moai – some of which are ten metres tall and weigh over 7,000 kilos – came from the island itself. Yet when Dutch explorers landed in 1722, they met a Stone Age culture. The moai were carved with stone tools, then transported for many kilometres, without the use of animals or wheels, to massive stone platfo

### Clean Paragraph

In [10]:
passage_main_text = load_yaml_file(matching_headings)['passage_content']['passage_main_text']

def clean_floating_linebreaks(text):
    # Pattern to match any \n not preceded by a period OR not followed by an uppercase letter
    pattern = re.compile(r'(?<![A-Z\.])\n|(?<=\n)(?![A-Z])')
    return pattern.sub('', text)

clean_text = clean_floating_linebreaks(passage_main_text)
for line in re.split(r'\n+', clean_text):
    print(line + '\n')

A

Some years ago, when several theoretical physicists, principally Dirk Helbing and Boris Kerner of Stuttgart, Germany, began publishing papers on traffic flow in publications normally read by traffic engineers, they were clearly working outside their usual sphere of investigation. They had noticed that if they simulated the movement of vehicles on a highway, using the equations that describe how the molecules of a gas move, some very strange results emerged. Of course, vehicles do not behave exactly like gas molecules: for example, drivers try to avoid collisions by slowing down when they get too near another vehicle, whereas gas molecules have no such concern. However, the physicists modified the equations to take the differences into account and the overall description of traffic as a flowing gas has proved to be a very good one; the moving-gas model of traffic reproduces many phenomena seen in real-world traffic.

The strangest thing that came out of these equations, however, was 

In [11]:
parse_passage_content(load_yaml_file(matching_headings)['passage_content'])

{'hasParagraphMarkers': True,
 'passageContext': '[Note: This is an extract from a Part 2 text about the physics of traffic behaviour.] © 2000 The Atlantic Media Co., as first published in The Atlantic Magazine. All rights reserved. Distributed by Tribune Content Agency.',
 'passageTitle': 'The Physics of Traffic Behavior',
 'passageSubtitle': '',
 'passageMainText': 'A\nSome years ago, when several theoretical physicists, principally Dirk Helbing and Boris Kerner of Stuttgart, Germany, began publishing papers on traffic flow in publications normally read by traffic engineers, they were clearly working outside their usual sphere of investigation. They had noticed that if they simulated the movement of vehicles on a highway, using the equations that describe how the molecules of a gas move, some very strange results emerged. Of course, vehicles do not behave exactly like gas molecules: for example, drivers try to avoid collisions by slowing down when they get too near another vehicle, w

## Parse Questions

### Diagram & Flow Chart Completion

In [55]:
#  Completion questions
def parse_diagram_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    question_img_path = questionTask['question_img_path'].strip()  # Most important
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionImgPath": question_img_path, # Most important
        "questionItems": question_items,
    }


def parse_flow_chart_completion(questionTask):
    return parse_diagram_completion(questionTask)

questionTask = load_yaml_file(cam11_test1_2)["question_content"][1]
# questionTask = load_yaml_file(cam11_test2_1)["question_content"][2]

questionTask = load_yaml_file(diagram_completion)["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_diagram_completion(questionTask)

Raw version


{'task_type': 'diagram-completion\n',
 'task_question_number': 'Questions 1 – 5\n',
 'task_description': 'Label the diagram below.\nChoose NO MORE THAN THREE WORDS from the passage for each answer.\nWrite your answers in boxes 1-5 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '',
 'question_img_path': '../assets/images/reading/diagram_completion_1.jpg\n',
 'question_list_title': '',
 'question_list_of_options': '',
 'example_answer': '',
 'correct_answer': '1 hot dry air ■ dry hot air\n2 cool moist air ■ moist cool air\n3 infrared radiation/light\n4 (a/the) condenser\n5 pure/distilled water ■ pure distilled water'}

Parsed version


{'taskType': 'diagram-completion',
 'taskQuestionNumberList': [1, 2, 3, 4, 5],
 'taskQuestionNumberText': 'Questions 1 – 5',
 'taskDescription': 'Label the diagram below.\nChoose NO MORE THAN THREE WORDS from the passage for each answer.\nWrite your answers in boxes 1-5 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': '',
 'questionImgPath': '../assets/images/reading/diagram_completion_1.jpg',
 'questionItems': [{'questionNumber': 1,
   'correctAnswer': 'hot dry air ■ dry hot air'},
  {'questionNumber': 2, 'correctAnswer': 'cool moist air ■ moist cool air'},
  {'questionNumber': 3, 'correctAnswer': 'infrared radiation/light'},
  {'questionNumber': 4, 'correctAnswer': '(a/the) condenser'},
  {'questionNumber': 5,
   'correctAnswer': 'pure/distilled water ■ pure distilled water'}]}

### Sentence, Summary and Note Completion

In [68]:
def parse_sentence_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()
    
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)

    # matching patterns
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_items = re.split(r'\n+', correct_answer)
    question_main_text_lines = re.split(r'\n+', question_main_text)

    # Idea: Correct-answer-bassed question items. Each question item will be created for each correct answer   
    question_items = []
    for answer_item in correct_answer_items:
        answer_item = answer_item.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_item)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
    })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text_lines,
        "questionItems": question_items,
    }

def parse_summary_completion(questionTask):
    return parse_sentence_completion(questionTask)

def parse_note_completion(questionTask):
    return parse_sentence_completion(questionTask)
    
# questionTask = load_yaml_file(cam13_test2_1)["question_content"][0] # Note completion
# questionTask = load_yaml_file(cam11_test2_2)["question_content"][1] # Summary completion
# questionTask = load_yaml_file(cam11_test1_1)["question_content"][0] # Sentence completion
yamlPath = sentence_completion
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
print(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_sentence_completion(questionTask)



Raw paragraph
The science of evolutionary relationships has undergone a major change in recent decades. It used 
to be the case that all the features of organisms were important in working out their family tree. 
But following the work of German entomologist Willi Hennig, many evolutionary scientists now 
believe that the only features which carry any useful information are the evolutionary ‘novelties’ 
shared between organisms. Mice, lizards and fish, for example, all have backbones – so the feature 
‘backbone’ tells us nothing about their evolutionary relationship. But the feature ‘four legs’ is 
useful because it’s an evolutionary novelty – a characteristic shared only between the lizard and 
the mouse. This would suggest that the lizard and mouse are more closely related to each other 
than either is to the fish. This revolutionary approach is called cladistics, and it has been central to 
the idea that birds evolved from dinosaurs.
The ‘birds are dinosaurs’ theory was first develo

{'task_type': 'sentence-completion\n',
 'task_question_number': 'Questions 1 – 5\n',
 'task_description': 'Complete the sentences below.\nChoose ONE WORD ONLY from the passage for each answer.\nWrite your answers in boxes 1-5 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '1 Huxley formulated his theory while studying a dinosaur belonging to a group called ………… .\n2 Heilmann rejected Huxley’s theory because of the apparent absence of.................in dinosaurs.\n3 Feduccia and Martin believe that the ancestor of today’s birds was a kind of early ………… .\n4 In cladistics, the ............... between organisms’ characteristics are of major importance.\n5 The dangerous................on a primitive bird from Madagascar adds weight to the ‘dino-bird’ argument.\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': '',
 'example_answer': '',
 'correct_answer': '1 theropods\n2 collarbones\n3 reptile\n4 similarities\n5 claw',
 'me

Parsed version


{'taskType': 'sentence-completion',
 'taskQuestionNumberList': [1, 2, 3, 4, 5],
 'taskQuestionNumberText': 'Questions 1 – 5',
 'taskDescription': 'Complete the sentences below.\nChoose ONE WORD ONLY from the passage for each answer.\nWrite your answers in boxes 1-5 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': ['1 Huxley formulated his theory while studying a dinosaur belonging to a group called ………… .',
  '2 Heilmann rejected Huxley’s theory because of the apparent absence of.................in dinosaurs.',
  '3 Feduccia and Martin believe that the ancestor of today’s birds was a kind of early ………… .',
  '4 In cladistics, the ............... between organisms’ characteristics are of major importance.',
  '5 The dangerous................on a primitive bird from Madagascar adds weight to the ‘dino-bird’ argument.'],
 'questionItems': [{'questionNumber': 1, 'correctAnswer': 'theropods'},
  {'questionNumber': 2, 'correctAnswer': 'collarbones'},
  {'questionNumber'

In [66]:
parse_passage_content(raw_data['passage_content'])

{'hasParagraphMarkers': False,
 'passageContext': '[Note: This is an extract from a Part 2 task about the evolution of birds and their ancestry.]',
 'passageTitle': 'The origins of birds',
 'passageSubtitle': '',
 'passageMainText': 'The science of evolutionary relationships has undergone a major change in recent decades. It used to be the case that all the features of organisms were important in working out their family tree. But following the work of German entomologist Willi Hennig, many evolutionary scientists now believe that the only features which carry any useful information are the evolutionary ‘novelties’ shared between organisms. Mice, lizards and fish, for example, all have backbones – so the feature ‘backbone’ tells us nothing about their evolutionary relationship. But the feature ‘four legs’ is useful because it’s an evolutionary novelty – a characteristic shared only between the lizard and the mouse. This would suggest that the lizard and mouse are more closely related t

### Table Completion

In [70]:
def parse_table_completion(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_question_number_list = parse_task_question_number(task_question_number)
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    table_data_rows = [tuple(r.strip() for r in re.split(r'\|',row)) for row in question_main_text_lines]


    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": table_data_rows,
        "questionItems": question_items,
    }

# questionTask = load_yaml_file(cam11_test1_3)["question_content"][1]
yamlPath = table_completion
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_table_completion(questionTask)


Raw paragraph
Introducing dung1 beetles into a pasture is a simple process: approximately 1,500 beetles 
are released, a handful at a time, into fresh cow pats2 in the cow pasture. The beetles 
immediately disappear beneath the pats digging and tunnelling and, if they successfully 
adapt to their new environment, soon become a permanent, self-sustaining part of the 
local ecology. In time they multiply and within three or four years the benefits to the 
pasture are obvious.
Dung beetles work from the inside of the pat so they are sheltered from predators such 
as birds and foxes. Most species burrow into the soil and bury dung in tunnels directly 
underneath the pats, which are hollowed out from within. Some large species originating 
from France excavate tunnels to a depth of approximately 30 cm below the dung pat.
These beetles make sausage-shaped brood chambers along the tunnels. The shallowest 
tunnels belong to a much smaller Spanish species that buries dung in chambers that hang 

['Introducing dung1 beetles into a pasture is a simple process: approximately 1,500 beetles are released, a handful at a time, into fresh cow pats2 in the cow pasture. The beetles immediately disappear beneath the pats digging and tunnelling and, if they successfully adapt to their new environment, soon become a permanent, self-sustaining part of the local ecology. In time they multiply and within three or four years the benefits to the pasture are obvious.',
 'Dung beetles work from the inside of the pat so they are sheltered from predators such as birds and foxes. Most species burrow into the soil and bury dung in tunnels directly underneath the pats, which are hollowed out from within. Some large species originating from France excavate tunnels to a depth of approximately 30 cm below the dung pat.',
 'These beetles make sausage-shaped brood chambers along the tunnels. The shallowest tunnels belong to a much smaller Spanish species that buries dung in chambers that hang like fruit fr

Raw version


{'task_type': 'table-completion',
 'task_question_number': 'Question 9 – 13\n',
 'task_description': 'Complete the table below.\nChoose NO MORE THAN THREE WORDS from the passage for each answer.\nWrite your answers in boxes 9-13 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': 'Species | Size | Preferred climate | Complementary species | Start of active period | Number of generations per year\nFrench | 2.5 cm | cool | Spanish | late spring | 1 - 2\nSpanish | 1.25 cm | [9] |  | [10] | [11]\nSouth African ball roller |  | [12] | [13] | |\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': '',
 'example_answer': '',
 'correct_answer': '9 temperate\n10 early spring\n11 two to five / 2-5\n12 sub-tropical\n13 South African tunneling/tunnelling',
 'metadata': {'contains_floating_breaks': True}}

Parsed version


{'taskType': 'table-completion',
 'taskQuestionNumberList': [9],
 'taskQuestionNumberText': 'Question 9 – 13',
 'taskDescription': 'Complete the table below.\nChoose NO MORE THAN THREE WORDS from the passage for each answer.\nWrite your answers in boxes 9-13 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': [('Species',
   'Size',
   'Preferred climate',
   'Complementary species',
   'Start of active period',
   'Number of generations per year'),
  ('French', '2.5 cm', 'cool', 'Spanish', 'late spring', '1 - 2'),
  ('Spanish', '1.25 cm', '[9]', '', '[10]', '[11]'),
  ('South African ball roller', '', '[12]', '[13]', '', '')],
 'questionItems': [{'questionNumber': 9, 'correctAnswer': 'temperate'},
  {'questionNumber': 10, 'correctAnswer': 'early spring'},
  {'questionNumber': 11, 'correctAnswer': 'two to five / 2-5'},
  {'questionNumber': 12, 'correctAnswer': 'sub-tropical'},
  {'questionNumber': 13,
   'correctAnswer': 'South African tunneling/tunnelling'}]}

### Matching Features

In [71]:
# Matching questions
def parse_matching_features(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch


    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, # option is a tuple but correct answer might be a string
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

# questionTask = load_yaml_file(cam11_test1_3)["question_content"][2]
# questionTask = load_yaml_file(cam11_test2_1)["question_content"][1]
yamlPath = matching_features
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_matching_features(questionTask)


Raw paragraph
The invention of rockets is linked inextricably with the invention of 'black powder'. Most 
historians of technology credit the Chinese with its discovery. They base their belief on 
studies of Chinese writings or on the notebooks of early Europeans who settled in or 
made long visits to China to study its history and civilisation. It is probable that, some 
time in the tenth century, black powder was first compounded from its basic ingredients 
of saltpetre, charcoal and sulphur. But this does not mean that it was immediately used 
to propel rockets. By the thirteenth century, powder-propelled fire arrows had become 
rather common. The Chinese relied on this type of technological development to 
produce incendiary projectiles of many sorts, explosive grenades and possibly cannons 
to repel their enemies. One such weapon was the 'basket of fire' or, as directly 
translated from Chinese, the 'arrows like flying leopards'. The 0.7 metre-long arrows, 
each with a long tube o

["The invention of rockets is linked inextricably with the invention of 'black powder'. Most historians of technology credit the Chinese with its discovery. They base their belief on studies of Chinese writings or on the notebooks of early Europeans who settled in or made long visits to China to study its history and civilisation. It is probable that, some time in the tenth century, black powder was first compounded from its basic ingredients of saltpetre, charcoal and sulphur. But this does not mean that it was immediately used to propel rockets. By the thirteenth century, powder-propelled fire arrows had become rather common. The Chinese relied on this type of technological development to produce incendiary projectiles of many sorts, explosive grenades and possibly cannons to repel their enemies. One such weapon was the 'basket of fire' or, as directly translated from Chinese, the 'arrows like flying leopards'. The 0.7 metre-long arrows, each with a long tube of gunpowder attached ne

Raw version


{'task_type': 'matching-features\n',
 'task_question_number': 'Questions 7 – 10\n',
 'task_description': 'Look at the following items (Questions 7-10) and the list of groups below. \nMatch each item with the group which first invented or used them.\nWrite the correct letter A-E in boxes 7-10 on your answer sheet.\nNB You may use any letter more than once.\n',
 'question_main_title': '',
 'question_main_text': '7 black powder\n8 rocket-propelled arrows for fighting\n9 rockets as war weapons\n10 the rocket launcher\n',
 'question_img_path': '',
 'question_list_title': 'First invented or used by\n',
 'question_list_of_options': 'A the Chinese \nB the Indians \nC the British \nD the Arabs\nE the Americans\n',
 'example_answer': '',
 'correct_answer': '7 A\n8 A\n9 B\n10 E',
 'metadata': {'contains_floating_breaks': True}}

Parsed version


{'taskType': 'matching-features',
 'taskQuestionNumberList': [7, 8, 9, 10],
 'taskQuestionNumberText': 'Questions 7 – 10',
 'taskDescription': 'Look at the following items (Questions 7-10) and the list of groups below.\nMatch each item with the group which first invented or used them.\nWrite the correct letter A-E in boxes 7-10 on your answer sheet.\nNB You may use any letter more than once.',
 'questionMainTitle': '',
 'questionMainText': '7 black powder\n8 rocket-propelled arrows for fighting\n9 rockets as war weapons\n10 the rocket launcher',
 'questionListTitle': 'First invented or used by',
 'questionListOptions': [('A', 'the Chinese '),
  ('B', 'the Indians '),
  ('C', 'the British '),
  ('D', 'the Arabs'),
  ('E', 'the Americans')],
 'questionItems': [{'questionNumber': 7,
   'questionText': 'black powder',
   'questionOptions': [('A', 'the Chinese '),
    ('B', 'the Indians '),
    ('C', 'the British '),
    ('D', 'the Arabs'),
    ('E', 'the Americans')],
   'correctAnswer': '

### Matching Headings

In [72]:


def parse_matching_headings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)
    # question_list_of_options_lines = re.split(r'\n+', question_list_of_options)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+([ixv]+)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_option_item_pattern = re.compile(r'([ixv]+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch

    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        # "questionMainText": question_main_text, # Not needed
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }


# questionTask = load_yaml_file(cam11_test2_2)["question_content"][0]
yamlPath = matching_headings
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_matching_headings(questionTask)


Raw paragraph
A
Some years ago, when several theoretical physicists, principally Dirk Helbing and Boris Kerner of 
Stuttgart, Germany, began publishing papers on traffic flow in publications normally read by traffic 
engineers, they were clearly working outside their usual sphere of investigation. They had noticed 
that if they simulated the movement of vehicles on a highway, using the equations that describe 
how the molecules of a gas move, some very strange results emerged. Of course, vehicles do not 
behave exactly like gas molecules: for example, drivers try to avoid collisions by slowing down when 
they get too near another vehicle, whereas gas molecules have no such concern. However, the 
physicists modified the equations to take the differences into account and the overall description of 
traffic as a flowing gas has proved to be a very good one; the moving-gas model of traffic 
reproduces many phenomena seen in real-world traffic.
The strangest thing that came out of these equ

[('A',
  'Some years ago, when several theoretical physicists, principally Dirk Helbing and Boris Kerner of Stuttgart, Germany, began publishing papers on traffic flow in publications normally read by traffic engineers, they were clearly working outside their usual sphere of investigation. They had noticed that if they simulated the movement of vehicles on a highway, using the equations that describe how the molecules of a gas move, some very strange results emerged. Of course, vehicles do not behave exactly like gas molecules: for example, drivers try to avoid collisions by slowing down when they get too near another vehicle, whereas gas molecules have no such concern. However, the physicists modified the equations to take the differences into account and the overall description of traffic as a flowing gas has proved to be a very good one; the moving-gas model of traffic reproduces many phenomena seen in real-world traffic.'),
 ('B',
  'Though a decidedly unsettling discovery, this sh

Raw version


{'task_type': 'matching-headings\n',
 'task_question_number': 'Questions 1 – 4\n',
 'task_description': 'Reading Passage 1 has five sections, A-E.\nChoose the correct heading for each section from the list of headings below.\nWrite the correct number, i-viii, in boxes 1-4 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '1 Section A\n2 Section C\n3 Section D\n4 Section E\n',
 'question_img_path': '',
 'question_list_title': 'List of Headings\n',
 'question_list_of_options': 'i Dramatic effects can result from small changes in traffic just as in nature\nii How a maths experiment actually reduced traffic congestion\niii How a concept from one field of study was applied in another\niv A lack of investment in driver training\nv Areas of doubt and disagreement between experts\nvi How different countries have dealt with traffic congestion\nvii The impact of driver behaviour on traffic speed\nviii A proposal to take control away from the driver\n',
 'example_answer

Parsed version


{'taskType': 'matching-headings',
 'taskQuestionNumberList': [1, 2, 3, 4],
 'taskQuestionNumberText': 'Questions 1 – 4',
 'taskDescription': 'Reading Passage 1 has five sections, A-E.\nChoose the correct heading for each section from the list of headings below.\nWrite the correct number, i-viii, in boxes 1-4 on your answer sheet.',
 'questionMainTitle': '',
 'questionListTitle': 'List of Headings',
 'questionListOptions': [('i',
   'Dramatic effects can result from small changes in traffic just as in nature'),
  ('ii', 'How a maths experiment actually reduced traffic congestion'),
  ('iii', 'How a concept from one field of study was applied in another'),
  ('iv', 'A lack of investment in driver training'),
  ('v', 'Areas of doubt and disagreement between experts'),
  ('vi', 'How different countries have dealt with traffic congestion'),
  ('vii', 'The impact of driver behaviour on traffic speed'),
  ('viii', 'A proposal to take control away from the driver')],
 'questionItems': [{'quest

### Matching Paragraphs

In [73]:
def parse_matching_paragraphs(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    # question_main_text = clean_floating_linebreaks(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')
    start_char, end_char = re.split('-', question_list_of_options)
    question_option_items = list_AZ(start_char, end_char)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_item_match = question_item_pattern.match(question_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }


# questionTask = load_yaml_file(cam13_test2_2)["question_content"][0]
# questionTask = load_yaml_file(matching_paragraphs)["question_content"][0]
yamlPath = matching_paragraphs
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_matching_paragraphs(questionTask)

Raw paragraph
A

Oxytocin is a chemical, a hormone produced in the pituitary gland in the brain. It was through various studies focusing on animals that scientists first became aware of the influence of oxytocin. They discovered that it helps reinforce the bonds between prairie voles, which mate for life, and triggers the motherly behaviour that sheep show towards their newborn lambs. It is also released by women in childbirth, strengthening the attachment between mother and baby. Few chemicals have as positive a reputation as oxytocin, which is sometimes referred to as the ‘love hormone’. One sniff of it can, it is claimed, make a person more trusting, empathetic, generous and cooperative. It is time, however, to revise this wholly optimistic view. A new wave of studies has shown that its effects vary greatly depending on the person and the circumstances, and it can impact on our social interactions for worse as well as for better.

B

Oxytocin’s role in human behaviour first emerged 

[('A',
  'Oxytocin is a chemical, a hormone produced in the pituitary gland in the brain. It was through various studies focusing on animals that scientists first became aware of the influence of oxytocin. They discovered that it helps reinforce the bonds between prairie voles, which mate for life, and triggers the motherly behaviour that sheep show towards their newborn lambs. It is also released by women in childbirth, strengthening the attachment between mother and baby. Few chemicals have as positive a reputation as oxytocin, which is sometimes referred to as the ‘love hormone’. One sniff of it can, it is claimed, make a person more trusting, empathetic, generous and cooperative. It is time, however, to revise this wholly optimistic view. A new wave of studies has shown that its effects vary greatly depending on the person and the circumstances, and it can impact on our social interactions for worse as well as for better.'),
 ('B',
  'Oxytocin’s role in human behaviour first emerge

Raw version


{'task_type': 'matching-paragraphs\n',
 'task_question_number': 'Questions 14-17\n',
 'task_description': 'Reading Passage 2 has six section, A-F.\n\nWhich paragraph contains the following information?\n\nWrite the correct letter, A-F, in boxes 14-17 on your answer sheet.\n\nNB  You may use any letter more than once.\n',
 'question_main_title': '',
 'question_main_text': '14   reference to research showing the beneficial effects of oxytocin on people\n\n15   reasons why the effects of oxytocin are complex\n\n16   mention of a period in which oxytocin attracted little scientific attention\n\n17   reference to people ignoring certain aspects of their research data\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A-F\n',
 'example_answer': '',
 'correct_answer': '14. B\n\n15. F\n\n16. B\n\n17. E\n',
 'metadata': {'contains_floating_breaks': True}}

Parsed version


{'taskType': 'matching-paragraphs',
 'taskQuestionNumberList': [14, 15, 16, 17],
 'taskQuestionNumberText': 'Questions 14-17',
 'taskDescription': 'Reading Passage 2 has six section, A-F.\nWhich paragraph contains the following information?\nWrite the correct letter, A-F, in boxes 14-17 on your answer sheet.\nNB  You may use any letter more than once.',
 'questionMainTitle': '',
 'questionMainText': '14   reference to research showing the beneficial effects of oxytocin on people\n\n15   reasons why the effects of oxytocin are complex\n\n16   mention of a period in which oxytocin attracted little scientific attention\n\n17   reference to people ignoring certain aspects of their research data',
 'questionListTitle': '',
 'questionListOptions': ['A', 'B', 'C', 'D', 'E', 'F'],
 'questionItems': [{'questionNumber': 14,
   'questionText': 'reference to research showing the beneficial effects of oxytocin on people',
   'questionOptions': ['A', 'B', 'C', 'D', 'E', 'F'],
   'correctAnswer': 'B'

### Matching Sentence Endings

In [74]:
def parse_matching_sentence_endings(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    # question_main_text = clean_floating_linebreaks(question_main_text) # Since it usually doesnt end with a period, this would cause issues
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+([A-Z])') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    question_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines,correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        question_text = question_line_pattern.match(question_line).group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "exampleAnswer": example_answer,
        "questionItems": question_items,
    }

# questionTask = load_yaml_file(cam13_test2_3)["question_content"][2]
yamlPath = matching_sentence_endings
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_summary_completion_word_list(questionTask)
parse_matching_sentence_endings(questionTask)


Raw paragraph
Deborah Harkness devotes her elegant 
and erudite new book, The Jewel House, to the 
scientific community in 16th-century London. 
She (rightly) argues that it is thanks to the 
imaginative collective efforts of the urban
scientists that London became the melting pot 
in which a new mathematical and 
experimental culture crystallized.
Harkness is known for her ingenuity as a 
researcher and her historical empathy. In The 
Jewel House, Harkness turns her skills on the 
city of London as a whole with surprising and 
fascinating results. She began her research by 
asking herself a new question: not what 
caused scientific revolution but what the 
names science and scientist meant in 16th-century London. Then she collected a vast 
range of sources, from printed books to
scientific instruments and notebooks, and 
recorded, in a relational database, information 
on the men and women who produced them.
Every chapter of The Jewel House charts the 
activities of a particular commu

['Deborah Harkness devotes her elegant and erudite new book, The Jewel House, to the scientific community in 16th-century London. She (rightly) argues that it is thanks to the imaginative collective efforts of the urbanscientists that London became the melting pot in which a new mathematical and experimental culture crystallized.',
 'Harkness is known for her ingenuity as a researcher and her historical empathy. In The Jewel House, Harkness turns her skills on the city of London as a whole with surprising and fascinating results. She began her research by asking herself a new question: not what caused scientific revolution but what the names science and scientist meant in 16th-century London. Then she collected a vast range of sources, from printed books toscientific instruments and notebooks, and recorded, in a relational database, information on the men and women who produced them.',
 'Every chapter of The Jewel House charts the activities of a particular community. Harkness leads us

Raw version


{'task_type': 'matching-sentence-endings\n',
 'task_question_number': 'Questions 1 – 3\n',
 'task_description': 'Complete each sentence with the correct ending, A-F, below. Write the correct letter, A-F, in boxes \n1-3 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '1 Harkness’s research method was different to that of other writers because\n2 Harkness’s reconstruction of the 16th-century London scientific groups was new because\n3 Harkness shows that the 16th-century London scientists were innovative because\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A she has the greatest knowledge of Elizabethan London.\nB she started by seeking to understand how basic terms were used in the past.\nC they worked as individuals rather than as a group.\nD she examined how their methods evolved and changed.\nE Clement Draper was the best scientist of his time.\nF they used old ways of analysing written information for new purpos

Parsed version


{'taskType': 'matching-sentence-endings',
 'taskQuestionNumberList': [1, 2, 3],
 'taskQuestionNumberText': 'Questions 1 – 3',
 'taskDescription': 'Complete each sentence with the correct ending, A-F, below. Write the correct letter, A-F, in boxes\n1-3 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': '1 Harkness’s research method was different to that of other writers because\n2 Harkness’s reconstruction of the 16th-century London scientific groups was new because\n3 Harkness shows that the 16th-century London scientists were innovative because',
 'questionListTitle': '',
 'questionListOptions': [('A',
   'she has the greatest knowledge of Elizabethan London.'),
  ('B',
   'she started by seeking to understand how basic terms were used in the past.'),
  ('C', 'they worked as individuals rather than as a group.'),
  ('D', 'she examined how their methods evolved and changed.'),
  ('E', 'Clement Draper was the best scientist of his time.'),
  ('F',
   'they used old w

### Summary Completion Word List

In [75]:
def parse_summary_completion_word_list(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    question_list_of_options = clean_text_multiple_line(question_list_of_options)
    task_question_number_list = parse_task_question_number(task_question_number)

    
    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+([A-Z])') 
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') 
    question_option_items = question_option_item_pattern.findall(question_list_of_options)
    #  Strip items

    # Idea: Each matching question item contains question, list of choices and correct answer
    
    question_items = []
    for answer_line in correct_answer_lines:
        answer_line = answer_line.strip()

        # Extract question number and correct answer for each question item
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = answer_item_match.group(1).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }


# questionTask = load_yaml_file(cam11_test2_3)["question_content"][1]
yamlPath = summary_completion_word_list
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_summary_completion_word_list(questionTask)


Raw paragraph
Of all mankind’s manifold creations, language must take pride of place. Other inventions – the 
wheel, agriculture, sliced bread – may have transformed our material existence, but the advent 
of language is what made us human. Compared to language, all other inventions pale in 
significance, since everything we have ever achieved depends on language and originates from 
it. Without language, we could never have embarked on our ascent to unparalleled power over
all other animals, and even over nature itself.
But language is foremost not just because it came first. In its own right it is a tool of 
extraordinary sophistication, yet based on an idea of ingenious simplicity: ‘this marvellous 
invention of composing out of twenty-five or thirty sounds that infinite variety of expressions 
which, whilst having in themselves no likeness to what is in our mind, allow us to disclose to 
others its whole secret, and to make known to those who cannot penetrate it all that we 
imagin

['Of all mankind’s manifold creations, language must take pride of place. Other inventions – the wheel, agriculture, sliced bread – may have transformed our material existence, but the advent of language is what made us human. Compared to language, all other inventions pale in significance, since everything we have ever achieved depends on language and originates from it. Without language, we could never have embarked on our ascent to unparalleled power overall other animals, and even over nature itself.',
 'But language is foremost not just because it came first. In its own right it is a tool of extraordinary sophistication, yet based on an idea of ingenious simplicity: ‘this marvellous invention of composing out of twenty-five or thirty sounds that infinite variety of expressions which, whilst having in themselves no likeness to what is in our mind, allow us to disclose to others its whole secret, and to make known to those who cannot penetrate it all that we imagine, and all the var

Raw version


{'task_type': 'summary-completion-word-list',
 'task_question_number': 'Questions 1 – 4\n',
 'task_description': 'Complete the summary using the list of words, A-G, below.\nWrite the correct letter, A-G, in boxes 1-4 on your answer sheet.\n',
 'question_main_title': 'The importance of language\n',
 'question_main_text': 'The wheel is one invention that has had a major impact on 1.................aspects of life, but no\nimpact has been as 2 ………… as that of language. Language is very 3 ………… , yet\ncomposed of just a small number of sounds. Language appears to be 4 ………… to use. \nHowever, its sophistication is often overlooked.\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A difficult \nB complex \nC original\nD admired \nE material \nF easy\nG fundamental\n',
 'example_answer': '',
 'correct_answer': '1 E ■ material\n2 G ■ fundamental\n3 B ■ complex\n4 F ■ easy',
 'metadata': {'contains_floating_breaks': True}}

Parsed version


{'taskType': 'summary-completion-word-list',
 'taskQuestionNumberList': [1, 2, 3, 4],
 'taskQuestionNumberText': 'Questions 1 – 4',
 'taskDescription': 'Complete the summary using the list of words, A-G, below.\nWrite the correct letter, A-G, in boxes 1-4 on your answer sheet.',
 'questionMainTitle': 'The importance of language',
 'questionMainText': 'The wheel is one invention that has had a major impact on 1.................aspects of life, but noimpact has been as 2 ………… as that of language. Language is very 3 ………… , yetcomposed of just a small number of sounds. Language appears to be 4 ………… to use. However, its sophistication is often overlooked.',
 'questionListTitle': '',
 'questionListOptions': [('A', 'difficult'),
  ('B', 'complex'),
  ('C', 'original'),
  ('D', 'admired'),
  ('E', 'material'),
  ('F', 'easy'),
  ('G', 'fundamental')],
 'questionItems': [{'questionNumber': 1,
   'questionOptions': [('A', 'difficult'),
    ('B', 'complex'),
    ('C', 'original'),
    ('D', 'admi

### True False Not Given & Yes No Not Given

In [76]:
def parse_true_false_notgiven(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_floating_linebreaks(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["TRUE", "FALSE", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_yes_no_notgiven(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_floating_linebreaks(question_main_text)
    correct_answer = clean_text_multiple_line(correct_answer)
    task_question_number_list = parse_task_question_number(task_question_number)


    # Split items
    question_main_text_lines = re.split(r'\n+', question_main_text)
    correct_answer_lines = re.split(r'\n+', correct_answer)

    # Matching patterns - both questions and correct answers
    correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    question_item_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 8. NOT GIVEN\n\n9. TRUE\n\n

    # Idea: Each TFNG question item contains question, list of choices (TFNG) and correct answer
    question_items = []
    for question_line, answer_line in zip(question_main_text_lines, correct_answer_lines):
        question_line, answer_line = question_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = question_item_pattern.match(question_line)
        answer_item_match = correct_answer_pattern.match(answer_line)
        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = answer_item_match.group(2).strip()
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": ["YES", "NO", "NOT GIVEN"], # Always the same
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

questionTask = load_yaml_file(cam11_test1_1)["question_content"][1] # TFNG
yamlPath = true_false_notgiven
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_true_false_notgiven(questionTask)
# parse_yes_no_notgiven(questionTask)
    

Raw paragraph
Marie Curie is probably the most famous woman scientist who has ever lived. Born Maria 
Sklodowska in Poland in 1867, she is famous for her work on radioactivity, and was twice a 
winner of the Nobel Prize. With her husband, Pierre Curie, and Henri Becquerel, she was 
awarded the 1903 Nobel Prize for Physics, and was then sole winner of the 1911 Nobel Prize 
for Chemistry. She was the first woman to win a Nobel Prize.
From childhood, Marie was remarkable for her prodigious memory, and at the age of 16 won 
a gold medal on completion of her secondary education. Because her father lost his savings 
through bad investment, she then had to take work as a teacher. From her earnings she was 
able to finance her sister Bronia’s medical studies in Paris, on the understanding that Bronia 
would, in turn, later help her to get an education.
In 1891 this promise was fulfilled and Marie went to Paris and began to study at the 
Sorbonne (the University of Paris). She often worked far 

['Marie Curie is probably the most famous woman scientist who has ever lived. Born Maria Sklodowska in Poland in 1867, she is famous for her work on radioactivity, and was twice a winner of the Nobel Prize. With her husband, Pierre Curie, and Henri Becquerel, she was awarded the 1903 Nobel Prize for Physics, and was then sole winner of the 1911 Nobel Prize for Chemistry. She was the first woman to win a Nobel Prize.',
 'From childhood, Marie was remarkable for her prodigious memory, and at the age of 16 won a gold medal on completion of her secondary education. Because her father lost his savings through bad investment, she then had to take work as a teacher. From her earnings she was able to finance her sister Bronia’s medical studies in Paris, on the understanding that Bronia would, in turn, later help her to get an education.',
 'In 1891 this promise was fulfilled and Marie went to Paris and began to study at the Sorbonne (the University of Paris). She often worked far into the nigh

Raw version


{'task_type': 'true-false-notgiven',
 'task_question_number': 'Questions 1 – 3\n',
 'task_description': 'Do the following statements agree with the information given in Reading Passage 1?\nIn boxes 1-3 on your answer sheet, write\nTRUE if the statement agrees with the information\nFALSE if the statement contradicts the information\nNOT GIVEN if there is no information on this\n',
 'question_main_title': '',
 'question_main_text': '1 Marie Curie’s husband was a joint winner of both Marie’s Nobel Prizes.\n2 Marie became interested in science when she was a child.\n3 Marie was able to attend the Sorbonne because of her sister’s financial contribution.\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': '',
 'example_answer': '',
 'correct_answer': '1 FALSE\n2 NOT GIVEN\n3 TRUE',
 'metadata': {'contains_floating_breaks': True}}

Parsed version


{'taskType': 'true-false-notgiven',
 'taskQuestionNumberList': [1, 2, 3],
 'taskQuestionNumberText': 'Questions 1 – 3',
 'taskDescription': 'Do the following statements agree with the information given in Reading Passage 1?\nIn boxes 1-3 on your answer sheet, write\nTRUE if the statement agrees with the information\nFALSE if the statement contradicts the information\nNOT GIVEN if there is no information on this',
 'questionMainTitle': '',
 'questionMainText': '1 Marie Curie’s husband was a joint winner of both Marie’s Nobel Prizes.\n2 Marie became interested in science when she was a child.\n3 Marie was able to attend the Sorbonne because of her sister’s financial contribution.',
 'questionItems': [{'questionNumber': 1,
   'questionText': 'Marie Curie’s husband was a joint winner of both Marie’s Nobel Prizes.',
   'questionOptions': ['TRUE', 'FALSE', 'NOT GIVEN'],
   'correctAnswer': 'FALSE'},
  {'questionNumber': 2,
   'questionText': 'Marie became interested in science when she was a

In [77]:
yamlPath = yes_no_notgiven
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_yes_no_notgiven(questionTask)

Raw paragraph
An emerging discipline called neuroaesthetics is seeking to bring scientific objectivity to the study of art, and has already given us a better understanding of many masterpieces. The blurred imagery of Impressionist paintings seems to stimulate the brain’s amygdala, for instance. Since the amygdala plays a crucial role in our feelings, that finding might explain why many people find these pieces so moving.

Could the same approach also shed light on abstract twentieth-century pieces, from Mondrian’s geometrical blocks of colour, to Pollock’s seemingly haphazard arrangements of splashed paint on canvas? Sceptics believe that people claim to like such works simply because they are famous. We certainly do have an inclination to follow the crowd. When asked to make simple perceptual decisions such as matching a shape to its rotated image, for example, people often choose a definitively wrong answer if they see others doing the same. It is easy to imagine that this mentality 

['An emerging discipline called neuroaesthetics is seeking to bring scientific objectivity to the study of art, and has already given us a better understanding of many masterpieces. The blurred imagery of Impressionist paintings seems to stimulate the brain’s amygdala, for instance. Since the amygdala plays a crucial role in our feelings, that finding might explain why many people find these pieces so moving.',
 'Could the same approach also shed light on abstract twentieth-century pieces, from Mondrian’s geometrical blocks of colour, to Pollock’s seemingly haphazard arrangements of splashed paint on canvas? Sceptics believe that people claim to like such works simply because they are famous. We certainly do have an inclination to follow the crowd. When asked to make simple perceptual decisions such as matching a shape to its rotated image, for example, people often choose a definitively wrong answer if they see others doing the same. It is easy to imagine that this mentality would hav

Raw version


{'task_type': 'yes-no-notgiven\n',
 'task_question_number': 'Questions 34-39\n',
 'task_description': 'Do the following statements agree with the views of the writer in Reading Passage 3?\n\nIn boxes 34-39 on your answer sheet, write\n\nYES                  if the statement agrees with the views of the writer\n\nNO                   if the statement contradicts the views of the writer\n\nNOT GIVEN    if there is no information on this\n',
 'question_main_title': '',
 'question_main_text': '34   Forsythe’s findings contradicted previous beliefs on the function of ‘fractals’ in art.\n\n35   Certain ideas regarding the link between ‘mirror neurons’ and art appreciation require further verification.\n\n36   People’s taste in paintings depends entirely on the current artistic trends of the period.\n\n37   Scientists should seek to define the precise rules which govern people’s reactions to works of art.\n\n38   Art appreciation should always involve taking into consideration the cultural co

Parsed version


{'taskType': 'yes-no-notgiven',
 'taskQuestionNumberList': [34, 35, 36, 37, 38, 39],
 'taskQuestionNumberText': 'Questions 34-39',
 'taskDescription': 'Do the following statements agree with the views of the writer in Reading Passage 3?\nIn boxes 34-39 on your answer sheet, write\nYES                  if the statement agrees with the views of the writer\nNO                   if the statement contradicts the views of the writer\nNOT GIVEN    if there is no information on this',
 'questionMainTitle': '',
 'questionMainText': '34   Forsythe’s findings contradicted previous beliefs on the function of ‘fractals’ in art.\n35   Certain ideas regarding the link between ‘mirror neurons’ and art appreciation require further verification.\n36   People’s taste in paintings depends entirely on the current artistic trends of the period.\n37   Scientists should seek to define the precise rules which govern people’s reactions to works of art.\n38   Art appreciation should always involve taking into co

### Multiple Choice Select Many

In [78]:


def parse_multiple_choice_select_many(questionTask):
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()


    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()

    # Not applicable but included for consistency
    question_img_path = questionTask['question_img_path'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_floating_linebreaks(question_main_text)
    task_question_number_list = parse_task_question_number(task_question_number)

    # matching patterns
    correct_answer_pattern_1 = re.compile(r'\d+[^a-zA-Z\d\(\)\-\+:]+([A-Z])') # 25. B\n26. D\n27. E'
    correct_answer_pattern_2 = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+.*') # 'B ■ They can predict...\nG ■ They are more...'
    question_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_lines = re.split(r'\n+', correct_answer)
    question_list_of_options_lines = re.split(r'\n+', question_list_of_options)


    question_items =  question_item_pattern.findall(question_list_of_options)
    if correct_answer_pattern_1.match(correct_answer):
        correct_answer_items = correct_answer_pattern_1.findall(correct_answer)
    elif correct_answer_pattern_2.match(correct_answer):
        correct_answer_items = correct_answer_pattern_2.findall(correct_answer)
    # correct_answer = correct_answer_pattern.findall(correct_answer)

    return {
    "taskType": task_type,
    "taskQuestionNumberList": task_question_number_list,
    "taskQuestionNumberText": task_question_number,
    "taskDescription": task_description,
    "questionMainTitle": question_main_title,
    "questionMainText": question_main_text,
    "questionItems": question_items,
    "correctAnswer": correct_answer_items
    }

# questionTask = load_yaml_file(cam11_test2_2)["question_content"][2]
yamlPath = multiple_choice_select_many
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_multiple_choice_select_many(questionTask)

Raw paragraph
Clearly, when older people do heavy physical work, their age may affect their productivity. But 
other skills may increase with age, including many that are crucial for good management, such as 
an ability to handle people diplomatically, to run a meeting or to spot a problem before it blows 
up. Peter Hicks, who co-ordinates OECD work on the policy implications of ageing, says that 
plenty of research suggests older people are paid more because they are worth more.
And the virtues of the young may be exaggerated. ‘The few companies that have kept on older 
workers find they have good judgement and their productivity is good,’ says Peter Peterson, 
author of a recent book on the impact of ageing. ‘Besides, their education standards are much 
better than those of today’s young high-school graduates.’ Companies may say that older 
workers are not worth training because they are reaching the end of their working lives; in fact, 
young people tend to switch jobs so frequently

['Clearly, when older people do heavy physical work, their age may affect their productivity. But other skills may increase with age, including many that are crucial for good management, such as an ability to handle people diplomatically, to run a meeting or to spot a problem before it blows up. Peter Hicks, who co-ordinates OECD work on the policy implications of ageing, says that plenty of research suggests older people are paid more because they are worth more.',
 'And the virtues of the young may be exaggerated. ‘The few companies that have kept on older workers find they have good judgement and their productivity is good,’ says Peter Peterson, author of a recent book on the impact of ageing. ‘Besides, their education standards are much better than those of today’s young high-school graduates.’ Companies may say that older workers are not worth training because they are reaching the end of their working lives; in fact, young people tend to switch jobs so frequently that they offer 

Raw version


{'task_type': 'multiple-choice-select-many',
 'task_question_number': 'Questions 1 and 2\n',
 'task_description': 'Choose TWO letters, A-G.\nWrite the correct letters in boxes 1 and 2 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': 'The list below gives some of the advantages of employing older workers.\nWhich TWO advantages are mentioned by the writer of the text?\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': 'A They are less likely to be involved in careless accidents.\nB They can predict areas that may cause trouble in the future.\nC They are able to train younger workers.\nD They can deal with unexpected problems.\nE They are more conscientious.\nF They are prepared to work for lower salaries.\nG They are more skilled in personal relationships.\n',
 'example_answer': '',
 'correct_answer': 'B ■ They can predict areas that may cause trouble in the future.\nG ■ They are more skilled in personal relationships.\n',
 

Parsed version


{'taskType': 'multiple-choice-select-many',
 'taskQuestionNumberList': [1, 2],
 'taskQuestionNumberText': 'Questions 1 and 2',
 'taskDescription': 'Choose TWO letters, A-G.\nWrite the correct letters in boxes 1 and 2 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': 'The list below gives some of the advantages of employing older workers.\nWhich TWO advantages are mentioned by the writer of the text?',
 'questionItems': [('A',
   'They are less likely to be involved in careless accidents.'),
  ('B', 'They can predict areas that may cause trouble in the future.'),
  ('C', 'They are able to train younger workers.'),
  ('D', 'They can deal with unexpected problems.'),
  ('E', 'They are more conscientious.'),
  ('F', 'They are prepared to work for lower salaries.'),
  ('G', 'They are more skilled in personal relationships.')],
 'correctAnswer': ['B', 'G']}

### Multiple Choice Select One

In [79]:
def parse_multiple_choice_select_one(questionTask):
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()
    task_question_number = questionTask['task_question_number'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()

    # Not applicable but included for consistency
    question_list_of_options = questionTask['question_list_of_options'].strip()
    question_img_path = questionTask['question_img_path'].strip()
    example_answer = questionTask['example_answer'].strip()

    #  Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_multiple_line(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)
    task_question_number_list = parse_task_question_number(task_question_number)

    mcq_question_content_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    mcq_question_option_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: A    Roger Angel\n\nB    Phil Rasch
    # correct_answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)') # Ex: 1. tomatoes 2. urban centres/ centers
    correct_answer_pattern_1 = re.compile(r'\d+[^a-zA-Z\d\(\)\-\+:]+([A-Z])') # 25. B\n26. D\n27. E'
    correct_answer_pattern_2 = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+.*') # 'B ■ They can predict...\nG ■ They are more...'
    # Split items
    multiple_choice_question_item_lines = re.split(r'\n(?=\d+\s)', question_main_text) # ['27    In the secon...\nA   the subject...\nB   the subject...', '28    The author...\nA   the subject...\nB   the subject...']
    correct_answer_lines = re.split(r'\n+', correct_answer) # ['27. A', '28. B']

    question_items = []
    for mcq_question_item_line, answer_line in zip(multiple_choice_question_item_lines, correct_answer_lines):
        mcq_question_item_line, answer_line = mcq_question_item_line.strip(), answer_line.strip()

        # Extract question number and correct answer for each question item
        question_item_match = mcq_question_content_pattern.match(mcq_question_item_line)
        question_option_items = mcq_question_option_pattern.findall(mcq_question_item_line)
        # answer_item_match = correct_answer_pattern.match(answer_line)
        if correct_answer_pattern_1.match(answer_line):
            correct_answer = correct_answer_pattern_1.match(answer_line).group(1)
        elif correct_answer_pattern_2.match(answer_line):
            correct_answer = correct_answer_pattern_2.match(answer_line).group(1)

        question_number = question_item_match.group(1).strip()
        question_text = question_item_match.group(2).strip()
        correct_answer = correct_answer
        question_items.append({
            "questionNumber": int(question_number),
            "questionText": question_text,
            "questionOptions": question_option_items, 
            "correctAnswer": correct_answer
        })

    return {
        "taskType": task_type,
        "taskQuestionNumberList": task_question_number_list,
        "taskQuestionNumberText": task_question_number,
        "taskDescription": task_description,
        "questionItems": question_items,
    }

questionTask = load_yaml_file(cam11_test2_3)["question_content"][0]
# 
yamlPath = multiple_choice_select_one
# questionTask = load_yaml_file(yamlPath)["question_content"][0]
raw_data = load_yaml_file(yamlPath)
populate_metadata(raw_data)
print('Raw paragraph')
print(raw_data['passage_content']['passage_main_text'])
parsed_paragraphs = parse_passage_content(raw_data['passage_content'])['passageParagraphs']
print('Parsed paragraph')
display(parsed_paragraphs)

questionTask = raw_data["question_content"][0]
print('Raw version')
display(questionTask)
print('Parsed version')
parse_multiple_choice_select_one(questionTask)

Raw paragraph
The general assumption is that older workers are paid more in spite of, rather than because of, 
their productivity. That might partly explain why, when employers are under pressure to cut costs, 
they persuade a 55-year old to take early retirement. Take away seniority-based pay scales, and 
older workers may become a much more attractive employment proposition. But most employers 
and many workers are uncomfortable with the idea of reducing someone’s pay in later life –
although manual workers on piece-rates often earn less as they get older. So retaining the 
services of older workers may mean employing them in different ways.
One innovation was devised by IBM Belgium. Faced with the need to cut staff costs, and having 
decided to concentrate cuts on 55 to 60-year olds, IBM set up a separate company called Skill 
Team, which re-employed any of the early retired who wanted to go on working up to the age of 
60. An employee who joined Skill Team at the age of 55 on a fiv

['The general assumption is that older workers are paid more in spite of, rather than because of, their productivity. That might partly explain why, when employers are under pressure to cut costs, they persuade a 55-year old to take early retirement. Take away seniority-based pay scales, and older workers may become a much more attractive employment proposition. But most employers and many workers are uncomfortable with the idea of reducing someone’s pay in later life –although manual workers on piece-rates often earn less as they get older. So retaining the services of older workers may mean employing them in different ways.',
 'One innovation was devised by IBM Belgium. Faced with the need to cut staff costs, and having decided to concentrate cuts on 55 to 60-year olds, IBM set up a separate company called Skill Team, which re-employed any of the early retired who wanted to go on working up to the age of 60. An employee who joined Skill Team at the age of 55 on a five-year contract w

Raw version


{'task_type': 'multiple-choice-select-one',
 'task_question_number': 'Questions 1 – 4\n',
 'task_description': 'Choose the correct letter, A, B, C or D.\nWrite the correct letter in boxes 1-4 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '1 In paragraph one, the writer suggests that companies could consider\nA abolishing pay schemes that are based on age.\nB avoiding pay that is based on piece-rates.\nC increasing pay for older workers.\nD equipping older workers with new skills.\n2 Skill Team is an example of a company which\nA offers older workers increases in salary.\nB allows people to continue working for as long as they want.\nC allows the expertise of older workers to be put to use.\nD treats older and younger workers equally.\n3 According to the writer, ‘bridge’ jobs\nA tend to attract people in middle-salary ranges.\nB are better paid than some full-time jobs.\nC originated in the United States.\nD appeal to distinct groups of older workers.\n4 D

Parsed version


{'taskType': 'multiple-choice-select-one',
 'taskQuestionNumberList': [1, 2, 3, 4],
 'taskQuestionNumberText': 'Questions 1 – 4',
 'taskDescription': 'Choose the correct letter, A, B, C or D.\nWrite the correct letter in boxes 1-4 on your answer sheet.',
 'questionItems': [{'questionNumber': 1,
   'questionText': 'In paragraph one, the writer suggests that companies could considerA abolishing pay schemes that are based on age.',
   'questionOptions': [('A', 'abolishing pay schemes that are based on age.'),
    ('B', 'avoiding pay that is based on piece-rates.'),
    ('C', 'increasing pay for older workers.'),
    ('D', 'equipping older workers with new skills.')],
   'correctAnswer': 'A'},
  {'questionNumber': 2,
   'questionText': 'Skill Team is an example of a company whichA offers older workers increases in salary.',
   'questionOptions': [('A', 'offers older workers increases in salary.'),
    ('B', 'allows people to continue working for as long as they want.'),
    ('C', 'allows t