In [6]:
import re
import json
import yaml
from IPython.display import display, Markdown, HTML

In [7]:
def load_yaml_file(filename):
    """Load a YAML document from disk and return the parsed Python object.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, and the source files contain non-ASCII punctuation.
    with open(filename, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

def clean_text_single_line(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading/trailing whitespace is collapsed as well, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def clean_text_multiple_line(text):
    """Squeeze each run of consecutive newlines in ``text`` into one newline."""
    newline_run = re.compile(r'\n+')
    return newline_run.sub('\n', text)

def clean_text_paragraph(text):
    """Reduce blank-line gaps in ``text``: consecutive newlines become one."""
    blank_line_gap = re.compile(r'\n+')
    return blank_line_gap.sub('\n', text)


#  Completion questions
def parse_diagram_completion(questionTask):
    """Parse a diagram-completion task into display fields and an answer key.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``question_img_path``, ``correct_answer``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText``, ``questionImgPath`` and ``questionItems``; every
        item is ``{"questionNumber": int, "correctAnswer": str}``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a non-blank answer line does not start with a number
            followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    question_img_path = questionTask['question_img_path'].strip()  # Most important
    correct_answer = clean_text_multiple_line(questionTask['correct_answer'].strip())

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_list_title', 'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Ex: 1. tomatoes 2. urban centres/ centers
    answer_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Idea: correct-answer-based question items, one item per answer line.
    question_items = []
    for answer_line in re.split(r'\n+', correct_answer):
        answer_line = answer_line.strip()
        if not answer_line:
            # An empty answer key (or a whitespace-only line) yields no item.
            continue

        answer_match = answer_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(answer_match.group(1).strip()),
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionImgPath": question_img_path,  # Most important
        "questionItems": question_items,
    }

def parse_flow_chart_completion(questionTask):
    """Parse a flow-chart-completion task.

    Delegates to ``parse_diagram_completion`` and returns its result unchanged.
    """
    parsed_task = parse_diagram_completion(questionTask)
    return parsed_task

def parse_sentence_completion(questionTask):
    """Parse a sentence-completion task into display fields and an answer key.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``, ``question_img_path``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText`` and ``questionItems``; every item is
        ``{"questionNumber": int, "correctAnswer": str}``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a non-blank answer line does not start with a number
            followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    # Clean the display texts the same way the other task parsers do.
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    correct_answer = questionTask['correct_answer'].strip()

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_img_path', 'question_list_title',
                         'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Ex: 1. tomatoes 2. urban centres/ centers
    answer_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Idea: correct-answer-based question items, one item per answer line.
    question_items = []
    for answer_line in re.split(r'\n+', correct_answer):
        answer_line = answer_line.strip()
        if not answer_line:
            # An empty answer key (or a whitespace-only line) yields no item.
            continue

        answer_match = answer_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(answer_match.group(1).strip()),
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_summary_completion(questionTask=None):
    """Placeholder for the summary-completion parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_summary_completion is not implemented yet")

def parse_summary_completion_word_list(questionTask=None):
    """Placeholder for the summary-completion (word list) parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_summary_completion_word_list is not implemented yet")

def parse_table_completion(questionTask=None):
    """Placeholder for the table-completion parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_table_completion is not implemented yet")

def parse_note_completion(questionTask=None):
    """Placeholder for the note-completion parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_note_completion is not implemented yet")


# Matching questions
def parse_matching_features(questionTask):
    """Parse a matching-features task: numbered statements, lettered options, answers.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``, ``question_list_title``,
            ``question_list_of_options``, ``question_img_path``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText``, ``questionListTitle``, ``questionListOptions`` and
        ``questionItems``. Options are ``(letter, text)`` tuples; every item
        holds ``questionNumber``, ``questionText``, ``questionOptions`` (the
        same options list) and ``correctAnswer``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a paired question or answer line does not start with a
            number followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    correct_answer = questionTask['correct_answer'].strip()
    question_list_title = clean_text_single_line(questionTask['question_list_title'].strip())
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_img_path', 'example_answer'):
        questionTask[unused_field].strip()

    # Questions and answers share one line format. Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    numbered_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')
    # Ex: A    Roger Angel\n\nB    Phil Rasch
    question_option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)')

    question_option_items = question_option_item_pattern.findall(question_list_of_options)

    # Blank / whitespace-only lines carry no item, so drop them before pairing.
    question_lines = [line.strip() for line in re.split(r'\n+', question_main_text) if line.strip()]
    answer_lines = [line.strip() for line in re.split(r'\n+', correct_answer) if line.strip()]

    # Idea: each matching question item contains question, list of choices and correct answer.
    # zip() pairs lines by position and stops at the shorter list.
    question_items = []
    for question_line, answer_line in zip(question_lines, answer_lines):
        question_match = numbered_line_pattern.match(question_line)
        if question_match is None:
            raise ValueError("Cannot parse question_main_text line: {!r}".format(question_line))
        answer_match = numbered_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(question_match.group(1).strip()),
            "questionText": question_match.group(2).strip(),
            "questionOptions": question_option_items,  # option is a tuple but correct answer might be a string
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

def parse_matching_headings(questionTask=None):
    """Placeholder for the matching-headings parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_matching_headings is not implemented yet")

def parse_matching_sentence_endings(questionTask=None):
    """Placeholder for the matching-sentence-endings parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_matching_sentence_endings is not implemented yet")

def parse_true_false_not_given(questionTask):
    """Parse a TRUE / FALSE / NOT GIVEN task into numbered statements with answers.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``, ``question_img_path``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText`` and ``questionItems``; every item holds
        ``questionNumber``, ``questionText``, ``questionOptions`` and
        ``correctAnswer``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a paired question or answer line does not start with a
            number followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    # Clean the display texts the same way the other task parsers do.
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    correct_answer = questionTask['correct_answer'].strip()

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_img_path', 'question_list_title',
                         'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Questions and answers share one line format. Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    numbered_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Blank / whitespace-only lines carry no item, so drop them before pairing.
    question_lines = [line.strip() for line in re.split(r'\n+', question_main_text) if line.strip()]
    answer_lines = [line.strip() for line in re.split(r'\n+', correct_answer) if line.strip()]

    # Idea: each TFNG question item contains question, list of choices (TFNG) and correct answer.
    # zip() pairs lines by position and stops at the shorter list.
    question_items = []
    for question_line, answer_line in zip(question_lines, answer_lines):
        question_match = numbered_line_pattern.match(question_line)
        if question_match is None:
            raise ValueError("Cannot parse question_main_text line: {!r}".format(question_line))
        answer_match = numbered_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(question_match.group(1).strip()),
            "questionText": question_match.group(2).strip(),
            "questionOptions": ["TRUE", "FALSE", "NOT GIVEN"],  # Always the same
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_yes_no_not_given(questionTask):
    """Parse a YES / NO / NOT GIVEN task into numbered statements with answers.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``, ``question_img_path``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText`` and ``questionItems``; every item holds
        ``questionNumber``, ``questionText``, ``questionOptions`` and
        ``correctAnswer``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a paired question or answer line does not start with a
            number followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    # Clean the display texts the same way the other task parsers do.
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    correct_answer = questionTask['correct_answer'].strip()

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_img_path', 'question_list_title',
                         'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Questions and answers share one line format. Ex: 8. NOT GIVEN\n\n9. YES\n\n
    numbered_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Blank / whitespace-only lines carry no item, so drop them before pairing.
    question_lines = [line.strip() for line in re.split(r'\n+', question_main_text) if line.strip()]
    answer_lines = [line.strip() for line in re.split(r'\n+', correct_answer) if line.strip()]

    # Idea: each YNNG question item contains question, list of choices (YNNG) and correct answer.
    # zip() pairs lines by position and stops at the shorter list.
    question_items = []
    for question_line, answer_line in zip(question_lines, answer_lines):
        question_match = numbered_line_pattern.match(question_line)
        if question_match is None:
            raise ValueError("Cannot parse question_main_text line: {!r}".format(question_line))
        answer_match = numbered_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(question_match.group(1).strip()),
            "questionText": question_match.group(2).strip(),
            "questionOptions": ["YES", "NO", "NOT GIVEN"],  # Always the same
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }


# Choice questions
def parse_multiple_choice_select_one(questionTask=None):
    """Placeholder for the single-answer multiple-choice parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_multiple_choice_select_one is not implemented yet")

def parse_multiple_choice_select_many(questionTask=None):
    """Placeholder for the multi-answer multiple-choice parser (not implemented yet).

    Accepts the task like the other ``parse_*`` functions so that calling it
    with a task fails with an explicit error instead of a signature TypeError.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("parse_multiple_choice_select_many is not implemented yet")




In [8]:
def parse_reading_from_yaml(filename):
    """Load one reading YAML file and parse its three top-level sections.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        dict with ``readingInfo``, ``passageContent`` and ``questionContent``,
        produced by ``parse_reading_info``, ``parse_passage_content`` and
        ``parse_question_content`` respectively.

    Raises:
        KeyError: If one of the top-level sections is missing.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, and the source files contain non-ASCII punctuation.
    with open(filename, 'r', encoding='utf-8') as f:
        raw_data = yaml.safe_load(f)

    # NOTE(review): other cells in this notebook read the task list from
    # raw_data["question"]; confirm which top-level keys the YAML files use.
    return {
        "readingInfo": parse_reading_info(raw_data["reading_info"]),
        "passageContent": parse_passage_content(raw_data["passage_content"]),
        "questionContent": parse_question_content(raw_data["question_content"]),
    }

def parse_reading_info(reading_info_data):
    """Build the reading-level header fields.

    Args:
        reading_info_data: Mapping with the raw ``reading_title`` and
            ``reading_subtitle`` strings.

    Returns:
        dict with ``readingTitle`` and ``readingSubtitle``, each stripped and
        collapsed onto a single line.

    Raises:
        KeyError: If either field is missing.
    """
    # Strip first (as the task parsers do) so a trailing newline in the raw
    # value does not turn into a trailing space.
    # NOTE(review): the title key used to be spelled "raedingTitle"; update any
    # consumer that still reads the misspelled key.
    return {
        "readingTitle": clean_text_single_line(reading_info_data["reading_title"].strip()),
        "readingSubtitle": clean_text_single_line(reading_info_data["reading_subtitle"].strip()),
    }

def parse_passage_content(passage_data):
    """Build the passage fields (context, title, subtitle, main text).

    Args:
        passage_data: Mapping with the raw ``passage_context``,
            ``passage_title``, ``passage_subtitle`` and ``passage_main_text``
            strings.

    Returns:
        dict with ``passageContext``, ``passageTitle`` and ``passageSubtitle``
        (stripped, collapsed onto one line) and ``passageMainText`` (stripped,
        consecutive newlines squeezed to one).

    Raises:
        KeyError: If one of the fields is missing.
    """
    # Strip first (as the task parsers do) so a trailing newline in the raw
    # value does not turn into a trailing space / dangling newline.
    return {
        "passageContext": clean_text_single_line(passage_data["passage_context"].strip()),
        "passageTitle": clean_text_single_line(passage_data["passage_title"].strip()),
        "passageSubtitle": clean_text_single_line(passage_data["passage_subtitle"].strip()),
        "passageMainText": clean_text_paragraph(passage_data["passage_main_text"].strip()),
    }

def parse_question_content(question_data):
    """Parse every raw task in ``question_data`` and return the results in order.

    Each element is handed to ``parse_question_task``.
    """
    return [parse_question_task(raw_task) for raw_task in question_data]

def parse_question_task(questionTask):
    """Dispatch one raw task to the parser registered for its ``task_type``.

    Args:
        questionTask: Mapping describing one task; its ``task_type`` string
            selects the parser.

    Returns:
        Whatever the selected ``parse_*`` function returns.

    Raises:
        ValueError: If ``task_type`` has no registered parser.
    """
    parser_functions = {
        "multiple_choice_select_one": parse_multiple_choice_select_one,
        "multiple_choice_select_many": parse_multiple_choice_select_many,
        "diagram_completion": parse_diagram_completion,
        "flow_chart_completion": parse_flow_chart_completion,
        "sentence_completion": parse_sentence_completion,
        "summary_completion": parse_summary_completion,
        "summary_completion_word_list": parse_summary_completion_word_list,
        "table_completion": parse_table_completion,
        "note_completion": parse_note_completion,
        "matching_features": parse_matching_features,
        "matching_headings": parse_matching_headings,
        "matching_sentence_endings": parse_matching_sentence_endings,
        "true_false_not_given": parse_true_false_not_given,
        "yes_no_not_given": parse_yes_no_not_given
    }
    # Raw values can end with a newline (e.g. 'diagram_completion\n'); the
    # parsers strip it themselves, so normalise before the lookup too.
    # NOTE(review): sample data in this notebook spells one type
    # 'true_false_notgiven', which is not a key above - confirm the spelling.
    task_type = questionTask["task_type"].strip()
    if task_type not in parser_functions:
        raise ValueError("Invalid question type: {!r}".format(task_type))
    return parser_functions[task_type](questionTask)


In [9]:
# Relative paths of the YAML input files, one variable per file.
cam11_test1_1 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-1.yaml"
cam11_test1_2 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-2.yaml"
cam11_test1_3 = "../components/assets/yaml/cam-11-test-1/cam-11-test-1-3.yaml"
cam11_test2_1 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-1.yaml"
cam11_test2_2 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-2.yaml"
cam11_test2_3 = "../components/assets/yaml/cam-11-test-2/cam-11-test-2-3.yaml"

# Load one file and show its raw "question" list (the last expression is the cell output).
raw_data = load_yaml_file(cam11_test1_2)
raw_data["question"]

[{'task_type': 'true_false_notgiven\n',
  'task_description': 'Questions 14-19\nDo the following statements agree with the information given in Reading Passage 2?\n\nIn boxes 14-19 on your answer sheet, write\n\nTRUE                if the statement agrees with the information\n\nFALSE               if the statement contradicts the information\n\nNOT GIVEN     if there is no information on this\n',
  'question_main_title': '',
  'question_main_text': '14   The Falkirk Wheel has linked the Forth & Clyde Canal with the Union Canal for the first time in their history.\n\n15   There was some opposition to the design of the Falkirk Wheel at first.\n\n16   The Falkirk Wheel was initially put together at the location where its components were manufactured.\n\n17   The Falkirk Wheel is the only boat lift in the world which has steel sections bolted together by hand.\n\n18   The weight of the gondolas varies according to the size of boat being carried.\n\n19   The construction of the Falkirk Whe

### Sentence Completion

In [10]:
def parse_sentence_completion(questionTask):
    """Parse a sentence-completion task into display fields and an answer key.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``, ``question_img_path``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText`` and ``questionItems``; every item is
        ``{"questionNumber": int, "correctAnswer": str}``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a non-blank answer line does not start with a number
            followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    correct_answer = questionTask['correct_answer'].strip()

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_img_path', 'question_list_title',
                         'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Ex: 1. tomatoes 2. urban centres/ centers
    answer_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Idea: correct-answer-based question items, one item per answer line.
    question_items = []
    for answer_line in re.split(r'\n+', correct_answer):
        answer_line = answer_line.strip()
        if not answer_line:
            # An empty answer key (or a whitespace-only line) yields no item.
            continue

        answer_match = answer_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(answer_match.group(1).strip()),
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }
    
# Try the parser on the first task of the first file; the parsed dict is the cell output.
questionTask = load_yaml_file(cam11_test1_1)["question"][0]
parse_sentence_completion(questionTask)



{'taskType': 'sentence_completion',
 'taskDescription': 'Questions 1-7\nComplete the sentences below.\nChoose NO MORE THAN TWO WORDS from the passage for each answer.\nWrite your answers in boxes 1-7 on your answer sheet.',
 'questionMainTitle': 'Indoor farming',
 'questionMainText': '1    Some food plants, including……………… are already grown indoors.\n2    Vertical farms would be located in………………, meaning that there would be less need to take them long distances to customers.\n3    Vertical farms could use methane from plants and animals to produce………………..\n4    The consumption of………………… would be cut because agricultural vehicles would be unnecessary.\n5    The fact that vertical farms would need……………….. light is a disadvantage.\n6    One form of vertical farming involves planting in……………….. which are not fixed.\n7    The most probable development is that food will be grown on………………… in towns and cities.',
 'questionItems': [{'questionNumber': 1, 'correctAnswer': 'tomatoes'},
  {'questi

### True False Not Given & Yes No Not Given

In [11]:
def parse_true_false_not_given(questionTask):
    """Parse a TRUE / FALSE / NOT GIVEN task into numbered statements with answers.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``, ``question_img_path``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText`` and ``questionItems``; every item holds
        ``questionNumber``, ``questionText``, ``questionOptions`` and
        ``correctAnswer``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a paired question or answer line does not start with a
            number followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    correct_answer = questionTask['correct_answer'].strip()

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_img_path', 'question_list_title',
                         'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Questions and answers share one line format. Ex: 8. NOT GIVEN\n\n9. TRUE\n\n
    numbered_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Blank / whitespace-only lines carry no item, so drop them before pairing.
    question_lines = [line.strip() for line in re.split(r'\n+', question_main_text) if line.strip()]
    answer_lines = [line.strip() for line in re.split(r'\n+', correct_answer) if line.strip()]

    # Idea: each TFNG question item contains question, list of choices (TFNG) and correct answer.
    # zip() pairs lines by position and stops at the shorter list.
    question_items = []
    for question_line, answer_line in zip(question_lines, answer_lines):
        question_match = numbered_line_pattern.match(question_line)
        if question_match is None:
            raise ValueError("Cannot parse question_main_text line: {!r}".format(question_line))
        answer_match = numbered_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(question_match.group(1).strip()),
            "questionText": question_match.group(2).strip(),
            "questionOptions": ["TRUE", "FALSE", "NOT GIVEN"],  # Always the same
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }

def parse_yes_no_not_given(questionTask):
    """Parse a YES / NO / NOT GIVEN task into numbered statements with answers.

    The task is taken as an explicit argument (like every other ``parse_*``
    function) rather than read from a notebook-level ``questionTask`` variable.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``, ``question_img_path``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText`` and ``questionItems``; every item holds
        ``questionNumber``, ``questionText``, ``questionOptions`` and
        ``correctAnswer``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a paired question or answer line does not start with a
            number followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    correct_answer = questionTask['correct_answer'].strip()

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_img_path', 'question_list_title',
                         'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Questions and answers share one line format. Ex: 8. NOT GIVEN\n\n9. YES\n\n
    numbered_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Blank / whitespace-only lines carry no item, so drop them before pairing.
    question_lines = [line.strip() for line in re.split(r'\n+', question_main_text) if line.strip()]
    answer_lines = [line.strip() for line in re.split(r'\n+', correct_answer) if line.strip()]

    # Idea: each YNNG question item contains question, list of choices (YNNG) and correct answer.
    # zip() pairs lines by position and stops at the shorter list.
    question_items = []
    for question_line, answer_line in zip(question_lines, answer_lines):
        question_match = numbered_line_pattern.match(question_line)
        if question_match is None:
            raise ValueError("Cannot parse question_main_text line: {!r}".format(question_line))
        answer_match = numbered_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(question_match.group(1).strip()),
            "questionText": question_match.group(2).strip(),
            "questionOptions": ["YES", "NO", "NOT GIVEN"],  # Always the same
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionItems": question_items,
    }


# Show the raw second task of the first file, then its parsed form as the cell output.
questionTask = load_yaml_file(cam11_test1_1)["question"][1]
display(questionTask)
parse_true_false_not_given(questionTask)

    

{'task_type': 'true_false_notgiven\n',
 'task_description': 'Questions 8-13\nDo the following statements agree with the information given in Reading Passage?\n\nIn boxes 8-13 on your answer sheet, write\n\nTRUE               if the statement agrees with the information\n\nFALSE              if the statement contradicts the information\n\nNOT GIVEN    if there is no information on this\n',
 'question_main_title': '',
 'question_main_text': '8   Methods for predicting the Earth’s population have recently changed.\n\n9   Human beings are responsible for some of the destruction to food-producing land.\n\n10   The crops produced in vertical farms will depend on the season.\n\n11   Some damage to food crops is caused by climate change.\n\n12   Fertilisers will be needed for certain crops in vertical farms.\n\n13   Vertical farming will make plants less likely to be affected by infectious diseases.\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_of_options': '',
 'ex

{'taskType': 'true_false_notgiven',
 'taskDescription': 'Questions 8-13\nDo the following statements agree with the information given in Reading Passage?\nIn boxes 8-13 on your answer sheet, write\nTRUE               if the statement agrees with the information\nFALSE              if the statement contradicts the information\nNOT GIVEN    if there is no information on this',
 'questionMainTitle': '',
 'questionMainText': '8   Methods for predicting the Earth’s population have recently changed.\n9   Human beings are responsible for some of the destruction to food-producing land.\n10   The crops produced in vertical farms will depend on the season.\n11   Some damage to food crops is caused by climate change.\n12   Fertilisers will be needed for certain crops in vertical farms.\n13   Vertical farming will make plants less likely to be affected by infectious diseases.',
 'questionItems': [{'questionNumber': 8,
   'questionText': 'Methods for predicting the Earth’s population have recently 

### Diagram & Flow Chart Completion

In [12]:
def parse_diagram_completion(questionTask):
    """Parse a diagram-completion task into display fields and an answer key.

    Args:
        questionTask: Mapping with the raw string fields of one task
            (``task_type``, ``task_description``, ``question_main_title``,
            ``question_main_text``, ``question_img_path``, ``correct_answer``,
            ``question_list_title``, ``question_list_of_options``,
            ``example_answer``).

    Returns:
        dict with ``taskType``, ``taskDescription``, ``questionMainTitle``,
        ``questionMainText``, ``questionImgPath`` and ``questionItems``; every
        item is ``{"questionNumber": int, "correctAnswer": str}``.

    Raises:
        KeyError: If one of the expected fields is missing.
        ValueError: If a non-blank answer line does not start with a number
            followed by a separator.
    """
    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = clean_text_multiple_line(questionTask['task_description'].strip())

    # Main content
    question_main_title = clean_text_single_line(questionTask['question_main_title'].strip())
    question_main_text = clean_text_paragraph(questionTask['question_main_text'].strip())
    question_img_path = questionTask['question_img_path'].strip()  # Most important
    correct_answer = clean_text_multiple_line(questionTask['correct_answer'].strip())

    # Not used by this task type; still read so a missing field raises KeyError.
    for unused_field in ('question_list_title', 'question_list_of_options', 'example_answer'):
        questionTask[unused_field].strip()

    # Ex: 1. tomatoes 2. urban centres/ centers
    answer_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')

    # Idea: correct-answer-based question items, one item per answer line.
    question_items = []
    for answer_line in re.split(r'\n+', correct_answer):
        answer_line = answer_line.strip()
        if not answer_line:
            # An empty answer key (or a whitespace-only line) yields no item.
            continue

        answer_match = answer_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError("Cannot parse correct_answer line: {!r}".format(answer_line))

        question_items.append({
            "questionNumber": int(answer_match.group(1).strip()),
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionImgPath": question_img_path,  # Most important
        "questionItems": question_items,
    }

def parse_flow_chart_completion(questionTask):
    """Parse a flow-chart-completion task.

    Delegates to ``parse_diagram_completion`` and returns its result unchanged.
    """
    parsed_task = parse_diagram_completion(questionTask)
    return parsed_task

# Show the raw second task of the second file, then its parsed form as the cell output.
questionTask = load_yaml_file(cam11_test1_2)["question"][1]
display(questionTask)
parse_diagram_completion(questionTask)
# parse_true_false_not_given(questionTask)

{'task_type': 'diagram_completion\n',
 'task_description': 'Questions 20-26\nLabel the diagram below.\n\nChoose ONE WORD from the passage for each answer.\n\nWrite your answers in boxes 20-26 on your answer sheet.\n',
 'question_main_title': 'How a boat is lifted on the Falkirk Wheel\n',
 'question_main_text': '',
 'question_img_path': 'https://ieltstrainingonline.com/wp-content/uploads/2020/07/11-1-2-IELTS-Reading-q14-26-980x663.jpg\n',
 'question_list_title': '',
 'question_list_of_options': '',
 'example_answer': '',
 'correct_answer': '20. gates\n\n21. clamp\n\n22. axle\n\n23. cogs\n\n24. aqueduct\n\n25. wall\n\n26. locks'}

{'taskType': 'diagram_completion',
 'taskDescription': 'Questions 20-26\nLabel the diagram below.\nChoose ONE WORD from the passage for each answer.\nWrite your answers in boxes 20-26 on your answer sheet.',
 'questionMainTitle': 'How a boat is lifted on the Falkirk Wheel',
 'questionMainText': '',
 'questionImgPath': 'https://ieltstrainingonline.com/wp-content/uploads/2020/07/11-1-2-IELTS-Reading-q14-26-980x663.jpg',
 'questionItems': [{'questionNumber': 20, 'correctAnswer': 'gates'},
  {'questionNumber': 21, 'correctAnswer': 'clamp'},
  {'questionNumber': 22, 'correctAnswer': 'axle'},
  {'questionNumber': 23, 'correctAnswer': 'cogs'},
  {'questionNumber': 24, 'correctAnswer': 'aqueduct'},
  {'questionNumber': 25, 'correctAnswer': 'wall'},
  {'questionNumber': 26, 'correctAnswer': 'locks'}]}

### Matching Features

In [14]:
def parse_matching_features(questionTask):
    """Parse a "matching features" task into a structured dictionary.

    Each numbered statement in ``question_main_text`` is paired, by position,
    with the numbered line at the same position in ``correct_answer``. Every
    resulting question item carries its own copy of the option list parsed
    from ``question_list_of_options``.

    Args:
        questionTask: Mapping with string values under the keys ``task_type``,
            ``task_description``, ``question_main_title``,
            ``question_main_text``, ``correct_answer``,
            ``question_list_title`` and ``question_list_of_options``.

    Returns:
        dict with the keys ``taskType``, ``taskDescription``,
        ``questionMainTitle``, ``questionMainText``, ``questionListTitle``,
        ``questionListOptions`` (list of ``(letter, text)`` tuples) and
        ``questionItems`` (list of dicts with ``questionNumber``,
        ``questionText``, ``questionOptions`` and ``correctAnswer``).
        ``questionItems`` is empty when there are no question/answer lines.

    Raises:
        KeyError: If one of the keys listed above is missing.
        ValueError: If a paired question or answer line does not start with a
            number followed by a separator.

    Warns:
        UserWarning: If the number of question lines and answer lines differ
            (the unpaired lines are ignored), or if a paired question and
            answer carry different numbers.
    """
    import warnings  # local import: only needed for the diagnostics below

    # Task description
    task_type = questionTask['task_type'].strip()
    task_description = questionTask['task_description'].strip()

    # Main content
    question_main_title = questionTask['question_main_title'].strip()
    question_main_text = questionTask['question_main_text'].strip()
    raw_correct_answer = questionTask['correct_answer'].strip()
    question_list_title = questionTask['question_list_title'].strip()
    question_list_of_options = questionTask['question_list_of_options'].strip()

    # Clean text
    task_description = clean_text_multiple_line(task_description)
    question_main_title = clean_text_single_line(question_main_title)
    question_main_text = clean_text_paragraph(question_main_text)
    question_list_title = clean_text_single_line(question_list_title)

    # Split into stripped, non-blank lines. Dropping blanks matters because
    # re.split on an empty string yields [''], which cannot match the numbered
    # pattern below.
    question_lines = [
        line.strip() for line in re.split(r'\n+', question_main_text) if line.strip()
    ]
    answer_lines = [
        line.strip() for line in re.split(r'\n+', raw_correct_answer) if line.strip()
    ]

    # Questions and answers share one shape: a number, a separator, the text.
    # Ex (question): 37   The effects of geo-engineering may not be long-lasting.
    # Ex (answer):   37. B
    numbered_line_pattern = re.compile(r'(\d+)[^a-zA-Z\d\(\)\-\+:]+(.*)')
    # Ex: A    Roger Angel\n\nB    Phil Rasch
    option_item_pattern = re.compile(r'([A-Z])[^a-zA-Z\d\(\)\-\+:]+(.*)')

    question_option_items = [
        (letter, text.strip())
        for letter, text in option_item_pattern.findall(question_list_of_options)
    ]

    if len(question_lines) != len(answer_lines):
        # zip() below stops at the shorter list; make the dropped lines visible.
        warnings.warn(
            "matching_features: {} question line(s) but {} answer line(s); "
            "unpaired lines are ignored".format(len(question_lines), len(answer_lines)),
            stacklevel=2,
        )

    # Idea: Each matching question item contains question, list of choices and correct answer
    question_items = []
    for question_line, answer_line in zip(question_lines, answer_lines):
        question_match = numbered_line_pattern.match(question_line)
        if question_match is None:
            raise ValueError(
                "Cannot parse question line (expected '<number> <text>'): {!r}".format(question_line)
            )
        answer_match = numbered_line_pattern.match(answer_line)
        if answer_match is None:
            raise ValueError(
                "Cannot parse answer line (expected '<number>. <answer>'): {!r}".format(answer_line)
            )

        question_number = int(question_match.group(1))
        answer_number = int(answer_match.group(1))
        if question_number != answer_number:
            warnings.warn(
                "matching_features: question {} is paired with answer {}".format(
                    question_number, answer_number
                ),
                stacklevel=2,
            )

        question_items.append({
            "questionNumber": question_number,
            "questionText": question_match.group(2).strip(),
            # Fresh list per item so editing one item's options cannot leak
            # into the others or into questionListOptions.
            "questionOptions": list(question_option_items),
            "correctAnswer": answer_match.group(2).strip(),
        })

    return {
        "taskType": task_type,
        "taskDescription": task_description,
        "questionMainTitle": question_main_title,
        "questionMainText": question_main_text,
        "questionListTitle": question_list_title,
        "questionListOptions": question_option_items,
        "questionItems": question_items,
    }

# Try the matching-features parser on the task at index 2 of the "question"
# list loaded from the YAML file referenced by `cam11_test1_3`.
# NOTE: this rebinds the notebook-global `questionTask`.
questionTask = load_yaml_file(cam11_test1_3)["question"][2]
display(questionTask)  # raw task dict, shown for comparison with the parsed result
parse_matching_features(questionTask)  # last expression: rendered as the cell output


{'task_type': 'matching_features\n',
 'task_description': 'Questions 37-40\nLook at the following statements (Questions 37-40) and the list of scientists below.\n\nMatch each statement with the correct scientist, A-D.\n\nWrite the correct letter, A-D, in boxes 37-40 on your answer sheet.\n',
 'question_main_title': '',
 'question_main_text': '37   The effects of geo-engineering may not be long-lasting.\n\n38   Geo-engineering is a topic worth exploring.\n\n39   It may be necessary to limit the effectiveness of geo-engineering projects.\n\n40   Research into non-fossil-based fuels cannot be replaced by geo-engineering.\n',
 'question_img_path': '',
 'question_list_title': 'List of Scientists\n',
 'question_list_of_options': 'A    Roger Angel\n\nB    Phil Rasch\n\nC    Dan Lunt\n\nD    Martin Sommerkorn\n',
 'example_answer': '',
 'correct_answer': '37. B\n\n38. D\n\n39. C\n\n40. A'}

{'taskType': 'matching_features',
 'taskDescription': 'Questions 37-40\nLook at the following statements (Questions 37-40) and the list of scientists below.\nMatch each statement with the correct scientist, A-D.\nWrite the correct letter, A-D, in boxes 37-40 on your answer sheet.',
 'questionMainTitle': '',
 'questionMainText': '37   The effects of geo-engineering may not be long-lasting.\n38   Geo-engineering is a topic worth exploring.\n39   It may be necessary to limit the effectiveness of geo-engineering projects.\n40   Research into non-fossil-based fuels cannot be replaced by geo-engineering.',
 'questionListTitle': 'List of Scientists',
 'questionListOptions': [('A', 'Roger Angel'),
  ('B', 'Phil Rasch'),
  ('C', 'Dan Lunt'),
  ('D', 'Martin Sommerkorn')],
 'questionItems': [{'questionNumber': 37,
   'questionText': 'The effects of geo-engineering may not be long-lasting.',
   'questionOptions': [('A', 'Roger Angel'),
    ('B', 'Phil Rasch'),
    ('C', 'Dan Lunt'),
    ('D', 'Ma

In [15]:
# Inspect the raw task at index 1 of the "question" list loaded from the YAML
# file referenced by `cam11_test1_3`; the recorded output shows a
# `table_completion` task. NOTE: this rebinds the notebook-global `questionTask`.
questionTask = load_yaml_file(cam11_test1_3)["question"][1]
display(questionTask)

{'task_type': 'table_completion\n',
 'task_description': 'Questions 30-36\nComplete the table below.\n\nChoose ONE WORD from the passage for each answer.\n\nWrite your answers in boxes 30-36 on your answer sheet.\n',
 'question_main_title': 'GEO-ENGINEERING PROJECTS\n',
 'question_main_text': 'Procedure | Aim\n\nput a large number of tiny spacecraft into orbit far above Earth | to create a 30………….. that would reduce the amount of light reaching Earth\n\nplace 31…………… in the sea | to encourage 32…………… to form\n\nrelease aerosol sprays into the stratosphere | to create 33……………. that would reduce the amount of light reaching Earth\n\nfix strong 34…………… to Greenland ice sheets | to prevent icebergs moving into the sea\n\nplant trees in Russian Arctic that would lose their leaves in winter | to allow the 35…………… to reflect radiation\n\nchange the direction of 36…………… | to bring more cold water into ice-forming areas\n',
 'question_img_path': '',
 'question_list_title': '',
 'question_list_o