In [1]:
import re
import json
import yaml

## 1. MCQ

In [3]:
raw_passage_title = """\
Academic Reading Sample Task – Multiple Choice: 
one answer
"""
raw_passage_subtitle = """\
[Note: This is an extract from a Part 1 text about older people in the workforce.]
"""
raw_passage_text = """\
The general assumption is that older workers are paid more in spite of, rather than because of, 
their productivity. That might partly explain why, when employers are under pressure to cut costs, 
they persuade a 55-year old to take early retirement. Take away seniority-based pay scales, and 
older workers may become a much more attractive employment proposition. But most employers 
and many workers are uncomfortable with the idea of reducing someone’s pay in later life –
although manual workers on piece-rates often earn less as they get older. So retaining the 
services of older workers may mean employing them in different ways.

One innovation was devised by IBM Belgium. Faced with the need to cut staff costs, and having 
decided to concentrate cuts on 55 to 60-year olds, IBM set up a separate company called Skill 
Team, which re-employed any of the early retired who wanted to go on working up to the age of 
60. An employee who joined Skill Team at the age of 55 on a five-year contract would work for 
58% of his time, over the full period, for 88% of his last IBM salary. The company offered services 
to IBM, thus allowing it to retain access to some of the intellectual capital it would otherwise have 
lost.

The best way to tempt the old to go on working may be to build on such ‘bridge’ jobs: part- time or 
temporary employment that creates a more gradual transition from full-time work to retirement. 
Studies have found that, in the United States, nearly half of all men and women who had been in 
full-time jobs in middle age moved into such ‘bridge’ jobs at the end of their working lives. In 
general, it is the best-paid and worst-paid who carry on working. There seem to be two very 
different types of bridge job-holder – those who continue working because they have to and those 
who continue working because they want to, even though they could afford to retire.

If the job market grows more flexible, the old may find more jobs that suit them. Often, they will be 
self-employed. Sometimes, they may start their own businesses: a study by David Storey of 
Warwick University found that in Britain 70% of businesses started by people over 55 survived, 
compared with an overall national average of only 19%. But whatever pattern of employment they 
choose, in the coming years the skills of these ‘grey workers’ will have to be increasingly 
acknowledged and rewarded.
"""

raw_question_instructions = """\
Questions 1 – 4
Choose the correct letter, A, B, C or D.
Write the correct letter in boxes 1-4 on your answer sheet.
"""

raw_questions_text= """\
1 In paragraph one, the writer suggests that companies could consider
A abolishing pay schemes that are based on age.
B avoiding pay that is based on piece-rates.
C increasing pay for older workers.
D equipping older workers with new skills.
2 Skill Team is an example of a company which
A offers older workers increases in salary.
B allows people to continue working for as long as they want.
C allows the expertise of older workers to be put to use.
D treats older and younger workers equally.
3 According to the writer, ‘bridge’ jobs
A tend to attract people in middle-salary ranges.
B are better paid than some full-time jobs.
C originated in the United States.
D appeal to distinct groups of older workers.
4 David Storey’s study found that
A people demand more from their work as they get older.
B older people are good at running their own businesses.
C an increasing number of old people are self-employed.
D few young people have their own businesses.
"""

raw_answers_text = """\
1 A ■ abolishing pay schemes that are based on age
2 C ■ allows the expertise of older workers to be put to use
3 D ■ appeal to distinct groups of older workers
4 B ■ older people are good at running their own businesses
"""

In [3]:
def parse_questions_multiple_choice_select_one(raw_question_instructions, raw_questions_text, raw_answers_text):
    question_type = "multiple_choice"
    questions = []
    question_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')
    option_pattern = re.compile(r'([A-Z])[^a-zA-Z\d:]+(.*)')
    answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+([A-Z])[^a-zA-Z\d:]+(.*)')
    
    #  We will say that any numeric following a new line and followed by a space is a question number
    raw_questions_list = re.split(r'\n(?=\d+\s)', raw_questions_text)
    raw_answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    for q,a in zip(raw_questions_list, raw_answers_list):
        match = question_pattern.match(q.strip())
        if match:
            question_number = int(match.group(1))
            question_text = match.group(2).strip()
            question_options = list(map(lambda x: option_pattern.match(x.strip()).groups(),q.splitlines()[1:]))
            correct_answer = answer_pattern.match(a.strip()).groups()[1:]
            questions.append({
                "questionNumber": question_number,
                "questionText": question_text,
                "questionOptions": question_options,
                "correctAnswer": correct_answer
            })
    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "questions": questions
    }



In [4]:
def parse_questions_table_completion(question_task):
    question_type = "table_completion"
    raw_question_instructions, raw_questions_text, raw_answers_text = question_task['raw_question_instructions'], question_task['raw_questions_text'], question_task['raw_answers_text']
    questions = []
    answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')
    
    #  We will say that any numeric following a new line and followed by a space is a question number
    raw_tablerows_list = re.split(r'\n', raw_questions_text.strip())
    tablerows_list = [[r.strip() for r in re.split(r'\|',row) ] for row in raw_tablerows_list]
    answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    for a in answers_list:
        match = answer_pattern.match(a.strip())
        question_number = int(match.group(1))
        correct_answer = answer_pattern.match(a.strip()).group(2)
        questions.append({
            "questionNumber": question_number,
            "correctAnswer": correct_answer
        })
    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "questionContent": tablerows_list,
        "questions": questions
    }

In [5]:
def parse_questions_matching_headings(question_task):
    question_type = "matching_headings"
    raw_question_instructions,raw_headings_text, raw_examples_text, raw_questions_text, raw_answers_text =\
        question_task['raw_question_instructions'],question_task['raw_headings_text'],\
        question_task['raw_examples_text'], question_task['raw_questions_text'], question_task['raw_answers_text']

    heading_pattern = re.compile(r'([ivx]+)[^a-zA-Z\d:]+(.*)')
    question_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')
    answer_pattern = re.compile(r'\d+[^a-zA-Z\d:]+([ivx]+)[^a-zA-Z\d:]+(.*)')
    # example_pattern = re.compile(r'Example:[^a-zA-Z\d:]+(.*)[^a-zA-Z\d:]+(.*)')

    headings_list = re.findall(heading_pattern, raw_headings_text)
    raw_questions_list = re.split(r'\n(?=\d+\s)', raw_questions_text)
    raw_answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    questions = []
    for q,a in zip(raw_questions_list, raw_answers_list):
        match = question_pattern.match(q.strip())
        if match:
            question_number = int(match.group(1))
            question_text = match.group(2).strip()
            correct_answer = answer_pattern.match(a.strip()).groups()
            questions.append({
                "questionNumber": question_number,
                "questionText": question_text,
                "questionOptions": headings_list,
                "correctAnswer": correct_answer
            })
    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "headingsList": headings_list,
        "exampleAnswer": raw_examples_text,
        "questions": questions
    }


In [53]:
# filename = "matching_headings.json"
# with open(filename, 'w') as f:
#     f.write(parsed_json)

## Table Completion

In [None]:
# filename = "table_completion.yaml"
# with open(filename, 'r') as f:
#     raw_data = yaml.safe_load(f)

# print(raw_data.keys())

Matching Headings


In [None]:
filename = "matching_headings.yaml"
with open(filename, 'r') as f:
    raw_data = yaml.safe_load(f)

print(raw_data.keys())
print(raw_data['questions_list'][0].keys())
question_task = raw_data['questions_list'][0]

print(question_task['raw_headings_text'])
print(question_task['raw_questions_text'])

## Matching Features

In [13]:
filename = "../components/assets/yaml/matching_features.yaml"
with open(filename, 'r') as f:
    raw_data = yaml.safe_load(f)

raw_data

{'raw_task_title': 'Academic Reading Sample Task – Matching Features\n',
 'raw_task_subtitle': '[Note: This is an extract from an Academic Reading passage on the development of rockets. \nThe text preceding this extract explored the slow development of the rocket and explained \nthe principle of propulsion.]\n',
 'raw_passage_title': '',
 'raw_passage_subtitle': '',
 'raw_passage_text': "The invention of rockets is linked inextricably with the invention of 'black powder'. Most \nhistorians of technology credit the Chinese with its discovery. They base their belief on \nstudies of Chinese writings or on the notebooks of early Europeans who settled in or \nmade long visits to China to study its history and civilisation. It is probable that, some \ntime in the tenth century, black powder was first compounded from its basic ingredients \nof saltpetre, charcoal and sulphur. But this does not mean that it was immediately used \nto propel rockets. By the thirteenth century, powder-propelled f

In [6]:
def parse_questions_matching_features(question_task):
    question_type, raw_question_instructions,raw_table_title, raw_table_text, raw_examples_text, raw_questions_text, raw_answers_text =\
        question_task['question_type'], question_task['raw_question_instructions'],question_task['raw_table_title'],question_task['raw_table_text'],\
        question_task['raw_examples_text'], question_task['raw_questions_text'], question_task['raw_answers_text']

    matching_table_data_pattern = re.compile(r'([A-Z])[^a-zA-Z\d:]+(.*)\s+') # "A the Chinese \nB the Indians \nC the British"
    question_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)') # " '7 black powder\n8 rocket-propelled arrows for fighting\n9 rockets as war weapons"
    answer_pattern = re.compile(r'\d+[^a-zA-Z\d:]+(.*)') # '7 A\n8 A\n9 B\n10 E\n'
    # example_pattern = re.compile(r'Example:[^a-zA-Z\d:]+(.*)[^a-zA-Z\d:]+(.*)')

    matching_table_data_options = re.findall(matching_table_data_pattern, raw_table_text)
    raw_questions_list = re.split(r'\n(?=\d+\s)', raw_questions_text)
    raw_answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    #Strip newlines and spaces from the beginning and end of each list
    matching_table_data_options = [tuple(a.strip() for a in item) for item in matching_table_data_options]

    questions = []
    for q,a in zip(raw_questions_list, raw_answers_list):
        q,a = q.strip(), a.strip()
        match = question_pattern.match(q)
        if match:
            question_number = int(match.group(1))
            question_text = match.group(2).strip()
            correct_answer = answer_pattern.match(a).groups()
            questions.append({
                "questionNumber": question_number,
                "questionText": question_text,
                "questionOptions": matching_table_data_options,
                "correctAnswer": correct_answer
            })
    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "tableTitle": raw_table_title.strip(),
        "tableData": matching_table_data_options,
        "exampleAnswer": raw_examples_text,
        "questions": questions
    }

In [None]:
filename = "../components/assets/yaml/matching_features.yaml"
parsed_dict = parse_from_yaml_file(filename)
parsed_dict["taskTime"] = 600 # 10 minutes
parsed_json = json.dumps(parsed_dict, indent=1)
print(parsed_json)

filename = "../components/assets/json/matching_features.json"
with open(filename, 'w') as f:
    f.write(parsed_json)

## Matching Sentence Endings

In [7]:
filename = "../components/assets/yaml/matching_sentence_endings.yaml"
with open(filename, 'r') as f:
    raw_data = yaml.safe_load(f)

raw_data

{'raw_task_title': 'Academic Reading Sample Task – Matching Sentence Endings\n',
 'raw_task_subtitle': '',
 'raw_passage_context': '[Note: This is an extract from a Part 3 text about the scientific community in London in the \n1500s.]\n',
 'raw_passage_title': 'Science in 16th-century London\n',
 'raw_passage_subtitle': 'The Jewel House, a new book by historical researcher and author \nDeborah Harkness\n',
 'raw_passage_text': 'Deborah Harkness devotes her elegant \nand erudite new book, The Jewel House, to the \nscientific community in 16th-century London. \nShe (rightly) argues that it is thanks to the \nimaginative collective efforts of the urban\nscientists that London became the melting pot \nin which a new mathematical and \nexperimental culture crystallized.\nHarkness is known for her ingenuity as a \nresearcher and her historical empathy. In The \nJewel House, Harkness turns her skills on the \ncity of London as a whole with surprising and \nfascinating results. She began her r

In [8]:
def parse_questions_matching_sentence_endings(question_task):
    question_type, raw_question_instructions,raw_table_title, raw_table_text, raw_examples_text, raw_questions_text, raw_answers_text =\
        question_task['question_type'], question_task['raw_question_instructions'],question_task['raw_table_title'],question_task['raw_table_text'],\
        question_task['raw_examples_text'], question_task['raw_questions_text'], question_task['raw_answers_text']
    
    raw_table_title = raw_table_title.strip() if raw_table_title else "List of Sentence Endings"

    matching_table_data_pattern = re.compile(r'([A-Z])[^a-zA-Z\d:]+(.*)\s+') # "A the Chinese \nB the Indians \nC the British"
    question_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)') # " '7 black powder\n8 rocket-propelled arrows for fighting\n9 rockets as war weapons"
    answer_pattern = re.compile(r'\d+[^a-zA-Z\d:]+([A-Z])[^a-zA-Z\d:]+(.*)') # '7 A\n8 A\n9 B\n10 E\n'
    # example_pattern = re.compile(r'Example:[^a-zA-Z\d:]+(.*)[^a-zA-Z\d:]+(.*)')

    matching_table_data_options = re.findall(matching_table_data_pattern, raw_table_text)
    raw_questions_list = re.split(r'\n(?=\d+\s)', raw_questions_text)
    raw_answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    #Strip newlines and spaces from the beginning and end of each list
    matching_table_data_options = [tuple(a.strip() for a in item) for item in matching_table_data_options]

    questions = []
    for q,a in zip(raw_questions_list, raw_answers_list):
        q,a = q.strip(), a.strip()
        match = question_pattern.match(q)
        if match:
            question_number = int(match.group(1))
            question_text = match.group(2).strip()
            correct_answer = answer_pattern.match(a).groups()
            questions.append({
                "questionNumber": question_number,
                "questionText": question_text,
                "questionOptions": matching_table_data_options,
                "correctAnswer": correct_answer
            })
    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "tableTitle": raw_table_title.strip(),
        "tableData": matching_table_data_options,
        "exampleAnswer": raw_examples_text,
        "questions": questions
    }

In [45]:
filename = "../components/assets/yaml/matching_sentence_endings.yaml"
parsed_dict = parse_from_yaml_file(filename)
parsed_dict["taskTime"] = 600 # 10 minutes
parsed_json = json.dumps(parsed_dict, indent=1)
print(parsed_json)

filename = "../components/assets/json/matching_sentence_endings.json"
with open(filename, 'w') as f:
    f.write(parsed_json)

{
 "taskTitle": "Academic Reading Sample Task \u2013 Matching Sentence Endings",
 "taskSubtitle": "",
 "passageContext": "[Note: This is an extract from a Part 3 text about the scientific community in London in the 1500s.]",
 "passageTitle": "Science in 16th-century London",
 "passageSubtitle": "The Jewel House, a new book by historical researcher and author Deborah Harkness",
 "passageText": "Deborah Harkness devotes her elegant and erudite new book, The Jewel House, to the scientific community in 16th-century London. She (rightly) argues that it is thanks to the imaginative collective efforts of the urbanscientists that London became the melting pot in which a new mathematical and experimental culture crystallized.\nHarkness is known for her ingenuity as a researcher and her historical empathy. In The Jewel House, Harkness turns her skills on the city of London as a whole with surprising and fascinating results. She began her research by asking herself a new question: not what caused

## Note Completion

In [11]:
filename = "../components/assets/yaml/note_completion.yaml"
with open(filename, 'r') as f:
    raw_data = yaml.safe_load(f)

raw_data

{'raw_task_title': 'Academic Reading Sample Task – Note Completion\n',
 'raw_task_subtitle': '',
 'raw_passage_context': '[Note: This is an extract from a Part 1 text about the scientist Marie Curie.]\nAdapted with permission from Encyclopaedia Britannica, © 2007 by Encyclopaedia Britannica, \nInc.\n',
 'raw_passage_title': 'The life and work of Marie Curie\n',
 'raw_passage_subtitle': '',
 'raw_passage_text': 'The marriage of Pierre and Marie Curie in 1895 marked the start of a partnership that was soon to \nachieve results of world significance. Following Henri Becquerel’s discovery in 1896 of a new \nphenomenon, which Marie later called ‘radioactivity’, Marie Curie decided to find out if the \nradioactivity discovered in uranium was to be found in other elements. She discovered that this \nwas true for thorium.\nTurning her attention to minerals, she found her interest drawn to pitchblende, a mineral whose \nradioactivity, superior to that of pure uranium, could be explained only by

In [17]:
def parse_questions_note_completion(question_task):
    # question_task = raw_data['questions_list'][0]
    question_type, raw_question_instructions, raw_questions_title, raw_questions_text, raw_answers_text =\
        question_task['question_type'],question_task['raw_question_instructions'], question_task['raw_questions_title'], question_task['raw_questions_text'], question_task['raw_answers_text']
    questions = []
    answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')

    #  We will say that any numeric following a new line and followed by a space is a question number
    raw_tablerows_list = re.split(r'\n', raw_questions_text.strip())
    # tablerows_list = [[r.strip() for r in re.split(r'\|',row) ] for row in raw_tablerows_list]
    answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    for a in answers_list:
        match = answer_pattern.match(a.strip())
        question_number = int(match.group(1))
        correct_answer = answer_pattern.match(a.strip()).group(2)
        questions.append({
            "questionNumber": question_number,
            "correctAnswer": correct_answer
        })
    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "questionTitle": raw_questions_title,
        "questionContent": raw_tablerows_list,
        "questions": questions
    }



In [20]:
filename = "../components/assets/yaml/note_completion.yaml"
parsed_dict = parse_from_yaml_file(filename)
parsed_dict["taskTime"] = 600 # 10 minutes
parsed_json = json.dumps(parsed_dict, indent=1)
print(parsed_json)

filename = "../components/assets/json/note_completion.json"
with open(filename, 'w') as f:
    f.write(parsed_json)

{
 "taskTitle": "Academic Reading Sample Task \u2013 Note Completion",
 "taskSubtitle": "",
 "passageContext": "[Note: This is an extract from a Part 1 text about the scientist Marie Curie.]Adapted with permission from Encyclopaedia Britannica, \u00a9 2007 by Encyclopaedia Britannica, Inc.\n",
 "passageTitle": "The life and work of Marie Curie",
 "passageSubtitle": "",
 "passageText": "The marriage of Pierre and Marie Curie in 1895 marked the start of a partnership that was soon to achieve results of world significance. Following Henri Becquerel\u2019s discovery in 1896 of a new phenomenon, which Marie later called \u2018radioactivity\u2019, Marie Curie decided to find out if the radioactivity discovered in uranium was to be found in other elements. She discovered that this was true for thorium.\nTurning her attention to minerals, she found her interest drawn to pitchblende, a mineral whose radioactivity, superior to that of pure uranium, could be explained only by the presence in the 

Sentence Completion

In [21]:
filename = "../components/assets/yaml/sentence_completion.yaml"
with open(filename, 'r') as f:
    raw_data = yaml.safe_load(f)

raw_data

{'raw_task_title': 'Academic Reading Sample Task – Sentence Completion\n',
 'raw_task_subtitle': '',
 'raw_passage_context': '[Note: This is an extract from a Part 2 task about the evolution of birds and their ancestry.]\n',
 'raw_passage_title': 'The origins of birds\n',
 'raw_passage_subtitle': '',
 'raw_passage_text': 'The science of evolutionary relationships has undergone a major change in recent decades. It used \nto be the case that all the features of organisms were important in working out their family tree. \nBut following the work of German entomologist Willi Hennig, many evolutionary scientists now \nbelieve that the only features which carry any useful information are the evolutionary ‘novelties’ \nshared between organisms. Mice, lizards and fish, for example, all have backbones – so the feature \n‘backbone’ tells us nothing about their evolutionary relationship. But the feature ‘four legs’ is \nuseful because it’s an evolutionary novelty – a characteristic shared only bet

In [23]:
def parse_questions_sentence_completion(question_task):
# question_task = raw_data['questions_list'][0]
    question_type, raw_question_instructions, raw_questions_title, raw_questions_text, raw_answers_text =\
        question_task['question_type'],question_task['raw_question_instructions'], question_task['raw_questions_title'], question_task['raw_questions_text'], question_task['raw_answers_text']
    questions = []
    answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')

    #  We will say that any numeric following a new line and followed by a space is a question number
    raw_questions_list = re.split(r'\n', raw_questions_text.strip())
    # tablerows_list = [[r.strip() for r in re.split(r'\|',row) ] for row in raw_tablerows_list]
    answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    for a in answers_list:
        match = answer_pattern.match(a.strip())
        question_number = int(match.group(1))
        correct_answer = answer_pattern.match(a.strip()).group(2)
        questions.append({
            "questionNumber": question_number,
            "correctAnswer": correct_answer
        })
    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "questionTitle": raw_questions_title,
        "questionContent": raw_questions_list,
        "questions": questions
    }

In [25]:
filename = "../components/assets/yaml/sentence_completion.yaml"
parsed_dict = parse_from_yaml_file(filename)
parsed_dict["taskTime"] = 600 # 10 minutes
parsed_json = json.dumps(parsed_dict, indent=1)
print(parsed_json)

filename = "../components/assets/json/sentence_completion.json"
with open(filename, 'w') as f:
    f.write(parsed_json)

{
 "taskTitle": "Academic Reading Sample Task \u2013 Sentence Completion",
 "taskSubtitle": "",
 "passageContext": "[Note: This is an extract from a Part 2 task about the evolution of birds and their ancestry.]",
 "passageTitle": "The origins of birds",
 "passageSubtitle": "",
 "passageText": "The science of evolutionary relationships has undergone a major change in recent decades. It used to be the case that all the features of organisms were important in working out their family tree. But following the work of German entomologist Willi Hennig, many evolutionary scientists now believe that the only features which carry any useful information are the evolutionary \u2018novelties\u2019 shared between organisms. Mice, lizards and fish, for example, all have backbones \u2013 so the feature \u2018backbone\u2019 tells us nothing about their evolutionary relationship. But the feature \u2018four legs\u2019 is useful because it\u2019s an evolutionary novelty \u2013 a characteristic shared only

## Summary Completion


In [26]:
filename = "../components/assets/yaml/summary_completion.yaml"
with open(filename, 'r') as f:
    raw_data = yaml.safe_load(f)

raw_data

{'raw_task_title': 'Academic Reading Sample Task – Summary Completion: selecting words from the text\n',
 'raw_task_subtitle': '',
 'raw_passage_context': '[Note: This is an extract from a Part 3 text about the ‘Plain English’ movement, which \npromotes the use of clear English.]\n‘The Cambridge Encyclopaedia of Language’, David Crystal, 3rd Edition, © Cambridge University \nPress, 2010.\n',
 'raw_passage_title': '',
 'raw_passage_subtitle': '',
 'raw_passage_text': 'The instructions accompanying do-it-yourself products are regularly cited as a source \nof unnecessary expense or frustration. Few companies seem to test their instructions \nby having them followed by a first-time user. Often, essential information is omitted, \nsteps in the construction process are taken for granted, and some degree of special \nknowledge is assumed. This is especially worrying in any fields where failure to \nfollow correct procedures can be dangerous.\nObjections to material in plain English have come 

In [29]:
def parse_questions_summary_completion(question_task):
    # question_task = raw_data['questions_list'][0]
    question_type, raw_question_instructions, raw_questions_title, raw_questions_text, raw_answers_text =\
        question_task['question_type'],question_task['raw_question_instructions'], question_task['raw_questions_title'], question_task['raw_questions_text'], question_task['raw_answers_text']
    questions = []
    answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')

    #  We will say that any numeric following a new line and followed by a space is a question number
    raw_questions_list = re.split(r'\n', remove_newlines(raw_questions_text).strip())
    # tablerows_list = [[r.strip() for r in re.split(r'\|',row) ] for row in raw_tablerows_list]
    answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)

    for a in answers_list:
        match = answer_pattern.match(a.strip())
        question_number = int(match.group(1))
        correct_answer = answer_pattern.match(a.strip()).group(2)
        questions.append({
            "questionNumber": question_number,
            "correctAnswer": correct_answer
        })
    return  {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "questionTitle": raw_questions_title,
        "questionContent": raw_questions_list,
        "questions": questions
    }

In [31]:
filename = "../components/assets/yaml/summary_completion.yaml"
parsed_dict = parse_from_yaml_file(filename)
parsed_dict["taskTime"] = 600 # 10 minutes
parsed_json = json.dumps(parsed_dict, indent=1)
print(parsed_json)

filename = "../components/assets/json/summary_completion.json"
with open(filename, 'w') as f:
    f.write(parsed_json)

{
 "taskTitle": "Academic Reading Sample Task \u2013 Summary Completion: selecting words from the text",
 "taskSubtitle": "",
 "passageContext": "[Note: This is an extract from a Part 3 text about the \u2018Plain English\u2019 movement, which promotes the use of clear English.]\u2018The Cambridge Encyclopaedia of Language\u2019, David Crystal, 3rd Edition, \u00a9 Cambridge University Press, 2010.\n",
 "passageTitle": "",
 "passageSubtitle": "",
 "passageText": "The instructions accompanying do-it-yourself products are regularly cited as a source of unnecessary expense or frustration. Few companies seem to test their instructions by having them followed by a first-time user. Often, essential information is omitted, steps in the construction process are taken for granted, and some degree of special knowledge is assumed. This is especially worrying in any fields where failure to follow correct procedures can be dangerous.\nObjections to material in plain English have come mainly from the 

# Main function

In [30]:
# Function to remove unnecessary newlines
def remove_newlines(raw_text):
    return re.sub(r'(?<!\.)\n', '', raw_text)

def parse_passage_section(data):
    parsed_dict = {}
    parsed_dict["taskTitle"] = remove_newlines(data['raw_task_title'])
    parsed_dict["taskSubtitle"] = remove_newlines(data['raw_task_subtitle'])
    parsed_dict["passageContext"] = remove_newlines(data['raw_passage_context'])
    parsed_dict["passageTitle"] = remove_newlines(data['raw_passage_title'])
    parsed_dict["passageSubtitle"] = remove_newlines(data['raw_passage_subtitle'])
    parsed_dict["passageText"] = remove_newlines(data['raw_passage_text'])
    return parsed_dict

def parse_from_yaml_file(filename):
    # Parse from raw yaml file
    with open(filename, 'r') as f:
        data = yaml.safe_load(f)

    parsed_dict = parse_passage_section(data)
    parsed_dict["questionsList"] = []
    for question_task in data['questions_list']:
        question_type = question_task['question_type']
        if question_type == "multiple_choice":
            questions_list_obj = parse_questions_multiple_choice_select_one(question_task['raw_question_instructions'], question_task['raw_questions_text'], question_task['raw_answers_text'])
        elif question_type == "table_completion":
            questions_list_obj = parse_questions_table_completion(question_task)
        elif question_type == "matching_headings":
            questions_list_obj = parse_questions_matching_headings(question_task)
        elif question_type == "matching_features":
            questions_list_obj = parse_questions_matching_features(question_task)
        elif question_type == "matching_sentence_endings":
            questions_list_obj = parse_questions_matching_sentence_endings(question_task)
        elif question_type == "note_completion":
            questions_list_obj = parse_questions_note_completion(question_task)
        elif question_type == "sentence_completion":
            questions_list_obj = parse_questions_sentence_completion(question_task)
        elif question_type == "summary_completion":
            questions_list_obj = parse_questions_summary_completion(question_task)
        # Add more question types here
        parsed_dict["questionsList"].append(questions_list_obj)

    return parsed_dict

# filename = "matching_headings.yaml"
# parsed_dict = parse_from_yaml_file(filename)
# parsed_json = json.dumps(parsed_dict, indent=1)
# print(parsed_json)