In [68]:
import re
import json
import yaml


## 1. MCQ

In [3]:
# Sample reading passage, kept exactly as copied from the source document.
# Each literal opens with `"""\` so the string does not start with a newline.
# The hard line wraps (and the trailing spaces before most of them) are part of
# the data; remove_newlines() is what joins the wrapped lines back together.

# Task title, wrapped over two lines in the source.
raw_passage_title = """\
Academic Reading Sample Task – Multiple Choice: 
one answer
"""
# Editorial note shown under the title.
raw_passage_subtitle = """\
[Note: This is an extract from a Part 1 text about older people in the workforce.]
"""
# Passage body: four paragraphs separated by blank lines.
raw_passage_text = """\
The general assumption is that older workers are paid more in spite of, rather than because of, 
their productivity. That might partly explain why, when employers are under pressure to cut costs, 
they persuade a 55-year old to take early retirement. Take away seniority-based pay scales, and 
older workers may become a much more attractive employment proposition. But most employers 
and many workers are uncomfortable with the idea of reducing someone’s pay in later life –
although manual workers on piece-rates often earn less as they get older. So retaining the 
services of older workers may mean employing them in different ways.

One innovation was devised by IBM Belgium. Faced with the need to cut staff costs, and having 
decided to concentrate cuts on 55 to 60-year olds, IBM set up a separate company called Skill 
Team, which re-employed any of the early retired who wanted to go on working up to the age of 
60. An employee who joined Skill Team at the age of 55 on a five-year contract would work for 
58% of his time, over the full period, for 88% of his last IBM salary. The company offered services 
to IBM, thus allowing it to retain access to some of the intellectual capital it would otherwise have 
lost.

The best way to tempt the old to go on working may be to build on such ‘bridge’ jobs: part- time or 
temporary employment that creates a more gradual transition from full-time work to retirement. 
Studies have found that, in the United States, nearly half of all men and women who had been in 
full-time jobs in middle age moved into such ‘bridge’ jobs at the end of their working lives. In 
general, it is the best-paid and worst-paid who carry on working. There seem to be two very 
different types of bridge job-holder – those who continue working because they have to and those 
who continue working because they want to, even though they could afford to retire.

If the job market grows more flexible, the old may find more jobs that suit them. Often, they will be 
self-employed. Sometimes, they may start their own businesses: a study by David Storey of 
Warwick University found that in Britain 70% of businesses started by people over 55 survived, 
compared with an overall national average of only 19%. But whatever pattern of employment they 
choose, in the coming years the skills of these ‘grey workers’ will have to be increasingly 
acknowledged and rewarded.
"""

In [48]:
# Function to remove unnecessary newlines
def remove_newlines(raw_text):
    """Undo hard line wraps while keeping sentence-final line breaks.

    A newline that directly follows a full stop is treated as a paragraph
    break and is kept. Every other run of newlines is a wrap artefact and is
    removed. When the text on both sides of such a run is non-whitespace, a
    single space is inserted so the two words are not fused together
    (e.g. ``"life –\\nalthough"`` becomes ``"life – although"``).

    No space is inserted:
      * at the very start or end of the string,
      * when whitespace already sits next to the removed newline(s),
      * after an ASCII hyphen, so a word broken as ``"piece-\\nrates"`` is
        rejoined as ``"piece-rates"``.

    Args:
        raw_text (str): Text containing hard line wraps.

    Returns:
        str: The text with wrap newlines removed.
    """
    def _join(match):
        start, end = match.span()
        # Nothing to separate at the edges of the string.
        if start == 0 or end == len(raw_text):
            return ''
        before, after = raw_text[start - 1], raw_text[end]
        # Existing whitespace already separates the words; a trailing hyphen
        # means the word itself was split across the two lines.
        if before.isspace() or after.isspace() or before == '-':
            return ''
        return ' '

    # The lookbehind keeps the newline that immediately follows a full stop.
    return re.sub(r'(?<!\.)\n+', _join, raw_text)

# Clean the three raw passage fields in one pass, then echo them one per line.
new_title, new_subtitle, new_text = map(
    remove_newlines,
    (raw_passage_title, raw_passage_subtitle, raw_passage_text),
)
print(new_title, new_subtitle, new_text, sep="\n")


Academic Reading Sample Task – Multiple Choice: one answer
[Note: This is an extract from a Part 1 text about older people in the workforce.]
The general assumption is that older workers are paid more in spite of, rather than because of, their productivity. That might partly explain why, when employers are under pressure to cut costs, they persuade a 55-year old to take early retirement. Take away seniority-based pay scales, and older workers may become a much more attractive employment proposition. But most employers and many workers are uncomfortable with the idea of reducing someone’s pay in later life –although manual workers on piece-rates often earn less as they get older. So retaining the services of older workers may mean employing them in different ways.
One innovation was devised by IBM Belgium. Faced with the need to cut staff costs, and having decided to concentrate cuts on 55 to 60-year olds, IBM set up a separate company called Skill Team, which re-employed any of the ear

In [53]:
# Sample multiple-choice task, kept exactly as copied from the source document.
# Each literal opens with `"""\` so the string does not start with a newline.

# Instructions printed above the questions.
raw_question_instructions = """\
Questions 1 – 4
Choose the correct letter, A, B, C or D.
Write the correct letter in boxes 1-4 on your answer sheet.
"""

# Question block: every question is a line starting with its number,
# followed by one line per option starting with the option letter.
raw_questions_text= """\
1 In paragraph one, the writer suggests that companies could consider
A abolishing pay schemes that are based on age.
B avoiding pay that is based on piece-rates.
C increasing pay for older workers.
D equipping older workers with new skills.
2 Skill Team is an example of a company which
A offers older workers increases in salary.
B allows people to continue working for as long as they want.
C allows the expertise of older workers to be put to use.
D treats older and younger workers equally.
3 According to the writer, ‘bridge’ jobs
A tend to attract people in middle-salary ranges.
B are better paid than some full-time jobs.
C originated in the United States.
D appeal to distinct groups of older workers.
4 David Storey’s study found that
A people demand more from their work as they get older.
B older people are good at running their own businesses.
C an increasing number of old people are self-employed.
D few young people have their own businesses.
"""

# Answer key: "<question number> <option letter> ■ <option text>", one per line.
raw_answers_text = """\
1 A ■ abolishing pay schemes that are based on age
2 C ■ allows the expertise of older workers to be put to use
3 D ■ appeal to distinct groups of older workers
4 B ■ older people are good at running their own businesses
"""

In [56]:
# Split the raw answers text into a list with one entry per answer-key line.
# NOTE: the input is raw_answers_text, so despite its name `questions` holds
# answer-key entries here, not question blocks.
# Any number that follows a newline and is followed by whitespace is treated as
# the start of a new entry. The split only consumes newlines that precede a
# number, so the last entry keeps its trailing newline.
questions = re.split(r'\n(?=\d+\s)', raw_answers_text)
questions

['1 A ■ abolishing pay schemes that are based on age',
 '2 C ■ allows the expertise of older workers to be put to use',
 '3 D ■ appeal to distinct groups of older workers',
 '4 B ■ older people are good at running their own businesses\n']

In [57]:
def parse_questions_mcq(raw_question_instructions, raw_questions_text, raw_answers_text):
    """Parse a multiple-choice task into a JSON-friendly dictionary.

    Args:
        raw_question_instructions (str): Instruction text, returned unchanged
            as ``taskDescription``.
        raw_questions_text (str): Question blocks. Each block starts with a
            line ``"<number> <question text>"`` followed by one line per
            option, ``"<LETTER> <option text>"``.
        raw_answers_text (str): Answer key with one entry per line,
            ``"<number> <LETTER> <separator> <answer text>"``.

    Returns:
        dict: ``{"questionType": "multiple_choice", "taskDescription": ...,
        "questions": [...]}`` where each question holds ``questionNumber``
        (int), ``questionText`` (str), ``questionOptions`` (list of
        ``(letter, text)`` tuples) and ``correctAnswer`` (a
        ``(letter, text)`` tuple).

    Raises:
        ValueError: If an option line cannot be parsed, or if a question has
            no parsable entry with the same number in the answer key.
    """
    question_type = "multiple_choice"
    question_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')
    option_pattern = re.compile(r'([A-Z])[^a-zA-Z\d:]+(.*)')
    answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+([A-Z])[^a-zA-Z\d:]+(.*)')
    # Any number that follows a newline and is followed by whitespace starts a new entry.
    entry_boundary = re.compile(r'\n(?=\d+\s)')

    # Strip first: a leading newline would otherwise yield an empty first chunk.
    raw_questions_list = entry_boundary.split(raw_questions_text.strip())
    raw_answers_list = entry_boundary.split(raw_answers_text.strip())

    # Index the answer key by question number so questions and answers are
    # paired by number rather than by their position in the text.
    answers_by_number = {}
    for raw_answer in raw_answers_list:
        answer_match = answer_pattern.match(raw_answer.strip())
        if answer_match:
            answers_by_number[int(answer_match.group(1))] = answer_match.groups()[1:]

    questions = []
    for raw_question in raw_questions_list:
        # Blank lines carry no information; drop them before parsing.
        lines = [line.strip() for line in raw_question.splitlines()]
        lines = [line for line in lines if line]
        if not lines:
            continue

        question_match = question_pattern.match(lines[0])
        if not question_match:
            # Not a numbered question block; skipped.
            continue
        question_number = int(question_match.group(1))
        question_text = question_match.group(2).strip()

        question_options = []
        for option_line in lines[1:]:
            option_match = option_pattern.match(option_line)
            if option_match is None:
                raise ValueError(
                    f"Cannot parse option line {option_line!r} of question {question_number}"
                )
            question_options.append(option_match.groups())

        if question_number not in answers_by_number:
            raise ValueError(f"No answer found for question {question_number}")

        questions.append({
            "questionNumber": question_number,
            "questionText": question_text,
            "questionOptions": question_options,
            "correctAnswer": answers_by_number[question_number]
        })

    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "questions": questions
    }

# Run the MCQ parser on the sample instructions, questions and answer key,
# then print the resulting dictionary.
mcq_json = parse_questions_mcq(
    raw_question_instructions,
    raw_questions_text,
    raw_answers_text,
)
print(mcq_json)


{'questionType': 'multiple_choice', 'taskDescription': 'Questions 1 – 4\nChoose the correct letter, A, B, C or D.\nWrite the correct letter in boxes 1-4 on your answer sheet.\n', 'questions': [{'questionNumber': 1, 'questionText': 'In paragraph one, the writer suggests that companies could consider', 'questionOptions': [('A', 'abolishing pay schemes that are based on age.'), ('B', 'avoiding pay that is based on piece-rates.'), ('C', 'increasing pay for older workers.'), ('D', 'equipping older workers with new skills.')], 'correctAnswer': ('A', 'abolishing pay schemes that are based on age')}, {'questionNumber': 2, 'questionText': 'Skill Team is an example of a company which', 'questionOptions': [('A', 'offers older workers increases in salary.'), ('B', 'allows people to continue working for as long as they want.'), ('C', 'allows the expertise of older workers to be put to use.'), ('D', 'treats older and younger workers equally.')], 'correctAnswer': ('C', 'allows the expertise of older 

In [122]:
def parse_passage_section(data):
    """Build the passage part of the output from the raw fields in ``data``.

    Each raw field is passed through ``remove_newlines`` and stored under its
    camelCase output key.

    Args:
        data (dict): Must contain ``raw_task_title``, ``raw_task_subtitle``,
            ``raw_passage_title``, ``raw_passage_subtitle`` and
            ``raw_passage_text``.

    Returns:
        dict: The cleaned values keyed by ``taskTitle``, ``taskSubtitle``,
        ``passageTitle``, ``passageSubtitle`` and ``passageText``.
    """
    # (output key, input key) pairs, in the order they appear in the result.
    field_map = (
        ("taskTitle", 'raw_task_title'),
        ("taskSubtitle", 'raw_task_subtitle'),
        ("passageTitle", 'raw_passage_title'),
        ("passageSubtitle", 'raw_passage_subtitle'),
        ("passageText", 'raw_passage_text'),
    )
    return {
        output_key: remove_newlines(data[input_key])
        for output_key, input_key in field_map
    }

def parse_from_yaml_file(filename):
    """Load a raw task description from a YAML file and parse it.

    The passage fields are cleaned by ``parse_passage_section``. Every entry
    of ``questions_list`` is then sent to the parser for its
    ``question_type``, and the results are collected under ``questionsList``.

    Args:
        filename (str): Path of the YAML file to read.

    Returns:
        dict: The parsed passage fields plus ``questionsList``, a list with
        one parsed object per supported question task.
    """
    # Decode explicitly as UTF-8: the task text contains non-ASCII characters
    # (dashes, curly quotes) and the platform default encoding is not UTF-8
    # on every system.
    with open(filename, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    parsed_dict = parse_passage_section(data)
    parsed_dict["questionsList"] = []
    for question_task in data['questions_list']:
        question_type = question_task['question_type']
        if question_type == "multiple_choice":
            questions_list_obj = parse_questions_mcq(
                question_task['raw_question_instructions'],
                question_task['raw_questions_text'],
                question_task['raw_answers_text'],
            )
        elif question_type == "table_completion":
            questions_list_obj = parse_questions_table(question_task)
        else:
            # "true_false_notgiven" and any other type have no parser yet,
            # so those tasks are left out of the result.
            # Add more question types here
            continue
        parsed_dict["questionsList"].append(questions_list_obj)

    return parsed_dict

# Parse the sample YAML task and pretty-print it as JSON.
# NOTE: parse_from_yaml_file calls parse_questions_table for "table_completion"
# tasks, and that function is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be run before this one.
filename = "table.yaml"
parsed_dict = parse_from_yaml_file(filename)
# indent=2 gives a human-readable dump; parsed_json is reused when writing the file.
parsed_json = json.dumps(parsed_dict, indent=2)
print(parsed_json)

{
  "taskTitle": "Academic Reading Sample Task \u2013 Table Completion",
  "taskSubtitle": "[Note: This is an extract from an Academic Reading passage on the subject of dung beetles. The text preceding this extract gave some background facts about dung beetles, and went on to describe a decision to introduce non-native varieties to Australia.]",
  "passageTitle": "",
  "passageSubtitle": "",
  "passageText": "Introducing dung1 beetles into a pasture is a simple process: approximately 1,500 beetles are released, a handful at a time, into fresh cow pats2 in the cow pasture. The beetles immediately disappear beneath the pats digging and tunnelling and, if they successfully adapt to their new environment, soon become a permanent, self-sustaining part of the local ecology. In time they multiply and within three or four years the benefits to the pasture are obvious.\nDung beetles work from the inside of the pat so they are sheltered from predators such as birds and foxes. Most species burrow

In [123]:
# Save the JSON text produced above to disk.
# `filename` is rebound here from the YAML input path to the JSON output path.
filename = "table.json"
# parsed_json was built by json.dumps with its default settings, which escape
# non-ASCII characters, so the text written here is plain ASCII.
with open(filename, 'w') as f:
    f.write(parsed_json)

## 2. Table Completion

In [118]:
# Load the raw table-completion task so its fields can be explored cell by cell.
filename = "table.yaml"
# Decode explicitly as UTF-8: the task text contains non-ASCII characters and
# the platform default encoding is not UTF-8 on every system.
with open(filename, 'r', encoding='utf-8') as f:
    raw_data = yaml.safe_load(f)

# Show which top-level fields the YAML file provides.
print(raw_data.keys())

dict_keys(['raw_task_title', 'raw_task_subtitle', 'raw_passage_title', 'raw_passage_subtitle', 'raw_passage_text', 'questions_list'])


In [None]:
# Break the first task's table text into rows (one per line) and then into
# cells (split on "|"), trimming the whitespace around every cell.
raw_questions_text = raw_data['questions_list'][0]['raw_questions_text']
raw_tablerows_list = raw_questions_text.strip().split('\n')
tablerows_list = [
    [cell.strip() for cell in row.split('|')]
    for row in raw_tablerows_list
]
tablerows_list

In [104]:
# Split the first task's answer key into one entry per numbered answer.
# NOTE: this rebinds the module-level name raw_answers_text, which earlier in
# the notebook held the multiple-choice sample answer key.
raw_answers_text = raw_data['questions_list'][0]['raw_answers_text']
# A new entry starts at any number that follows a newline and is followed by
# whitespace; the last entry keeps its trailing newline.
answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text)
answers_list

['9 temperate',
 '10 early spring',
 '11 two to five / 2-5',
 '12 sub-tropical',
 '13 South African tunneling/tunnelling\n']

In [120]:
def parse_questions_table(question_task):
    """Parse a table-completion task into a JSON-friendly dictionary.

    Args:
        question_task (dict): Must contain ``raw_question_instructions``,
            ``raw_questions_text`` (the table, one row per line with cells
            separated by ``|``) and ``raw_answers_text`` (one
            ``"<number> <answer>"`` entry per line).

    Returns:
        dict: ``{"questionType": "table_completion", "taskDescription": ...,
        "questionContent": [[cell, ...], ...], "questions": [...]}`` where
        each question holds ``questionNumber`` (int) and ``correctAnswer``
        (str).

    Raises:
        ValueError: If a non-blank answer entry does not start with a
            question number followed by the answer text.
    """
    question_type = "table_completion"
    raw_question_instructions = question_task['raw_question_instructions']
    raw_questions_text = question_task['raw_questions_text']
    raw_answers_text = question_task['raw_answers_text']
    answer_pattern = re.compile(r'(\d+)[^a-zA-Z\d:]+(.*)')

    # One table row per line; cells are separated by "|" and trimmed.
    raw_tablerows_list = re.split(r'\n', raw_questions_text.strip())
    tablerows_list = [[cell.strip() for cell in re.split(r'\|', row)] for row in raw_tablerows_list]

    # Any number that follows a newline and is followed by whitespace starts a new answer.
    answers_list = re.split(r'\n(?=\d+\s)', raw_answers_text.strip())

    questions = []
    for raw_answer in answers_list:
        answer_entry = raw_answer.strip()
        if not answer_entry:
            # An empty answer key splits into a single empty entry; nothing to parse.
            continue
        match = answer_pattern.match(answer_entry)
        if match is None:
            raise ValueError(f"Cannot parse answer entry {answer_entry!r}")
        questions.append({
            "questionNumber": int(match.group(1)),
            "correctAnswer": match.group(2)
        })

    return {
        "questionType": question_type,
        "taskDescription": raw_question_instructions,
        "questionContent": tablerows_list,
        "questions": questions
    }

# Run the table parser on the first task from the loaded YAML data.
table_json = parse_questions_table(raw_data['questions_list'][0])
# Bare name as the last line so the notebook displays the parsed result.
table_json

{'questionType': 'table_completion',
 'taskDescription': 'Question 9 – 13\nComplete the table below.\nChoose NO MORE THAN THREE WORDS from the passage for each answer.\nWrite your answers in boxes 9-13 on your answer sheet.\n',
 'questionContent': [['Species Size',
   'Preferred climate',
   'Complementary species',
   'Start of active period',
   'Number of generations per year'],
  ['French', '2.5 cm', 'cool', 'Spanish', 'late spring', '1 - 2'],
  ['Spanish', '1.25 cm', '[9]', '', '[10]', '[11]'],
  ['South African ball roller', '', '[12]', '[13]', '', '']],
 'questions': [{'questionNumber': 9, 'correctAnswer': 'temperate'},
  {'questionNumber': 10, 'correctAnswer': 'early spring'},
  {'questionNumber': 11, 'correctAnswer': 'two to five / 2-5'},
  {'questionNumber': 12, 'correctAnswer': 'sub-tropical'},
  {'questionNumber': 13,
   'correctAnswer': 'South African tunneling/tunnelling'}]}