In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re

import projects_parser

In [3]:
# Path (relative to this notebook) of the plain-text booklet parsed below.
BOOKLET_FILE = "../booklets_text/2012.txt"

In [4]:
# Load the booklet as a list of lines; each entry keeps its trailing newline.
with open(BOOKLET_FILE, encoding="utf-8") as booklet_fh:
    raw_lines = list(booklet_fh)

In [5]:
# Peek at the first 20 raw lines and the total line count.
print(raw_lines[:20])
print(len(raw_lines))

['Project Title: Manufacture and Calibration of Line Scales for Quality Control\n', 'Name: Eoghan McManus\n', 'Email: eoghan.mcmanus5@mail.dcu.ie\n', 'Programme: Mechatronic Engineering\n', 'Supervisor: Dr. Dermot Brabazon\n', 'In the field of engineering, there is a constant need for higher standards of quality. In measurement,\n', 'accuracy and precision must continuously improve to provide these higher standards for various\n', 'applications.\n', 'This project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are\n', 'blocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily\n', 'used for the calibration of optical devices. Calibrated optical devices are used for quality control; in\n', 'industries such as biomedical devices, ICT chips and foodstuffs. A laser system, which was originally\n', 'built for the manufacture of microfluidics, was used to manufacture the line scales. The laser is an\n', 'N

In [6]:
# Strip the project-numbering lines, i.e. every line starting with "Project Number".
filter_pattern = re.compile(r'^Project Number', re.MULTILINE)
lines_filtered = [line for line in raw_lines if filter_pattern.match(line) is None]

In [7]:
# Peek at the first 30 lines left after filtering, plus how many remain.
print(lines_filtered[:30])
print(len(lines_filtered))

['Project Title: Manufacture and Calibration of Line Scales for Quality Control\n', 'Name: Eoghan McManus\n', 'Email: eoghan.mcmanus5@mail.dcu.ie\n', 'Programme: Mechatronic Engineering\n', 'Supervisor: Dr. Dermot Brabazon\n', 'In the field of engineering, there is a constant need for higher standards of quality. In measurement,\n', 'accuracy and precision must continuously improve to provide these higher standards for various\n', 'applications.\n', 'This project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are\n', 'blocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily\n', 'used for the calibration of optical devices. Calibrated optical devices are used for quality control; in\n', 'industries such as biomedical devices, ICT chips and foodstuffs. A laser system, which was originally\n', 'built for the manufacture of microfluidics, was used to manufacture the line scales. The laser is an\n', 'N

In [8]:
# Run every filtered line through the parser's character normalisation.
lines = list(map(projects_parser.normalize_characters, lines_filtered))

In [9]:
# Peek at the first 30 normalised lines and the total count.
print(lines[:30])
print(len(lines))

['Project Title: Manufacture and Calibration of Line Scales for Quality Control\n', 'Name: Eoghan McManus\n', 'Email: eoghan.mcmanus5@mail.dcu.ie\n', 'Programme: Mechatronic Engineering\n', 'Supervisor: Dr. Dermot Brabazon\n', 'In the field of engineering, there is a constant need for higher standards of quality. In measurement,\n', 'accuracy and precision must continuously improve to provide these higher standards for various\n', 'applications.\n', 'This project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are\n', 'blocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily\n', 'used for the calibration of optical devices. Calibrated optical devices are used for quality control; in\n', 'industries such as biomedical devices, ICT chips and foodstuffs. A laser system, which was originally\n', 'built for the manufacture of microfluidics, was used to manufacture the line scales. The laser is an\n', 'N

In [10]:
# Group the normalised lines into a list with one string per project.
# In the preview that follows, each entry is a newline-joined block that
# starts at its "Project Title:" line.

projects_strings = projects_parser.lines_to_projects(lines)

In [11]:
# Peek at the first three project strings and the number of projects found.
print(projects_strings[:3])
print(len(projects_strings))

['Project Title: Manufacture and Calibration of Line Scales for Quality Control\nName: Eoghan McManus\nEmail: eoghan.mcmanus5@mail.dcu.ie\nProgramme: Mechatronic Engineering\nSupervisor: Dr. Dermot Brabazon\nIn the field of engineering, there is a constant need for higher standards of quality. In measurement,\naccuracy and precision must continuously improve to provide these higher standards for various\napplications.\nThis project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are\nblocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily\nused for the calibration of optical devices. Calibrated optical devices are used for quality control; in\nindustries such as biomedical devices, ICT chips and foodstuffs. A laser system, which was originally\nbuilt for the manufacture of microfluidics, was used to manufacture the line scales. The laser is an\nNd:YAG 1064nm laser with a maximum output power of 3.

In [12]:
# Field-extraction patterns, keyed by output field name.
# Each pattern captures the text that follows its label, up to the next label
# that may appear in a project entry (or the end of the text for the optional
# trailing fields).
# The raw matches in the parse preview span several lines, so `.` evidently
# matches '\n' when these patterns are applied. Every capture therefore has to
# be lazy, otherwise it runs to the LAST occurrence of its terminator.
regex_schema = {
    'title': r'Project Title:(.*?)Name',
    'students': r'Name:(.*?)Programme',
    # Lazy, like every other field: a greedy `.*` would swallow the supervisor
    # and description whenever the abstract itself mentions "Supervisor".
    'programme': r'Programme:(.*?)Supervisor',
    'supervisor_and_description': r'Supervisor:(.*?)(?:Primary Area:|Secondary Area:|Primary OS:|Primary Technology:|Secondary Technology:|$)',
    'primary_area': r'Primary Area:(.*?)(?:Secondary Area:|Primary OS:|Primary Technology:|Secondary Technology:|$)',
    'secondary_area': r'Secondary Area:(.*?)(?:Primary OS:|Primary Technology:|Secondary Technology:|$)',
    'primary_os': r'Primary OS:(.*?)(?:Primary Technology:|Secondary Technology:|$)',
    # NOTE(review): this terminator has no trailing ':' unlike the other
    # patterns -- confirm whether that is intentional before aligning it.
    'primary_tech': r'Primary Technology:(.*?)(?:Secondary Technology|$)',
    'secondary_tech': r'Secondary Technology:(.*?)$',
}

In [13]:
# Extract the schema fields from every project string. Per the preview below,
# the result is one dict per project keyed by the regex_schema names, with the
# values still raw (leading space, embedded newlines).
projects = projects_parser.parse_projects(projects_strings, regex_schema)

In [14]:
# Peek at the first three raw parsed records.
print(projects[:3])

[{'title': ' Manufacture and Calibration of Line Scales for Quality Control\n', 'students': ' Eoghan McManus\nEmail: eoghan.mcmanus5@mail.dcu.ie\n', 'programme': ' Mechatronic Engineering\n', 'supervisor_and_description': ' Dr. Dermot Brabazon\nIn the field of engineering, there is a constant need for higher standards of quality. In measurement,\naccuracy and precision must continuously improve to provide these higher standards for various\napplications.\nThis project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are\nblocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily\nused for the calibration of optical devices. Calibrated optical devices are used for quality control; in\nindustries such as biomedical devices, ICT chips and foodstuffs. A laser system, which was originally\nbuilt for the manufacture of microfluidics, was used to manufacture the line scales. The laser is an\nNd:YAG 1064nm la

In [15]:
# Clean up the raw records. In the output below the values are stripped,
# 'students' becomes a list of {'name', 'email'} dicts, and the combined
# supervisor/description capture is split into two fields.
# NOTE(review): name_sep / email_sep look like the labels used to split the
# raw 'students' text -- confirm against projects_parser.
projects_final = projects_parser.canonicalize_projects(projects, name_sep='Name:', email_sep='Email:')

In [16]:
# Display the first three canonical records.
projects_final[:3]

[{'title': 'Manufacture and Calibration of Line Scales for Quality Control',
  'students': [{'name': 'Eoghan McManus',
    'email': 'eoghan.mcmanus5@mail.dcu.ie'}],
  'programme': 'Mechatronic Engineering',
  'primary_area': 'Quality Standards',
  'secondary_area': 'Statistial Analysis',
  'primary_os': '',
  'primary_tech': 'Nd:YAG 3.2W Laser',
  'secondary_tech': 'NSAI Metrology',
  'supervisor': 'Dr. Dermot Brabazon',
  'description': 'In the field of engineering, there is a constant need for higher standards of quality. In measurement, accuracy and precision must continuously improve to provide these higher standards for various applications. This project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are blocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily used for the calibration of optical devices. Calibrated optical devices are used for quality control; in industries such as biomedical

In [17]:
# Sanity check: number of projects parsed from this booklet.
len(projects_final)

82

In [18]:
# Reshape the records in three stages. Each stage gets its own name because
# the following cell prints the first record of every stage.
# NOTE(review): argument order looks like (target field, source field(s)) --
# confirm against projects_parser.combine_fields / rename_field.

# Stage 1: primary_area + secondary_area -> area
projects_area_combined = projects_parser.apply_transformation(
    projects_final,
    projects_parser.combine_fields,
    'area',
    'primary_area',
    'secondary_area',
)

# Stage 2: primary_tech + secondary_tech -> technology
projects_tech_combined = projects_parser.apply_transformation(
    projects_area_combined,
    projects_parser.combine_fields,
    'technology',
    'primary_tech',
    'secondary_tech',
)

# Stage 3: primary_os -> platform
projects_platform = projects_parser.apply_transformation(
    projects_tech_combined,
    projects_parser.rename_field,
    'platform',
    'primary_os',
)

In [19]:
# Print the first record after each transformation stage, in pipeline order.
for stage_projects in (projects_area_combined, projects_tech_combined, projects_platform):
    print(stage_projects[0])

{'title': 'Manufacture and Calibration of Line Scales for Quality Control', 'students': [{'name': 'Eoghan McManus', 'email': 'eoghan.mcmanus5@mail.dcu.ie'}], 'programme': 'Mechatronic Engineering', 'primary_os': '', 'primary_tech': 'Nd:YAG 3.2W Laser', 'secondary_tech': 'NSAI Metrology', 'supervisor': 'Dr. Dermot Brabazon', 'description': 'In the field of engineering, there is a constant need for higher standards of quality. In measurement, accuracy and precision must continuously improve to provide these higher standards for various applications. This project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are blocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily used for the calibration of optical devices. Calibrated optical devices are used for quality control; in industries such as biomedical devices, ICT chips and foodstuffs. A laser system, which was originally built for the manufacture of

In [20]:
# Peek at the first three fully transformed records and the record count.
print(projects_platform[:3])
print(len(projects_platform))

[{'title': 'Manufacture and Calibration of Line Scales for Quality Control', 'students': [{'name': 'Eoghan McManus', 'email': 'eoghan.mcmanus5@mail.dcu.ie'}], 'programme': 'Mechatronic Engineering', 'supervisor': 'Dr. Dermot Brabazon', 'description': 'In the field of engineering, there is a constant need for higher standards of quality. In measurement, accuracy and precision must continuously improve to provide these higher standards for various applications. This project aimed to manufacture line scales with micro-meter accuracy, using a laser. Line scales are blocks of material, usually glass, which are marked with lines at set intervals. Line scales are primarily used for the calibration of optical devices. Calibrated optical devices are used for quality control; in industries such as biomedical devices, ICT chips and foodstuffs. A laser system, which was originally built for the manufacture of microfluidics, was used to manufacture the line scales. The laser is an Nd:YAG 1064nm las

In [21]:
# Persist the final records as JSON.
# NOTE(review): '2012' presumably names the output after the booklet year; the
# actual output path is decided inside projects_parser.write_json -- confirm.
projects_parser.write_json('2012', projects_platform)