In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re

import projects_parser

In [3]:
# Path of the plain-text booklet to parse, relative to the notebook's working directory.
BOOKLET_FILE = "../booklets_text/2015.txt"

In [4]:
# Read the whole booklet as UTF-8 into a list of lines.
# readlines() keeps the trailing '\n' on every line.
with open(BOOKLET_FILE, encoding="utf-8") as f:
    raw_lines = f.readlines()

In [5]:
# Sanity check: show the first 20 raw lines.
print(raw_lines[0:20])

['1\n', 'Project Title: Beet Washer Automation\n', 'Name: Jason Cross\n', 'Email: jason.cross3@mail.dcu.ie\n', 'Programme: ME\n', 'Supervisor: Harold Esmonde\n', 'Beet Washers are used in the production of Biogas. Beet is left to rot and ferment to create gas and in turn run a\n', 'generator to create electricity. All beet washers are hydraulically powered with manual controls. Engines are\n', 'currently used to run the machines and emissions are high. The aim of this project is to create an emission free\n', 'machine which is fully automated. This involves the use of a 90kw 3 phase motor and IFM automation system.\n', 'The automation system uses IFM PLC, speed sensors, oil pressure sensors and 7” screen.\n', 'Project Area: Automation\n', 'Project Technology: PLC\n', '2\n', 'Project Title: CoolWall\n', 'Name: Stephan McLean\n', 'Email: stephan.mclean2@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Renaat Verbruggen\n', 'CoolWall is a web and mobile application which

In [6]:
# Drop the project-numbering lines, i.e. lines that hold nothing but digits.
filter_pattern = re.compile(r'^\d+$', re.M)
lines_filtered = [line for line in raw_lines if filter_pattern.match(line) is None]

In [7]:
# Sanity check: the digit-only numbering lines should no longer appear.
print(lines_filtered[0:30])

['Project Title: Beet Washer Automation\n', 'Name: Jason Cross\n', 'Email: jason.cross3@mail.dcu.ie\n', 'Programme: ME\n', 'Supervisor: Harold Esmonde\n', 'Beet Washers are used in the production of Biogas. Beet is left to rot and ferment to create gas and in turn run a\n', 'generator to create electricity. All beet washers are hydraulically powered with manual controls. Engines are\n', 'currently used to run the machines and emissions are high. The aim of this project is to create an emission free\n', 'machine which is fully automated. This involves the use of a 90kw 3 phase motor and IFM automation system.\n', 'The automation system uses IFM PLC, speed sensors, oil pressure sensors and 7” screen.\n', 'Project Area: Automation\n', 'Project Technology: PLC\n', 'Project Title: CoolWall\n', 'Name: Stephan McLean\n', 'Email: stephan.mclean2@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Renaat Verbruggen\n', 'CoolWall is a web and mobile application which allows users 

In [8]:
# Normalise special characters line by line via projects_parser.normalize_characters.
# The exact mapping lives in projects_parser; in the recorded output a curly
# double quote (7” screen) comes back as a plain apostrophe (7' screen).

lines = [projects_parser.normalize_characters(line) for line in lines_filtered]

In [9]:
# Sanity check: show the first 30 normalised lines and the total line count.
print(lines[0:30])
print(len(lines))

['Project Title: Beet Washer Automation\n', 'Name: Jason Cross\n', 'Email: jason.cross3@mail.dcu.ie\n', 'Programme: ME\n', 'Supervisor: Harold Esmonde\n', 'Beet Washers are used in the production of Biogas. Beet is left to rot and ferment to create gas and in turn run a\n', 'generator to create electricity. All beet washers are hydraulically powered with manual controls. Engines are\n', 'currently used to run the machines and emissions are high. The aim of this project is to create an emission free\n', 'machine which is fully automated. This involves the use of a 90kw 3 phase motor and IFM automation system.\n', "The automation system uses IFM PLC, speed sensors, oil pressure sensors and 7' screen.\n", 'Project Area: Automation\n', 'Project Technology: PLC\n', 'Project Title: CoolWall\n', 'Name: Stephan McLean\n', 'Email: stephan.mclean2@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Renaat Verbruggen\n', "CoolWall is a web and mobile application which allows users 

In [10]:
# Group the flat list of lines into one string per project.
# Per the recorded output, each string starts at its "Project Title:" line
# and keeps the original newlines.

projects_strings = projects_parser.lines_to_projects(lines)

In [11]:
# Sanity check: show the first three per-project strings.
print(projects_strings[0:3])

["Project Title: Beet Washer Automation\nName: Jason Cross\nEmail: jason.cross3@mail.dcu.ie\nProgramme: ME\nSupervisor: Harold Esmonde\nBeet Washers are used in the production of Biogas. Beet is left to rot and ferment to create gas and in turn run a\ngenerator to create electricity. All beet washers are hydraulically powered with manual controls. Engines are\ncurrently used to run the machines and emissions are high. The aim of this project is to create an emission free\nmachine which is fully automated. This involves the use of a 90kw 3 phase motor and IFM automation system.\nThe automation system uses IFM PLC, speed sensors, oil pressure sensors and 7' screen.\nProject Area: Automation\nProject Technology: PLC\n", "Project Title: CoolWall\nName: Stephan McLean\nEmail: stephan.mclean2@mail.dcu.ie\nProgramme: Computer Applications\nSupervisor: Renaat Verbruggen\nCoolWall is a web and mobile application which allows users to collaborate on an online 'Wall' similar to how\nthey would on

In [12]:
# Number of projects found; later stages should report the same count.
len(projects_strings)

161

In [13]:
# Extraction patterns keyed by output field name. Each pattern captures the
# text between one booklet label and the next one. The recorded output shows
# captures spanning several lines, so projects_parser.parse_projects presumably
# applies them with DOTALL semantics (confirm in projects_parser).
regex_schema = {
    'title': r'Project Title:(.*?)Name',
    # Deliberately runs up to "Programme" so that the "Email:" lines (and any
    # further "Name:" lines) stay in the capture for later splitting.
    'students': r'Name:(.*?)Programme',
    # Lazy and anchored on the "Supervisor:" label. A greedy (.*) would run to
    # the LAST "Supervisor" in the project text, so a description mentioning
    # the word would drag the supervisor name and description into this field.
    'programme': r'Programme:(.*?)Supervisor:',
    # Greedy on purpose: the capture ends at the last "Project Area", so the
    # free-text description is kept whole; it is split from the supervisor later.
    'supervisor_and_description': r'Supervisor:(.*)Project Area',
    # "Project Technology" is optional; fall back to end of input when absent.
    'area': r'Project Area:(.*?)(?:Project Technology|$)',
    'technology': r'Project Technology:(.*?)$',
}

In [14]:
# Apply every pattern in regex_schema to each project string.
# Per the recorded output, the result is one dict per project holding the raw
# captures, with leading spaces and trailing newlines still present.
projects = projects_parser.parse_projects(projects_strings, regex_schema)

In [15]:
# Sanity check: show the raw captures for the first three projects.
projects[0:3]

[{'title': ' Beet Washer Automation\n',
  'students': ' Jason Cross\nEmail: jason.cross3@mail.dcu.ie\n',
  'programme': ' ME\n',
  'supervisor_and_description': " Harold Esmonde\nBeet Washers are used in the production of Biogas. Beet is left to rot and ferment to create gas and in turn run a\ngenerator to create electricity. All beet washers are hydraulically powered with manual controls. Engines are\ncurrently used to run the machines and emissions are high. The aim of this project is to create an emission free\nmachine which is fully automated. This involves the use of a 90kw 3 phase motor and IFM automation system.\nThe automation system uses IFM PLC, speed sensors, oil pressure sensors and 7' screen.\n",
  'area': ' Automation\n',
  'technology': ' PLC'},
 {'title': ' CoolWall\n',
  'students': ' Stephan McLean\nEmail: stephan.mclean2@mail.dcu.ie\n',
  'programme': ' Computer Applications\n',
  'supervisor_and_description': " Renaat Verbruggen\nCoolWall is a web and mobile applica

In [16]:
# Convert the raw captures into the final record shape. Per the recorded output:
# values are stripped, 'students' becomes a list of {'name', 'email'} dicts, and
# 'supervisor_and_description' is split into 'supervisor' and a single-line 'description'.
# name_sep / email_sep are the labels found inside the 'students' capture.
projects_final = projects_parser.canonicalize_projects(projects, name_sep="Name:", email_sep="Email:")

In [17]:
# Sanity check: show the first three canonicalised projects.
projects_final[0:3]

[{'title': 'Beet Washer Automation',
  'students': [{'name': 'Jason Cross', 'email': 'jason.cross3@mail.dcu.ie'}],
  'programme': 'ME',
  'area': 'Automation',
  'technology': 'PLC',
  'supervisor': 'Harold Esmonde',
  'description': "Beet Washers are used in the production of Biogas. Beet is left to rot and ferment to create gas and in turn run a generator to create electricity. All beet washers are hydraulically powered with manual controls. Engines are currently used to run the machines and emissions are high. The aim of this project is to create an emission free machine which is fully automated. This involves the use of a 90kw 3 phase motor and IFM automation system. The automation system uses IFM PLC, speed sensors, oil pressure sensors and 7' screen."},
 {'title': 'CoolWall',
  'students': [{'name': 'Stephan McLean',
    'email': 'stephan.mclean2@mail.dcu.ie'}],
  'programme': 'Computer Applications',
  'area': 'Web_Application',
  'technology': 'JavaScript',
  'supervisor': 'Ren

In [18]:
# Should equal len(projects_strings): no project may be lost during parsing.
len(projects_final)

161

In [19]:
# Extra clean-up for artefacts of the underlying PDF: stray underscores, and commas with no space after them
def underscores_to_spaces(project, fields):
    """Return a shallow copy of ``project`` with underscores turned into spaces.

    Args:
        project: dict describing one project; left unmodified.
        fields: iterable of keys whose string values should be cleaned.

    Returns:
        A new dict with the same keys, where each value listed in ``fields``
        has every ``_`` replaced by a single space.

    Raises:
        KeyError: if a name in ``fields`` is not a key of ``project``.
    """
    cleaned = {name: project[name].replace('_', ' ') for name in fields}
    # Every cleaned key already exists in ``project``, so key order is kept.
    return {**project, **cleaned}

def space_commas(project, fields):
    """Return a shallow copy of ``project`` with a space inserted after tight commas.

    Args:
        project: dict describing one project; left unmodified.
        fields: iterable of keys whose string values should be cleaned.

    Returns:
        A new dict with the same keys, where each value listed in ``fields``
        has a space inserted after every comma that is directly followed by a
        non-whitespace character. A comma at the very end stays as it is.

    Raises:
        KeyError: if a name in ``fields`` is not a key of ``project``.
    """
    # Zero-width position just after a comma whose next character is not whitespace.
    tight_comma = re.compile(r'(?<=,)(?=[^\s])')
    respaced = {name: tight_comma.sub(r' ', project[name]) for name in fields}
    return {**project, **respaced}

In [20]:
# Run both clean-up helpers through projects_parser.apply_transformation,
# which presumably applies the function to every project with the given field list (confirm).
# Underscores are fixed in 'area' only; comma spacing is fixed in 'area' and 'technology'.
# In the recorded output 'Web_Application' becomes 'Web Application'.
projects_no_underscores = projects_parser.apply_transformation(projects_final, underscores_to_spaces, ['area'])
projects_spaced = projects_parser.apply_transformation(projects_no_underscores, space_commas, ['area', 'technology'])

In [21]:
# Sanity check: show the first three cleaned projects and confirm the count is unchanged.
print(projects_spaced[:3])
print(len(projects_spaced))

[{'title': 'Beet Washer Automation', 'students': [{'name': 'Jason Cross', 'email': 'jason.cross3@mail.dcu.ie'}], 'programme': 'ME', 'area': 'Automation', 'technology': 'PLC', 'supervisor': 'Harold Esmonde', 'description': "Beet Washers are used in the production of Biogas. Beet is left to rot and ferment to create gas and in turn run a generator to create electricity. All beet washers are hydraulically powered with manual controls. Engines are currently used to run the machines and emissions are high. The aim of this project is to create an emission free machine which is fully automated. This involves the use of a 90kw 3 phase motor and IFM automation system. The automation system uses IFM PLC, speed sensors, oil pressure sensors and 7' screen."}, {'title': 'CoolWall', 'students': [{'name': 'Stephan McLean', 'email': 'stephan.mclean2@mail.dcu.ie'}], 'programme': 'Computer Applications', 'area': 'Web Application', 'technology': 'JavaScript', 'supervisor': 'Renaat Verbruggen', 'descripti

In [22]:
# Persist the cleaned projects under the name "2015".
# NOTE(review): the output location and file naming are decided inside projects_parser.write_json — confirm there.
projects_parser.write_json("2015", projects_spaced)