In [1]:
# Automatically reload imported modules (here: projects_parser) before each
# cell executes, so edits to the helper module take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import re

import projects_parser

In [3]:
# Path of the plain-text booklet that this notebook reads and parses.
BOOKLET_FILE = "../booklets_text/2014.txt"

In [4]:
# Load the whole booklet as a list of lines; each entry keeps its trailing newline.
with open(BOOKLET_FILE, encoding="utf-8") as booklet:
    raw_lines = list(booklet)

In [5]:
# Sanity check: preview the first 20 raw lines and the total line count.
print(raw_lines[:20])
print(len(raw_lines))

['Project Title: Spark – Booking Management System\n', 'Name: Brian Fanning\n', 'Email: brian.fanning4@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Monica Ward\n', 'This web application is designed to be used by a business who repair/service vehicles on a daily basis.\n', 'The system creates and saves bookings, generates timetables for mechanics, and manages inventory.\n', 'Bookings saved in the system are assigned to a mechanic, who can view their timetable for the day on\n', 'their homepage. The timetables are generated by the system based on the bookings assigned to\n', 'mechanics.\n', 'The system can be used by different types of users, who can be granted specific access privileges\n', 'based on their role in the business.\n', '• Receptionists can create new bookings and assign them to mechanics. They can also view and\n', 'print out invoices created by a mechanic for a booking.\n', '• Mechanics can view their timetable for a particular day, and they can also 

In [6]:
# Drop every line that starts with "Project Number"; the numbering is not parsed.
filter_pattern = re.compile(r'^Project Number', re.M)
lines_filtered = [line for line in raw_lines if filter_pattern.match(line) is None]

In [7]:
# Sanity check: preview the first 30 remaining lines and how many are left.
print(lines_filtered[:30])
print(len(lines_filtered))

['Project Title: Spark – Booking Management System\n', 'Name: Brian Fanning\n', 'Email: brian.fanning4@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Monica Ward\n', 'This web application is designed to be used by a business who repair/service vehicles on a daily basis.\n', 'The system creates and saves bookings, generates timetables for mechanics, and manages inventory.\n', 'Bookings saved in the system are assigned to a mechanic, who can view their timetable for the day on\n', 'their homepage. The timetables are generated by the system based on the bookings assigned to\n', 'mechanics.\n', 'The system can be used by different types of users, who can be granted specific access privileges\n', 'based on their role in the business.\n', '• Receptionists can create new bookings and assign them to mechanics. They can also view and\n', 'print out invoices created by a mechanic for a booking.\n', '• Mechanics can view their timetable for a particular day, and they can also 

In [8]:
# Run every remaining line through the parser's character normalisation.

lines = list(map(projects_parser.normalize_characters, lines_filtered))

In [9]:
# Sanity check: preview the first 30 normalised lines and the line count.
print(lines[:30])
print(len(lines))

['Project Title: Spark – Booking Management System\n', 'Name: Brian Fanning\n', 'Email: brian.fanning4@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Monica Ward\n', 'This web application is designed to be used by a business who repair/service vehicles on a daily basis.\n', 'The system creates and saves bookings, generates timetables for mechanics, and manages inventory.\n', 'Bookings saved in the system are assigned to a mechanic, who can view their timetable for the day on\n', 'their homepage. The timetables are generated by the system based on the bookings assigned to\n', 'mechanics.\n', 'The system can be used by different types of users, who can be granted specific access privileges\n', 'based on their role in the business.\n', '• Receptionists can create new bookings and assign them to mechanics. They can also view and\n', 'print out invoices created by a mechanic for a booking.\n', '• Mechanics can view their timetable for a particular day, and they can also 

In [10]:
# Group the flat list of lines into a list with one newline-joined string per project.

projects_strings = projects_parser.lines_to_projects(lines)

In [11]:
# Sanity check: preview the first three project strings and the project count.
print(projects_strings[:3])
print(len(projects_strings))

['Project Title: Spark – Booking Management System\nName: Brian Fanning\nEmail: brian.fanning4@mail.dcu.ie\nProgramme: Computer Applications\nSupervisor: Monica Ward\nThis web application is designed to be used by a business who repair/service vehicles on a daily basis.\nThe system creates and saves bookings, generates timetables for mechanics, and manages inventory.\nBookings saved in the system are assigned to a mechanic, who can view their timetable for the day on\ntheir homepage. The timetables are generated by the system based on the bookings assigned to\nmechanics.\nThe system can be used by different types of users, who can be granted specific access privileges\nbased on their role in the business.\n• Receptionists can create new bookings and assign them to mechanics. They can also view and\nprint out invoices created by a mechanic for a booking.\n• Mechanics can view their timetable for a particular day, and they can also search through the\ninventory system to remove any inven

In [12]:
# Reference ("x", "y", "z" stand for literal delimiter text):
# x(.*)y          - extract the text between x and the LAST y (greedy)
# x(.*)$          - extract all the text after x
# x(.*?)y         - extract the text between x and the FIRST y (as little text as possible)
# x(.*?)(?:y|z)   - extract the text between x and either y or z
#
# Maps each output field to a pattern with a single capture group; the schema is
# handed to projects_parser.parse_projects together with the project strings.
# NOTE(review): captured values span several lines in the parsed output, so the
# parser presumably compiles these with re.DOTALL - confirm in projects_parser.
#
# Every field that is terminated by the next section header uses the non-greedy
# form, so the capture stops at the first occurrence of that header. A greedy
# capture would run on to the last occurrence, e.g. swallowing the supervisor line
# into 'programme' whenever the description itself mentions the word "Supervisor".

regex_schema = {
    'title': r'Project Title:(.*?)Name',
    # Deliberately spans the "Email:" lines; they are split off in a later step.
    'students': r'Name:(.*?)Programme',
    'programme': r'Programme:(.*?)Supervisor',
    'supervisor_and_description': r'Supervisor:(.*?)Project Area',
    'area': r'Project Area:(.*?)Project Technology',
    # "Project Platform" is optional, so fall back to the end of the text.
    'technology': r'Project Technology:(.*?)(?:Project Platform|$)',
    'platform': r'Project Platform:(.*?)$',
}

In [13]:
# Apply the regex schema to every project string, yielding one dict of raw
# (still untrimmed) field values per project.
# NOTE(review): the parser appears to print a "<field> ANOMALY" message for
# projects where a field cannot be extracted - confirm in projects_parser.
projects = projects_parser.parse_projects(projects_strings, regex_schema)

platform ANOMALY: 9 Project Title: Development of a Low-cost, Portable
Photoacoustic Apparatus for Solid Samples
Using High Power Light Emitting Diodes.
Name: Fiachra Kenny
Email: fiachra.kenny5@mail.dcu.ie
Programme: Electronic Engineering
Supervisor: Prof Patrick McNally
Photoacoustic spectroscopy is a method of examining gases liquids and solids by measuring
a photoacoustic signal which is produced by the absorption of light in the sample.
The goal of this project was to develop, build and test a portable photoacoustic setup for solid
samples using high power LEDs. The aim in the development of this setup is to simplify a
conventional photoacoustic setup, reduce the cost and increase its portability.
A large factor in achieving these goals is the use of high power LEDs as a light source instead of a
conventional source such as a laser or Xenon lamp.
The photoacoustic setup developed includes a photoacoustic cell where samples are housed along
with a LED light source and an electret 

In [14]:
# Inspect the raw extracted fields of the first three projects.
projects[:3]

[{'title': ' Spark – Booking Management System\n',
  'students': ' Brian Fanning\nEmail: brian.fanning4@mail.dcu.ie\n',
  'programme': ' Computer Applications\n',
  'supervisor_and_description': ' Monica Ward\nThis web application is designed to be used by a business who repair/service vehicles on a daily basis.\nThe system creates and saves bookings, generates timetables for mechanics, and manages inventory.\nBookings saved in the system are assigned to a mechanic, who can view their timetable for the day on\ntheir homepage. The timetables are generated by the system based on the bookings assigned to\nmechanics.\nThe system can be used by different types of users, who can be granted specific access privileges\nbased on their role in the business.\n• Receptionists can create new bookings and assign them to mechanics. They can also view and\nprint out invoices created by a mechanic for a booking.\n• Mechanics can view their timetable for a particular day, and they can also search throug

In [15]:
# Turn the raw field dicts into their final form. Judging by the output, this
# trims whitespace, splits 'students' into a list of {'name', 'email'} records
# using the given separators, and splits 'supervisor_and_description' into
# separate 'supervisor' and 'description' fields.
projects_final = projects_parser.canonicalize_projects(projects, name_sep='Name:', email_sep='Email:')

In [16]:
# Sanity check: preview the first three canonical projects and the project count.
print(projects_final[:3])
print(len(projects_final))

[{'title': 'Spark – Booking Management System', 'students': [{'name': 'Brian Fanning', 'email': 'brian.fanning4@mail.dcu.ie'}], 'programme': 'Computer Applications', 'area': 'Web Application', 'technology': 'Spring MVC + Spring Security, MySQL, Hibernate, Bootstrap CSS, JSTL, JUnit, Selenium', 'platform': 'Unix/Linux', 'supervisor': 'Monica Ward', 'description': 'This web application is designed to be used by a business who repair/service vehicles on a daily basis. The system creates and saves bookings, generates timetables for mechanics, and manages inventory. Bookings saved in the system are assigned to a mechanic, who can view their timetable for the day on their homepage. The timetables are generated by the system based on the bookings assigned to mechanics. The system can be used by different types of users, who can be granted specific access privileges based on their role in the business. • Receptionists can create new bookings and assign them to mechanics. They can also view and

In [17]:
# Persist the canonical projects as JSON under the '2014' label.
# NOTE(review): the output path/filename is decided inside projects_parser.write_json.
projects_parser.write_json('2014', projects_final)