In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re

import projects_parser

In [3]:
# Path to the 2011 booklet text file. It is relative, so it resolves against
# the kernel's current working directory.
BOOKLET_FILE = "../booklets_text/2011.txt"

In [4]:
# Load the booklet into memory: one list entry per line, line endings kept.
with open(BOOKLET_FILE, encoding="utf-8") as booklet_file:
    raw_lines = list(booklet_file)

In [5]:
# Sanity check: first 20 raw lines, then the total line count.
print(raw_lines[:20], len(raw_lines), sep="\n")

['Project Title: TMT Project (Take Me There) - MultiModal National\n', 'Public Transport Information System\n', 'Name: Ovidiu Bernaschi\n', 'Email: bernaso2@computing.dcu.ie\n', 'Programme: Information Systems\n', 'Supervisor: Dr. Martin Crane\n', 'TMT Project aims to deliver a public Information System which would generate on request a national\n', 'Multi-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to\n', 'their desired destination within the Republic of Ireland.\n', 'It aims to integrate as many Irish public transport companies as possible, then offer routing and access\n', 'to this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc\n', 'TMT Project received collaboration & support from: Department of Transport Ireland; Former Minister\n', 'for Transport Mr Noel Dempsey TD; National Transport Authority; Transport 21 office; Dublin Chamber\n', 'of Commerce; Irish Rail, Eirebus/Urbus and Dublin 

In [6]:
# Drop the "Project Number ..." lines; they carry no field data.
filter_pattern = re.compile(r'^Project Number', re.M)
lines_filtered = [
    raw_line
    for raw_line in raw_lines
    if filter_pattern.match(raw_line) is None
]

In [7]:
# Sanity check: first 30 filtered lines, then how many lines survived the filter.
print(lines_filtered[:30], len(lines_filtered), sep="\n")

['Project Title: TMT Project (Take Me There) - MultiModal National\n', 'Public Transport Information System\n', 'Name: Ovidiu Bernaschi\n', 'Email: bernaso2@computing.dcu.ie\n', 'Programme: Information Systems\n', 'Supervisor: Dr. Martin Crane\n', 'TMT Project aims to deliver a public Information System which would generate on request a national\n', 'Multi-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to\n', 'their desired destination within the Republic of Ireland.\n', 'It aims to integrate as many Irish public transport companies as possible, then offer routing and access\n', 'to this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc\n', 'TMT Project received collaboration & support from: Department of Transport Ireland; Former Minister\n', 'for Transport Mr Noel Dempsey TD; National Transport Authority; Transport 21 office; Dublin Chamber\n', 'of Commerce; Irish Rail, Eirebus/Urbus and Dublin 

In [8]:
# Transform certain characters: pass every filtered line through the parser's normaliser.
lines = list(map(projects_parser.normalize_characters, lines_filtered))

In [9]:
# Sanity check: first 30 normalised lines, then the total count.
print(lines[:30], len(lines), sep="\n")

['Project Title: TMT Project (Take Me There) - MultiModal National\n', 'Public Transport Information System\n', 'Name: Ovidiu Bernaschi\n', 'Email: bernaso2@computing.dcu.ie\n', 'Programme: Information Systems\n', 'Supervisor: Dr. Martin Crane\n', 'TMT Project aims to deliver a public Information System which would generate on request a national\n', 'Multi-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to\n', 'their desired destination within the Republic of Ireland.\n', 'It aims to integrate as many Irish public transport companies as possible, then offer routing and access\n', 'to this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc\n', 'TMT Project received collaboration & support from: Department of Transport Ireland; Former Minister\n', 'for Transport Mr Noel Dempsey TD; National Transport Authority; Transport 21 office; Dublin Chamber\n', 'of Commerce; Irish Rail, Eirebus/Urbus and Dublin 

In [10]:
# Separate lines into a list of projects.
# Judging by the preview output further down, each project becomes a single
# newline-joined string starting at its "Project Title:" line.

projects_strings = projects_parser.lines_to_projects(lines)

In [11]:
# Sanity check: first 3 project strings, then the number of projects found.
print(projects_strings[:3], len(projects_strings), sep="\n")

["Project Title: TMT Project (Take Me There) - MultiModal National\nPublic Transport Information System\nName: Ovidiu Bernaschi\nEmail: bernaso2@computing.dcu.ie\nProgramme: Information Systems\nSupervisor: Dr. Martin Crane\nTMT Project aims to deliver a public Information System which would generate on request a national\nMulti-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to\ntheir desired destination within the Republic of Ireland.\nIt aims to integrate as many Irish public transport companies as possible, then offer routing and access\nto this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc\nTMT Project received collaboration & support from: Department of Transport Ireland; Former Minister\nfor Transport Mr Noel Dempsey TD; National Transport Authority; Transport 21 office; Dublin Chamber\nof Commerce; Irish Rail, Eirebus/Urbus and Dublin Bikes; IBM Ireland; DCU.\nPresentation website: www.

In [12]:
# Field-extraction patterns, one per output key. Each pattern captures the text
# that follows a field label, up to the next label that may appear (or the end
# of the project text for the optional trailing fields).
# NOTE(review): the captures span several lines, so
# projects_parser.parse_projects presumably applies these with re.DOTALL -- confirm.
regex_schema = {
    # Terminators carry the label's trailing colon so that a title or name that
    # merely contains the word "Name" / "Programme" / "Supervisor" does not cut
    # the capture short.
    'title': r'Project Title:(.*?)Name:',
    'students': r'Name:(.*?)Programme:',
    # Lazy quantifier (like every other field): a greedy `.*` would run on to the
    # LAST "Supervisor" in the project text, swallowing supervisor/description.
    'programme': r'Programme:(.*?)Supervisor:',
    # The remaining labels are optional, so each pattern stops at whichever later
    # label shows up first, or at the end of the text.
    'supervisor_and_description': r'Supervisor:(.*?)(?:Primary Area:|Secondary Area:|Primary OS:|Primary Technology:|Secondary Technology:|$)',
    'primary_area': r'Primary Area:(.*?)(?:Secondary Area:|Primary OS:|Primary Technology:|Secondary Technology:|$)',
    'secondary_area': r'Secondary Area:(.*?)(?:Primary OS:|Primary Technology:|Secondary Technology:|$)',
    'primary_os': r'Primary OS:(.*?)(?:Primary Technology:|Secondary Technology:|$)',
    # Terminator now includes the colon, consistent with the patterns above.
    'primary_tech': r'Primary Technology:(.*?)(?:Secondary Technology:|$)',
    'secondary_tech': r'Secondary Technology:(.*?)$',
}

In [13]:
# Extract the raw field text from each project string using the patterns in
# regex_schema. As the preview output below shows, values still carry their
# leading space and embedded newlines at this stage.
projects = projects_parser.parse_projects(projects_strings, regex_schema)

In [14]:
# Preview the first three raw parsed projects.
projects[:3]

[{'title': ' TMT Project (Take Me There) - MultiModal National\nPublic Transport Information System\n',
  'students': ' Ovidiu Bernaschi\nEmail: bernaso2@computing.dcu.ie\n',
  'programme': ' Information Systems\n',
  'supervisor_and_description': " Dr. Martin Crane\nTMT Project aims to deliver a public Information System which would generate on request a national\nMulti-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to\ntheir desired destination within the Republic of Ireland.\nIt aims to integrate as many Irish public transport companies as possible, then offer routing and access\nto this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc\nTMT Project received collaboration & support from: Department of Transport Ireland; Former Minister\nfor Transport Mr Noel Dempsey TD; National Transport Authority; Transport 21 office; Dublin Chamber\nof Commerce; Irish Rail, Eirebus/Urbus and Dublin Bikes; IB

In [15]:
# Clean up the raw captures. Judging by the preview output below, this collapses
# whitespace, splits 'students' into name/email pairs on the given separators,
# and separates 'supervisor' from 'description'.
# NOTE(review): confirm the exact rules in projects_parser.canonicalize_projects.
projects_final = projects_parser.canonicalize_projects(projects, name_sep='Name:', email_sep='Email:')

In [16]:
# Preview the first three canonicalised projects.
projects_final[:3]

[{'title': 'TMT Project (Take Me There) - MultiModal National Public Transport Information System',
  'students': [{'name': 'Ovidiu Bernaschi',
    'email': 'bernaso2@computing.dcu.ie'}],
  'programme': 'Information Systems',
  'primary_area': 'Web Application',
  'secondary_area': 'Navigation Information Platform',
  'primary_os': 'Multi-platform',
  'primary_tech': 'Google Maps Javascript API V3',
  'secondary_tech': 'Sencha Ext JS, MySQL, PHP',
  'supervisor': 'Dr. Martin Crane',
  'description': "TMT Project aims to deliver a public Information System which would generate on request a national Multi-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to their desired destination within the Republic of Ireland. It aims to integrate as many Irish public transport companies as possible, then offer routing and access to this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc TMT Project received collabo

In [17]:
# Number of projects after canonicalisation (sanity check).
len(projects_final)

69

In [18]:
def normalize_none(project, fields=('secondary_area', 'secondary_tech')):
    """Return a copy of ``project`` with literal ``'None'`` strings turned into ``None``.

    In 2011 some fields contain the text "None" instead of simply being
    omitted, so the parsed value ends up as the string ``'None'``.

    Args:
        project: Dict mapping field names to parsed values. It is not modified.
        fields: Names of the fields to normalise. Defaults to the two fields
            this notebook cleans up for the 2011 booklet.

    Returns:
        A shallow copy of ``project`` in which every listed field whose value
        equals the string ``'None'`` is set to ``None``. All other values are
        kept as they are; a listed field that is absent stays absent instead
        of raising ``KeyError``.
    """
    project_copy = {**project}
    for field in fields:
        # .get() tolerates records where the optional field was never parsed.
        if project_copy.get(field) == 'None':
            project_copy[field] = None
    return project_copy

In [19]:
# Run normalize_none over the canonicalised projects.
# NOTE(review): apply_transformation presumably calls the function once per
# project and collects the results -- confirm in projects_parser.
projects_none_normalized = projects_parser.apply_transformation(projects_final, normalize_none)

In [20]:
# Sanity check: first 3 projects after 'None' normalisation, then the total count.
print(projects_none_normalized[:3], len(projects_none_normalized), sep="\n")

[{'title': 'TMT Project (Take Me There) - MultiModal National Public Transport Information System', 'students': [{'name': 'Ovidiu Bernaschi', 'email': 'bernaso2@computing.dcu.ie'}], 'programme': 'Information Systems', 'primary_area': 'Web Application', 'secondary_area': 'Navigation Information Platform', 'primary_os': 'Multi-platform', 'primary_tech': 'Google Maps Javascript API V3', 'secondary_tech': 'Sencha Ext JS, MySQL, PHP', 'supervisor': 'Dr. Martin Crane', 'description': "TMT Project aims to deliver a public Information System which would generate on request a national Multi-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to their desired destination within the Republic of Ireland. It aims to integrate as many Irish public transport companies as possible, then offer routing and access to this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc TMT Project received collaboration & support from:

In [21]:
# Reshape the field set in three passes: fold the primary/secondary area pair
# into 'area', fold the primary/secondary technology pair into 'technology',
# then expose 'primary_os' as 'platform'.
# NOTE(review): the exact semantics of combine_fields / rename_field live in
# projects_parser -- confirm there.
projects_area_combined = projects_parser.apply_transformation(
    projects_none_normalized,
    projects_parser.combine_fields,
    'area', 'primary_area', 'secondary_area',
)
projects_tech_combined = projects_parser.apply_transformation(
    projects_area_combined,
    projects_parser.combine_fields,
    'technology', 'primary_tech', 'secondary_tech',
)
projects_platform = projects_parser.apply_transformation(
    projects_tech_combined,
    projects_parser.rename_field,
    'platform', 'primary_os',
)

In [22]:
# Sanity check: first 3 fully transformed projects, then the total count.
print(projects_platform[:3], len(projects_platform), sep="\n")

[{'title': 'TMT Project (Take Me There) - MultiModal National Public Transport Information System', 'students': [{'name': 'Ovidiu Bernaschi', 'email': 'bernaso2@computing.dcu.ie'}], 'programme': 'Information Systems', 'supervisor': 'Dr. Martin Crane', 'description': "TMT Project aims to deliver a public Information System which would generate on request a national Multi-Modal Public Transport Travel Itinerary that will guide commuters from their point of origin to their desired destination within the Republic of Ireland. It aims to integrate as many Irish public transport companies as possible, then offer routing and access to this public IS from all web-compatible devices like smartphones, tablet PCs, laptops, desktops etc TMT Project received collaboration & support from: Department of Transport Ireland; Former Minister for Transport Mr Noel Dempsey TD; National Transport Authority; Transport 21 office; Dublin Chamber of Commerce; Irish Rail, Eirebus/Urbus and Dublin Bikes; IBM Irela

In [23]:
# Persist the final project list as JSON.
# NOTE(review): the '2011' argument presumably selects the output file name or
# location -- confirm in projects_parser.write_json.
projects_parser.write_json('2011', projects_platform)