In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import re

import projects_parser

In [3]:
# Relative path of the plain-text booklet that this notebook reads and parses.
BOOKLET_FILE = "../booklets_text/2017.txt"

In [4]:
# Read the booklet as UTF-8 text into a list with one entry per line;
# iterating the file object keeps each line's trailing newline.
with open(BOOKLET_FILE, encoding="utf-8") as f:
    raw_lines = list(f)

In [5]:
# Peek at the first 20 raw lines to check the file was read as expected.
print(raw_lines[:20])

['Project Number: 1\n', 'Project Title: Virtual society\n', 'Name: Jingyun Xu\n', 'Email: jingyun.xu5@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Alistair.Sutherland@dcu.ie\n', 'Virtual society is an application that could simulate virtual humans to solve tasks with different behaviours in a\n', 'virtual environment. Different humans in groups with different personalities will make different decisions in a\n', 'same event. Some are cooperative and others are aggressive. This project is about to simulate humans which\n', 'have different strategy try to get higher rewards in a society, then find out which strategy will works out best in\n', 'long term.\n', 'Project Area: Artificial Intelligence\n', 'Project Technology: C#, SQLite\n', 'Project Number: 2\n', 'Project Title: Design and evaluation of a testing unit to measure sealing pressure of fasteners joints\n', 'project\n', 'Name: Pádraig Wall\n', 'Email: padraig.wall5@mail.dcu.ie\n', 'Programme: Mechanical and Ma

In [6]:
# Discard every line that starts with "Project Number"; all other lines are
# kept unchanged and in their original order.
filter_pattern = re.compile(r'^Project Number', re.M)
lines_filtered = [
    raw_line
    for raw_line in raw_lines
    if filter_pattern.match(raw_line) is None
]

In [7]:
# Show the first 30 remaining lines to confirm the numbering lines are gone.
print(lines_filtered[:30])

['Project Title: Virtual society\n', 'Name: Jingyun Xu\n', 'Email: jingyun.xu5@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Alistair.Sutherland@dcu.ie\n', 'Virtual society is an application that could simulate virtual humans to solve tasks with different behaviours in a\n', 'virtual environment. Different humans in groups with different personalities will make different decisions in a\n', 'same event. Some are cooperative and others are aggressive. This project is about to simulate humans which\n', 'have different strategy try to get higher rewards in a society, then find out which strategy will works out best in\n', 'long term.\n', 'Project Area: Artificial Intelligence\n', 'Project Technology: C#, SQLite\n', 'Project Title: Design and evaluation of a testing unit to measure sealing pressure of fasteners joints\n', 'project\n', 'Name: Pádraig Wall\n', 'Email: padraig.wall5@mail.dcu.ie\n', 'Programme: Mechanical and Manufacturing Engineering\n', 'Supervisor: Dermo

In [8]:
# Pass every filtered line through projects_parser.normalize_characters
# (the exact character substitutions are defined in that module).
lines = list(map(projects_parser.normalize_characters, lines_filtered))

In [9]:
# First 30 normalized lines, then the total line count on its own line.
print(lines[:30], len(lines), sep='\n')

['Project Title: Virtual society\n', 'Name: Jingyun Xu\n', 'Email: jingyun.xu5@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Alistair.Sutherland@dcu.ie\n', 'Virtual society is an application that could simulate virtual humans to solve tasks with different behaviours in a\n', 'virtual environment. Different humans in groups with different personalities will make different decisions in a\n', 'same event. Some are cooperative and others are aggressive. This project is about to simulate humans which\n', 'have different strategy try to get higher rewards in a society, then find out which strategy will works out best in\n', 'long term.\n', 'Project Area: Artificial Intelligence\n', 'Project Technology: C#, SQLite\n', 'Project Title: Design and evaluation of a testing unit to measure sealing pressure of fasteners joints\n', 'project\n', 'Name: Pádraig Wall\n', 'Email: padraig.wall5@mail.dcu.ie\n', 'Programme: Mechanical and Manufacturing Engineering\n', 'Supervisor: Dermo

In [10]:
# Group the flat list of lines into one multi-line string per project.
# Judging by the printed result, each string starts at a "Project Title:"
# line -- NOTE(review): the actual split rule lives in
# projects_parser.lines_to_projects; confirm there.

projects_strings = projects_parser.lines_to_projects(lines)

In [11]:
# First three project strings, then the number of projects on its own line.
print(projects_strings[:3], len(projects_strings), sep='\n')

['Project Title: Virtual society\nName: Jingyun Xu\nEmail: jingyun.xu5@mail.dcu.ie\nProgramme: Computer Applications\nSupervisor: Alistair.Sutherland@dcu.ie\nVirtual society is an application that could simulate virtual humans to solve tasks with different behaviours in a\nvirtual environment. Different humans in groups with different personalities will make different decisions in a\nsame event. Some are cooperative and others are aggressive. This project is about to simulate humans which\nhave different strategy try to get higher rewards in a society, then find out which strategy will works out best in\nlong term.\nProject Area: Artificial Intelligence\nProject Technology: C#, SQLite\n', 'Project Title: Design and evaluation of a testing unit to measure sealing pressure of fasteners joints\nproject\nName: Pádraig Wall\nEmail: padraig.wall5@mail.dcu.ie\nProgramme: Mechanical and Manufacturing Engineering\nSupervisor: Dermot.Brabazon@dcu.ie\nThis project was to design and oversee the ma

243

In [13]:
# Field-extraction patterns, one per output key; capture group 1 is the raw
# field text. They are handed to projects_parser.parse_projects.
# NOTE(review): captured values span several lines in the parsed output, so
# '.' is presumably matched with re.DOTALL inside parse_projects -- confirm.
#
# Every terminator is the next field's label *including its colon*. The bare
# word alone (e.g. "Name", "Supervisor") can also occur inside a value, which
# would end the field too early; the colon form is the same text the next
# pattern already requires as its starting anchor.
regex_schema = {
    'title': r'Project Title:(.*?)Name:',
    # Still contains the "Email:" lines (and any further "Name:" lines);
    # it is split into individual students in a later step.
    'students': r'Name:(.*?)Programme:',
    # Lazy, so the match stops at the first "Supervisor:" label rather than
    # running on to a later occurrence of that text.
    'programme': r'Programme:(.*?)Supervisor:',
    # Left greedy: runs up to the last "Project Area:" label, so that phrase
    # inside a description does not cut the description short.
    'supervisor_and_description': r'Supervisor:(.*)Project Area:',
    # The "Project Technology:" field may be absent; then the area runs to
    # the end of the project text.
    'area': r'Project Area:(.*?)(?:Project Technology:|$)',
    'technology': r'Project Technology:(.*?)$',
}

In [14]:
# Apply the regex_schema patterns to every project string. The displayed
# result is a list with one dict per project, keyed by the schema's field
# names, whose values are still raw captured text (leading spaces, newlines).
projects = projects_parser.parse_projects(projects_strings, regex_schema)

In [15]:
# Inspect the first three raw parsed records.
projects[:3]

[{'title': ' Virtual society\n',
  'students': ' Jingyun Xu\nEmail: jingyun.xu5@mail.dcu.ie\n',
  'programme': ' Computer Applications\n',
  'supervisor_and_description': ' Alistair.Sutherland@dcu.ie\nVirtual society is an application that could simulate virtual humans to solve tasks with different behaviours in a\nvirtual environment. Different humans in groups with different personalities will make different decisions in a\nsame event. Some are cooperative and others are aggressive. This project is about to simulate humans which\nhave different strategy try to get higher rewards in a society, then find out which strategy will works out best in\nlong term.\n',
  'area': ' Artificial Intelligence\n',
  'technology': ' C#, SQLite'},
 {'title': ' Design and evaluation of a testing unit to measure sealing pressure of fasteners joints\nproject\n',
  'students': ' Pádraig Wall\nEmail: padraig.wall5@mail.dcu.ie\n',
  'programme': ' Mechanical and Manufacturing Engineering\n',
  'supervisor_a

In [16]:
# Clean up the raw records. Per the displayed result this trims whitespace,
# joins wrapped lines, turns 'students' into a list of {'name', 'email'} dicts
# and splits 'supervisor_and_description' into 'supervisor' and 'description'.
# name_sep / email_sep are the labels found inside the raw 'students' text --
# NOTE(review): exact splitting rules are in projects_parser; confirm there.
projects_final = projects_parser.canonicalize_projects(projects, name_sep='Name:', email_sep='Email:')

In [17]:
# Inspect the first three cleaned-up records.
projects_final[:3]

[{'title': 'Virtual society',
  'students': [{'name': 'Jingyun Xu', 'email': 'jingyun.xu5@mail.dcu.ie'}],
  'programme': 'Computer Applications',
  'area': 'Artificial Intelligence',
  'technology': 'C#, SQLite',
  'supervisor': 'Alistair.Sutherland@dcu.ie',
  'description': 'Virtual society is an application that could simulate virtual humans to solve tasks with different behaviours in a virtual environment. Different humans in groups with different personalities will make different decisions in a same event. Some are cooperative and others are aggressive. This project is about to simulate humans which have different strategy try to get higher rewards in a society, then find out which strategy will works out best in long term.'},
 {'title': 'Design and evaluation of a testing unit to measure sealing pressure of fasteners joints project',
  'students': [{'name': 'Pádraig Wall', 'email': 'padraig.wall5@mail.dcu.ie'}],
  'programme': 'Mechanical and Manufacturing Engineering',
  'area': 

In [18]:
# Sanity check: the count should equal the number of project strings
# produced by the split step (243 in the recorded run), i.e. nothing was lost.
len(projects_final)

243

In [19]:
# Persist the final records as JSON via projects_parser.write_json.
# NOTE(review): '2017' is presumably used to name the output file -- the
# destination path is decided inside projects_parser; confirm there.
projects_parser.write_json('2017', projects_final)