In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import re

import projects_parser

In [3]:
# Path of the 2016 booklet text file that this notebook parses. It is relative,
# so it resolves against the kernel's current working directory.
BOOK_FILE = "../booklets_text/2016.txt"

In [4]:
# Read the whole booklet into memory, one list entry per line; line endings
# are kept on each entry.
with open(BOOK_FILE, encoding="utf-8") as booklet:
    raw_lines = list(booklet)

In [5]:
# Peek at the first 20 raw lines to see the booklet layout.
print(raw_lines[:20])

['1\n', 'Project Title: Crowd Control\n', 'Name: Harry Quigley\n', 'Email: harry.quigley2@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Suzanne Little\n', 'Crowd Control is a people-counting and attendance management system. The system consists of a raspberry\n', 'pi, camera module, cloud backend and a web application. On the pi, using computer vision algorithms, people\n', 'entering/leaving the building are detected. The amount of people that have entered/left and the time stamp is\n', 'then sent to the server. The Crowd Control web application uses this data to display how many people are in a\n', 'venue in real time and represents trends graphically over time.\n', 'Project Area: Computer Vision, Image Video Processing, RaspberryPi, Web Application, Databases\n', 'Project Technology: HTML5, Java, JavaScript, Python, REST, SpringMVC, SQL\n', '2\n', 'Project Title: Assassin\n', 'Name: Sean Kelly\n', 'Email: sean.kelly224@mail.dcu.ie\n', 'Programme: Computer Applica

In [6]:
# Drop booklet layout noise before parsing:
#   * project numbering - lines made up of digits only (e.g. '1\n')
#   * "Final Year Projects Expo 2016" separators (a leading page number on the
#     same line is tolerated)
# `match` only anchors the start of a line, so the digit alternative is closed
# with `$`. Without it, any wrapped description line that merely *starts* with
# a number (e.g. "3D ..." or "10 users ...") would be thrown away too.
filter_pattern = re.compile(r'\d+\s*$|(?:\d+\s*)?Final Year Projects Expo 2016')
lines_filtered = [line for line in raw_lines if not filter_pattern.match(line)]

In [7]:
# Check that the numbering and separator lines are gone.
print(lines_filtered[:30])

['Project Title: Crowd Control\n', 'Name: Harry Quigley\n', 'Email: harry.quigley2@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Suzanne Little\n', 'Crowd Control is a people-counting and attendance management system. The system consists of a raspberry\n', 'pi, camera module, cloud backend and a web application. On the pi, using computer vision algorithms, people\n', 'entering/leaving the building are detected. The amount of people that have entered/left and the time stamp is\n', 'then sent to the server. The Crowd Control web application uses this data to display how many people are in a\n', 'venue in real time and represents trends graphically over time.\n', 'Project Area: Computer Vision, Image Video Processing, RaspberryPi, Web Application, Databases\n', 'Project Technology: HTML5, Java, JavaScript, Python, REST, SpringMVC, SQL\n', 'Project Title: Assassin\n', 'Name: Sean Kelly\n', 'Email: sean.kelly224@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Sup

In [8]:
# Pass every remaining line through projects_parser.normalize_characters
# (character transformation step).
lines = list(map(projects_parser.normalize_characters, lines_filtered))

In [9]:
# Show the first 30 normalised lines, then the total line count.
print(lines[:30], len(lines), sep="\n")

['Project Title: Crowd Control\n', 'Name: Harry Quigley\n', 'Email: harry.quigley2@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Supervisor: Suzanne Little\n', 'Crowd Control is a people-counting and attendance management system. The system consists of a raspberry\n', 'pi, camera module, cloud backend and a web application. On the pi, using computer vision algorithms, people\n', 'entering/leaving the building are detected. The amount of people that have entered/left and the time stamp is\n', 'then sent to the server. The Crowd Control web application uses this data to display how many people are in a\n', 'venue in real time and represents trends graphically over time.\n', 'Project Area: Computer Vision, Image Video Processing, RaspberryPi, Web Application, Databases\n', 'Project Technology: HTML5, Java, JavaScript, Python, REST, SpringMVC, SQL\n', 'Project Title: Assassin\n', 'Name: Sean Kelly\n', 'Email: sean.kelly224@mail.dcu.ie\n', 'Programme: Computer Applications\n', 'Sup

In [10]:
# Separate lines into list of projects. As the printed result below shows, each
# entry is a single string holding one project's lines, starting at its
# 'Project Title:' line.

projects_strings = projects_parser.lines_to_projects(lines)

In [11]:
# Show the first three project strings, then how many projects were found.
print(projects_strings[:3], len(projects_strings), sep="\n")

['Project Title: Crowd Control\nName: Harry Quigley\nEmail: harry.quigley2@mail.dcu.ie\nProgramme: Computer Applications\nSupervisor: Suzanne Little\nCrowd Control is a people-counting and attendance management system. The system consists of a raspberry\npi, camera module, cloud backend and a web application. On the pi, using computer vision algorithms, people\nentering/leaving the building are detected. The amount of people that have entered/left and the time stamp is\nthen sent to the server. The Crowd Control web application uses this data to display how many people are in a\nvenue in real time and represents trends graphically over time.\nProject Area: Computer Vision, Image Video Processing, RaspberryPi, Web Application, Databases\nProject Technology: HTML5, Java, JavaScript, Python, REST, SpringMVC, SQL\n', "Project Title: Assassin\nName: Sean Kelly\nEmail: sean.kelly224@mail.dcu.ie\nProgramme: Computer Applications\nSupervisor: Geoff Hamilton\nAssassin is an Android multiplayer 

In [12]:
# Field-extraction patterns, keyed by output field name. Group 1 of each
# pattern captures the text between a field's label and the label of the field
# that follows it.
# NOTE(review): the captured values span several lines, so
# projects_parser.parse_projects presumably applies these with re.DOTALL - confirm.
#
# Every terminator is the full next label *including its colon*, so a label
# word that appears inside free text (a title such as "Domain Name Finder")
# does not end the field early.
regex_schema = {
    'title': r'Project Title:(.*?)Name:',
    # The capture still contains the 'Email:' line(s); they are separated later
    # by canonicalize_projects.
    'students': r'Name:(.*?)Programme:',
    # Lazy, so the capture stops at the Supervisor label itself rather than at
    # the last mention of "Supervisor" somewhere in the description.
    'programme': r'Programme:(.*?)Supervisor:',
    # Greedy: runs up to the last 'Project Area:' label in the project text.
    'supervisor_and_description': r'Supervisor:(.*)Project Area:',
    # 'Project Technology' may be absent, in which case the area runs to the end.
    'area': r'Project Area:(.*?)(?:Project Technology:|$)',
    'technology': r'Project Technology:(.*?)$',
}

In [13]:
# Extract the regex_schema fields from every project string. The result shown
# below is a list of dicts, one per project, holding the raw captured text
# (leading space and newlines still present) under each schema key.
projects = projects_parser.parse_projects(projects_strings, regex_schema)

In [14]:
# Inspect the raw captures for the first three projects.
projects[:3]

[{'title': ' Crowd Control\n',
  'students': ' Harry Quigley\nEmail: harry.quigley2@mail.dcu.ie\n',
  'programme': ' Computer Applications\n',
  'supervisor_and_description': ' Suzanne Little\nCrowd Control is a people-counting and attendance management system. The system consists of a raspberry\npi, camera module, cloud backend and a web application. On the pi, using computer vision algorithms, people\nentering/leaving the building are detected. The amount of people that have entered/left and the time stamp is\nthen sent to the server. The Crowd Control web application uses this data to display how many people are in a\nvenue in real time and represents trends graphically over time.\n',
  'area': ' Computer Vision, Image Video Processing, RaspberryPi, Web Application, Databases\n',
  'technology': ' HTML5, Java, JavaScript, Python, REST, SpringMVC, SQL'},
 {'title': ' Assassin\n',
  'students': ' Sean Kelly\nEmail: sean.kelly224@mail.dcu.ie\n',
  'programme': ' Computer Applications\n

In [15]:
# Clean up the raw captures. Judging by the output below, this trims whitespace,
# turns 'students' into a list of {'name', 'email'} dicts and splits
# 'supervisor_and_description' into separate 'supervisor' and 'description' fields.
# NOTE(review): name_sep / email_sep are presumably the labels used to split the
# students text - confirm in projects_parser.
projects_final = projects_parser.canonicalize_projects(projects, name_sep="Name:", email_sep="Email:")

In [16]:
# Inspect the cleaned-up records for the first three projects.
projects_final[:3]

[{'title': 'Crowd Control',
  'students': [{'name': 'Harry Quigley',
    'email': 'harry.quigley2@mail.dcu.ie'}],
  'programme': 'Computer Applications',
  'area': 'Computer Vision, Image Video Processing, RaspberryPi, Web Application, Databases',
  'technology': 'HTML5, Java, JavaScript, Python, REST, SpringMVC, SQL',
  'supervisor': 'Suzanne Little',
  'description': 'Crowd Control is a people-counting and attendance management system. The system consists of a raspberry pi, camera module, cloud backend and a web application. On the pi, using computer vision algorithms, people entering/leaving the building are detected. The amount of people that have entered/left and the time stamp is then sent to the server. The Crowd Control web application uses this data to display how many people are in a venue in real time and represents trends graphically over time.'},
 {'title': 'Assassin',
  'students': [{'name': 'Sean Kelly', 'email': 'sean.kelly224@mail.dcu.ie'}],
  'programme': 'Computer Ap

In [17]:
# Number of canonicalised projects - a quick sanity check on the parse.
len(projects_final)

206

In [18]:
# Hand the canonicalised projects to projects_parser.write_json.
# NOTE(review): "2016" presumably selects the output file name/location - confirm
# in projects_parser before re-running for another year.
projects_parser.write_json("2016", projects_final)