# Parse and create a directed graph of COMP courses

The idea is to parse the course website and get the prerequisite field for each course, without caring whether it's an AND or OR condition.

In [1]:
from random import randint
from time import sleep

from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Course listing page for the COMP subject; it is fetched with driver.get(url)
# in a later cell. NOTE(review): the "1710" path segment is presumably a term
# code - confirm before pointing this at another semester.
url = "https://w5.ab.ust.hk/wcq/cgi-bin/1710/subject/COMP"

In [3]:
# Start a virtual display and then launch Chrome through Selenium.
print('starting driver...')
# Virtual display created with visible=0 and an 800x600 size; it has to be
# started before the browser is launched below.
display = Display(visible=0, size=(800, 600))
display.start()
driver = webdriver.Chrome()
# Fixed 4-second pause after launching Chrome - presumably to give the
# browser time to finish starting up.
sleep(4)

starting driver...


In [4]:
# Load the course listing page in the browser.
print('getting page...')
driver.get(url)
# randint is inclusive on both ends, so this pauses for 2 or 3 seconds -
# presumably to let the page finish rendering before it is scraped.
sleep(randint(2,3))

getting page...


In [5]:
# Scrape every course block on the loaded page into a dict with the keys
# 'name', 'description', 'dept' and 'prerequisite', collected in all_courses.
all_courses = []
links = []  # never filled in this cell

# XPath, relative to one course <div>, selecting the rows of its attribute table.
COURSE_ATTR_ROWS_XPATH = (
    './*[@class="courseinfo"]/*[@class="courseattr popup"]'
    '/*[@class="popupdetail"]/table/tbody/tr'
)

# Indexing with [0] raises IndexError when the page has no <div id="classes">.
classes_div = driver.find_elements(By.XPATH, '//div[@id="classes"]')[0]

for div in classes_div.find_elements(By.CLASS_NAME, 'course'):
    name = div.find_element(By.TAG_NAME, 'h2').text
    dept = name.split(' ')[0]  # first space-separated token of the heading
    # Both fields stay empty strings when the course has no matching table row.
    description = ""
    prerequisite = ""
    courseinfo = div.find_elements(By.XPATH, COURSE_ATTR_ROWS_XPATH)
    for info in courseinfo:
        # Read the row header once and reuse it for both comparisons, instead
        # of asking the browser for the same <th> twice per row.
        header = info.find_element(By.TAG_NAME, 'th').get_attribute('innerHTML')
        if header == 'DESCRIPTION':
            description = info.find_element(By.TAG_NAME, 'td').get_attribute('innerHTML')
        elif header == 'PRE-REQUISITE':
            prerequisite = info.find_element(By.TAG_NAME, 'td').get_attribute('innerHTML')
    current_course = {
        'name': name,
        'description': description,
        'dept': dept,
        'prerequisite': prerequisite,
    }
    all_courses.append(current_course)
        

In [6]:
# Find prerequisites

### Open Problem

Find a regex that matches COMP, ISOM, or MATH course codes. I couldn't figure it out myself.


#### Sample examples:

```
COMP 2011 OR COMP 2012 OR COMP 2012H
(COMP 3711 OR COMP 3711H) AND (MATH 2111)
COMP 1002 (prior to 2013-14) OR COMP 1004 (prior to 2013-14) OR COMP 1021 OR COMP 1022Q OR ISOM 3230
COMP 1021 OR COMP 1022P OR COMP 1022Q OR ISOM 3230
COMP 2611 OR [ELEC 2300 AND (COMP 1002 (prior to 2013-14) OR COMP 1004 (prior to 2013-14) OR COMP 2011 OR COMP 2012H)]
```

#### Sample outputs:

```python
["COMP 2011","COMP 2012","COMP 2012H"]
["COMP 3711","COMP 3711H","MATH 2111"]
["COMP 1002","COMP 1004","COMP 1021","COMP 1022Q","ISOM 3230"]
["COMP 1021","COMP 1022P","COMP 1022Q","ISOM 3230"]
["COMP 2611","ELEC 2300","COMP 1002","COMP 1004","COMP 2011","COMP 2012H"]
```

### Never mind, found one

Regex: `\w\w\w+ \d\w+`  
Check [https://regex101.com](https://regex101.com) and [http://regex.inginf.units.it](http://regex.inginf.units.it)

In [7]:
import re

# A course code is matched as: three or more word characters, one space, then
# a digit followed by one or more word characters (e.g. "COMP 2012H").
# The pattern is a raw string so the \w and \d escapes reach the regex engine
# unchanged and do not trigger invalid-escape warnings on Python 3.
p = re.compile(r'\w\w\w+ \d\w+')

for course in all_courses:
    # Run the regex once per course; the same list is stored and printed.
    course['prerequisite_list'] = p.findall(course['prerequisite'])
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(course['prerequisite_list'])
    

[]
[]
[]
[]
[u'COMP 1002', u'COMP 1004', u'COMP 1021', u'COMP 1022P']
[u'COMP 1021', u'COMP 1022P', u'COMP 1022Q', u'ISOM 3230', u'ISOM 3320']
[u'COMP 1002', u'COMP 1004', u'COMP 1021', u'COMP 1022Q', u'ISOM 3230']
[u'COMP 1002', u'COMP 1004', u'COMP 1022P', u'COMP 1022Q', u'ISOM 3230', u'ISOM 3320']
[u'COMP 1002', u'COMP 1004', u'COMP 1021', u'COMP 1022P', u'ISOM 3320']
[]
[]
[u'COMP 1021', u'COMP 1022P', u'COMP 1022Q', u'ISOM 3230']
[u'COMP 1004', u'COMP 2011']
[u'COMP 1002', u'COMP 1021', u'COMP 1022P', u'COMP 1022Q', u'ISOM 3230']
[]
[u'COMP 1004', u'COMP 2011', u'COMP 2012H']
[]
[u'MATH 1014', u'MATH 1020', u'MATH 1024']
[u'COMP 2012', u'COMP 2012H']
[u'COMP 2012', u'COMP 2012H']
[u'COMP 2012', u'COMP 2012H']
[u'COMP 2012', u'COMP 2012H']
[u'COMP 2011', u'COMP 2012', u'COMP 2012H']
[u'COMP 2011', u'COMP 2012', u'COMP 2012H']
[u'COMP 2611', u'ELEC 2300', u'COMP 1002', u'COMP 1004', u'COMP 2011', u'COMP 2012H']
[u'COMP 2012', u'COMP 2012H']
[u'COMP 2011', u'COMP 2012', u'COMP 2012H'

In [8]:
import pandas as pd

# Report how many courses were scraped, then tabulate them: one row per
# course dict, one column per dict key.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(len(all_courses))
# all_courses is a list of dicts, which the DataFrame constructor accepts
# directly; DataFrame.from_dict is documented for dict input.
df = pd.DataFrame(all_courses)
df

63


Unnamed: 0,dept,description,name,prerequisite,prerequisite_list
0,COMP,This course is an introduction to computers an...,COMP 1001 - Exploring Multimedia and Internet ...,,[]
1,COMP,This course introduces students to the world o...,COMP 1021 - Introduction to Computer Science (...,,[]
2,COMP,This course is designed to equip students with...,COMP 1022P - Introduction to Computing with Ja...,,[]
3,COMP,This course is designed to equip students with...,COMP 1022Q - Introduction to Computing with Ex...,,[]
4,COMP,This course provides a basic introduction to m...,COMP 1029A - Introduction to Mobile Applicatio...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1021, COMP 1022P]"
5,COMP,This course introduces the C programming langu...,COMP 1029C - C Programming Bridging Course (1 ...,COMP 1021 OR COMP 1022P OR COMP 1022Q OR ISOM ...,"[COMP 1021, COMP 1022P, COMP 1022Q, ISOM 3230,..."
6,COMP,This course introduces the Java programming la...,COMP 1029J - Java Programming Bridging Course ...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1021, COMP 1022Q, ..."
7,COMP,This course introduces the Python programming ...,COMP 1029P - Python Programming Bridging Cours...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1022P, COMP 1022Q,..."
8,COMP,This course introduces the VBA programming lan...,COMP 1029V - Excel VBA Programming Bridging Co...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1021, COMP 1022P, ..."
9,COMP,Full-time internship training for a period of ...,COMP 1991 - Industrial Experience (0 units),,[]


In [9]:
# Tear down in reverse order of start-up: browser first, then the display.
print('closing driver...')
try:
    # Quit the browser while the virtual display it was launched on still exists.
    driver.quit()
finally:
    # Always release the virtual display, even if quitting the browser raised.
    display.stop()
print('closed!')

closing driver...
closed!
