# Parse and create an oriented graph of COMP courses

The idea is to parse the course website and get prerequisite field for each course, not caring wether it's an AND or OR condition.

In [1]:
from time import sleep
from random import randint
from selenium import webdriver
from pyvirtualdisplay import Display

In [2]:
url = "https://w5.ab.ust.hk/wcq/cgi-bin/1710/subject/COMP"

In [3]:
print('starting driver...')
display = Display(visible=0, size=(800, 600))
display.start()
driver = webdriver.Chrome()
sleep(4)

starting driver...


In [4]:
print('getting page...')
driver.get(url)
sleep(randint(2,3))

getting page...


In [5]:
all_courses = []
links = []

for div in driver.find_elements_by_xpath('//div[@id="classes"]')[0].find_elements_by_class_name('course'):
    name = div.find_element_by_tag_name('h2').text
    description = ""
    prerequisite = ""
    dept = name.split(' ')[0]
#         print name
    courseinfo = div.find_elements_by_xpath('./*[@class="courseinfo"]/*[@class="courseattr popup"]/*[@class="popupdetail"]/table/tbody/tr')
#         print len(courseinfo)
    for info in courseinfo:
        if info.find_element_by_tag_name('th').get_attribute('innerHTML') == 'DESCRIPTION':
            description = info.find_element_by_tag_name('td').get_attribute('innerHTML')
        if info.find_element_by_tag_name('th').get_attribute('innerHTML') == 'PRE-REQUISITE':
            prerequisite = info.find_element_by_tag_name('td').get_attribute('innerHTML')
#                 print description
    current_course = {
        'name':name,
        'description':description,
        'dept':dept,
        'prerequisite':prerequisite
    }
    all_courses.append(current_course)
        

In [6]:
# Find prerequisites

### Open Problem

Find the Regex to match COMP ISOM or MATH course codes, I couldn't figure it out myself


#### Sample examples:

```
COMP 2011 OR COMP 2012 OR COMP 2012H
(COMP 3711 OR COMP 3711H) AND (MATH 2111)
COMP 1002 (prior to 2013-14) OR COMP 1004 (prior to 2013-14) OR COMP 1021 OR COMP 1022Q OR ISOM 3230
COMP 1021 OR COMP 1022P OR COMP 1022Q OR ISOM 3230
COMP 2611 OR [ELEC 2300 AND (COMP 1002 (prior to 2013-14) OR COMP 1004 (prior to 2013-14) OR COMP 2011 OR COMP 2012H)]
```

#### Sample outputs:

```python
["COMP 2011","COMP 2012","COMP 2012H"]
["COMP 3711","COMP 3711H","MATH 2111"]
["COMP 1002","COMP 1004","COMP 1021","COMP 1022Q","ISOM 3230"]
["COMP 1021","COMP 1022P","COMP 1022Q","ISOM 3230"]
["COMP 2611","ELEC 2300","COMP 1002","COMP 1004","COMP 2011","COMP 2012H"]
```

### Never mind, found one

Regex: `\w\w\w+ \d\w+`  
Check [https://regex101.com](https://regex101.com) and [http://regex.inginf.units.it](http://regex.inginf.units.it)

In [7]:
import re
p = re.compile('\w\w\w+ \d\w+')
html = "<style>.regex_match{color:red}</style>"

for course in all_courses:
    text = course['prerequisite']
    prerequisite_list = p.findall(course['prerequisite'])
    course['prerequisite_list'] = prerequisite_list
    for highlight in prerequisite_list:
        text = re.sub(str(highlight),r'<span class="regex_match">\g<0></span>',text)
    html+=text+'<br>'
#     print text

from IPython.core.display import display, HTML
display(HTML(html))
# small error with Honors courses, they come from the display, issue with "COMP1022Q" which is not labelled as the others.

In [8]:
nodes = []
links = []

for course in all_courses:
    name = course['name'].split(' - ')[0]
    nodes.append(name)
    for highlight in course['prerequisite_list']:
        links.append((name,highlight))
        if highlight not in nodes:
            nodes.append(highlight)

In [9]:
nodes

[u'COMP 1001',
 u'COMP 1021',
 u'COMP 1022P',
 u'COMP 1022Q',
 u'COMP 1029A',
 u'COMP 1002',
 u'COMP 1004',
 u'COMP 1029C',
 u'ISOM 3230',
 u'ISOM 3320',
 u'COMP 1029J',
 u'COMP 1029P',
 u'COMP 1029V',
 u'COMP 1991',
 u'COMP 1999',
 u'COMP 2011',
 u'COMP 2012',
 u'COMP 2012H',
 u'COMP 2521',
 u'COMP 2611',
 u'COMP 2711',
 u'COMP 2711H',
 u'MATH 1014',
 u'MATH 1020',
 u'MATH 1024',
 u'COMP 3021',
 u'COMP 3031',
 u'COMP 3111',
 u'COMP 3111H',
 u'COMP 3211',
 u'COMP 3311',
 u'COMP 3511',
 u'ELEC 2300',
 u'COMP 3632',
 u'COMP 3711',
 u'MATH 2343',
 u'COMP 3711H',
 u'COMP 3721',
 u'MATH 2111',
 u'MATH 2121',
 u'MATH 2350',
 u'COMP 4021',
 u'COMP 4311',
 u'COMP 4331',
 u'IELM 2510',
 u'ISOM 2500',
 u'LIFS 3150',
 u'MATH 2411',
 u'COMP 4421',
 u'MATH 2011',
 u'MATH 2351',
 u'MATH 2352',
 u'COMP 4461',
 u'COMP 4621',
 u'COMP 4651',
 u'COMP 4900',
 u'COMP 4901J',
 u'COMP 4971A',
 u'COMP 4971C',
 u'COMP 4971D',
 u'COMP 4971F',
 u'COMP 4981',
 u'COMP 4981H',
 u'COMP 4988',
 u'COMP 4989',
 u'COMP 

In [10]:
links

[(u'COMP 1029A', u'COMP 1002'),
 (u'COMP 1029A', u'COMP 1004'),
 (u'COMP 1029A', u'COMP 1021'),
 (u'COMP 1029A', u'COMP 1022P'),
 (u'COMP 1029C', u'COMP 1021'),
 (u'COMP 1029C', u'COMP 1022P'),
 (u'COMP 1029C', u'COMP 1022Q'),
 (u'COMP 1029C', u'ISOM 3230'),
 (u'COMP 1029C', u'ISOM 3320'),
 (u'COMP 1029J', u'COMP 1002'),
 (u'COMP 1029J', u'COMP 1004'),
 (u'COMP 1029J', u'COMP 1021'),
 (u'COMP 1029J', u'COMP 1022Q'),
 (u'COMP 1029J', u'ISOM 3230'),
 (u'COMP 1029P', u'COMP 1002'),
 (u'COMP 1029P', u'COMP 1004'),
 (u'COMP 1029P', u'COMP 1022P'),
 (u'COMP 1029P', u'COMP 1022Q'),
 (u'COMP 1029P', u'ISOM 3230'),
 (u'COMP 1029P', u'ISOM 3320'),
 (u'COMP 1029V', u'COMP 1002'),
 (u'COMP 1029V', u'COMP 1004'),
 (u'COMP 1029V', u'COMP 1021'),
 (u'COMP 1029V', u'COMP 1022P'),
 (u'COMP 1029V', u'ISOM 3320'),
 (u'COMP 2011', u'COMP 1021'),
 (u'COMP 2011', u'COMP 1022P'),
 (u'COMP 2011', u'COMP 1022Q'),
 (u'COMP 2011', u'ISOM 3230'),
 (u'COMP 2012', u'COMP 1004'),
 (u'COMP 2012', u'COMP 2011'),
 (u'C

In [11]:
# import matplotlib.pyplot as plt
# import networkx as nx

# G=nx.Graph()
# G.add_nodes_from(nodes)
# pos=nx.spring_layout(G) # positions for all nodes

# # nodes
# nx.draw_networkx_nodes(G,pos,
#                        nodelist=nodes,
#                        node_color='r',
#                        node_size=800,
#                         alpha=0.8)

# # edges
# nx.draw_networkx_edges(G,pos,width=1.0,alpha=0.5)
# nx.draw_networkx_edges(G,pos,
#                        edgelist=links,
#                        width=8,alpha=0.5,edge_color='r')


# # # some math labels
# # labels={}
# # labels[0]=r'$a$'
# # labels[1]=r'$b$'
# # labels[2]=r'$c$'
# # labels[3]=r'$d$'
# # labels[4]=r'$\alpha$'
# # labels[5]=r'$\beta$'
# # labels[6]=r'$\gamma$'
# # labels[7]=r'$\delta$'
# # nx.draw_networkx_labels(G,pos,labels,font_size=16)

# plt.axis('off')
# plt.savefig("labels_and_colors.png") # save as png
# plt.show() # display

In [12]:
# G = nx.connected_watts_strogatz_graph(30,5,.2)
# nodes = G.nodes()
# edges = G.edges()
# # define the initial positions of the nodes using networkx's spring_layout function, and add to the nodes_dict.
import networkx as nx
import matplotlib.pyplot as plt
import visJS2jupyter
from visJS2jupyter import visJS_module

G=nx.Graph()
G.add_nodes_from(nodes)
pos=nx.spring_layout(G) # positions for all nodes
pos = nx.spring_layout(G)
nodes_dict = [{"id":n,
              "x":pos[n][0]*10,
              "y":pos[n][1]*10} for n in nodes]
node_map = dict(zip(nodes,range(len(nodes))))  # map to indices for source/target in edges

edges_dict = [{"source":node_map[links[i][0]], "target":node_map[links[i][1]], 
              "title":'test'} for i in range(len(links))]

visJS2jupyter.visJS_module.visjs_network(nodes_dict,edges_dict)

The previous block of code render the graph below:

![alt text](images/graph.png "Graph")

In [13]:
import pandas as pd
print len(all_courses)
df = pd.DataFrame.from_dict(all_courses)
df

63


Unnamed: 0,dept,description,name,prerequisite,prerequisite_list
0,COMP,This course is an introduction to computers an...,COMP 1001 - Exploring Multimedia and Internet ...,,[]
1,COMP,This course introduces students to the world o...,COMP 1021 - Introduction to Computer Science (...,,[]
2,COMP,This course is designed to equip students with...,COMP 1022P - Introduction to Computing with Ja...,,[]
3,COMP,This course is designed to equip students with...,COMP 1022Q - Introduction to Computing with Ex...,,[]
4,COMP,This course provides a basic introduction to m...,COMP 1029A - Introduction to Mobile Applicatio...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1021, COMP 1022P]"
5,COMP,This course introduces the C programming langu...,COMP 1029C - C Programming Bridging Course (1 ...,COMP 1021 OR COMP 1022P OR COMP 1022Q OR ISOM ...,"[COMP 1021, COMP 1022P, COMP 1022Q, ISOM 3230,..."
6,COMP,This course introduces the Java programming la...,COMP 1029J - Java Programming Bridging Course ...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1021, COMP 1022Q, ..."
7,COMP,This course introduces the Python programming ...,COMP 1029P - Python Programming Bridging Cours...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1022P, COMP 1022Q,..."
8,COMP,This course introduces the VBA programming lan...,COMP 1029V - Excel VBA Programming Bridging Co...,COMP 1002 (prior to 2013-14) OR COMP 1004 (pri...,"[COMP 1002, COMP 1004, COMP 1021, COMP 1022P, ..."
9,COMP,Full-time internship training for a period of ...,COMP 1991 - Industrial Experience (0 units),,[]


In [14]:
nodes_json = []
links_json = []

for node in nodes:
    current_node = {
        'id':node,
        'group':node[5]
    }
    nodes_json.append(current_node)
    
for link in links:
    current_link = {
        'source':link[0],
        'target':link[1],
        'value':1
    }
    links_json.append(current_link)

In [15]:
nodes_json

[{'group': u'1', 'id': u'COMP 1001'},
 {'group': u'1', 'id': u'COMP 1021'},
 {'group': u'1', 'id': u'COMP 1022P'},
 {'group': u'1', 'id': u'COMP 1022Q'},
 {'group': u'1', 'id': u'COMP 1029A'},
 {'group': u'1', 'id': u'COMP 1002'},
 {'group': u'1', 'id': u'COMP 1004'},
 {'group': u'1', 'id': u'COMP 1029C'},
 {'group': u'3', 'id': u'ISOM 3230'},
 {'group': u'3', 'id': u'ISOM 3320'},
 {'group': u'1', 'id': u'COMP 1029J'},
 {'group': u'1', 'id': u'COMP 1029P'},
 {'group': u'1', 'id': u'COMP 1029V'},
 {'group': u'1', 'id': u'COMP 1991'},
 {'group': u'1', 'id': u'COMP 1999'},
 {'group': u'2', 'id': u'COMP 2011'},
 {'group': u'2', 'id': u'COMP 2012'},
 {'group': u'2', 'id': u'COMP 2012H'},
 {'group': u'2', 'id': u'COMP 2521'},
 {'group': u'2', 'id': u'COMP 2611'},
 {'group': u'2', 'id': u'COMP 2711'},
 {'group': u'2', 'id': u'COMP 2711H'},
 {'group': u'1', 'id': u'MATH 1014'},
 {'group': u'1', 'id': u'MATH 1020'},
 {'group': u'1', 'id': u'MATH 1024'},
 {'group': u'3', 'id': u'COMP 3021'},
 {'

In [16]:
links_json

[{'source': u'COMP 1029A', 'target': u'COMP 1002', 'value': 1},
 {'source': u'COMP 1029A', 'target': u'COMP 1004', 'value': 1},
 {'source': u'COMP 1029A', 'target': u'COMP 1021', 'value': 1},
 {'source': u'COMP 1029A', 'target': u'COMP 1022P', 'value': 1},
 {'source': u'COMP 1029C', 'target': u'COMP 1021', 'value': 1},
 {'source': u'COMP 1029C', 'target': u'COMP 1022P', 'value': 1},
 {'source': u'COMP 1029C', 'target': u'COMP 1022Q', 'value': 1},
 {'source': u'COMP 1029C', 'target': u'ISOM 3230', 'value': 1},
 {'source': u'COMP 1029C', 'target': u'ISOM 3320', 'value': 1},
 {'source': u'COMP 1029J', 'target': u'COMP 1002', 'value': 1},
 {'source': u'COMP 1029J', 'target': u'COMP 1004', 'value': 1},
 {'source': u'COMP 1029J', 'target': u'COMP 1021', 'value': 1},
 {'source': u'COMP 1029J', 'target': u'COMP 1022Q', 'value': 1},
 {'source': u'COMP 1029J', 'target': u'ISOM 3230', 'value': 1},
 {'source': u'COMP 1029P', 'target': u'COMP 1002', 'value': 1},
 {'source': u'COMP 1029P', 'target':

In [17]:
import json
with open('comp_links.json', 'w') as outfile:
    json.dump({
        'nodes':nodes_json,
        'links':links_json    
    }, outfile)

# Generates

![alt text](images/js.png "Javascript")

In [18]:
print('closing driver...')
display.stop()
driver.quit()
print('closed!')

closing driver...


AttributeError: 'function' object has no attribute 'stop'