# Parse and create an oriented graph of COMP courses

The idea is to parse the course website and get prerequisite field for each course, not caring wether it's an AND or OR condition.

In [1]:
from time import sleep
from random import randint
from selenium import webdriver
from pyvirtualdisplay import Display

In [2]:
url = "https://w5.ab.ust.hk/wcq/cgi-bin/2010/subject/COMP"

In [3]:
print('starting driver...')
display = Display(visible=0, size=(800, 600))
display.start()
driver = webdriver.Chrome()
sleep(4)

starting driver...


In [4]:
print('getting page...')
driver.get(url)
sleep(randint(2,3))

getting page...


In [5]:
all_courses = []
links = []

for div in driver.find_elements_by_xpath('//div[@id="classes"]')[0].find_elements_by_class_name('course'):
    name = div.find_element_by_tag_name('h2').text
    description = ""
    prerequisite = ""
    dept = name.split(' ')[0]
#         print name
    courseinfo = div.find_elements_by_xpath('./*[@class="courseinfo"]/*[@class="courseattr popup"]/*[@class="popupdetail"]/table/tbody/tr')
#         print len(courseinfo)
    for info in courseinfo:
        if info.find_element_by_tag_name('th').get_attribute('innerHTML') == 'DESCRIPTION':
            description = info.find_element_by_tag_name('td').get_attribute('innerHTML')
        if info.find_element_by_tag_name('th').get_attribute('innerHTML') == 'PRE-REQUISITE':
            prerequisite = info.find_element_by_tag_name('td').get_attribute('innerHTML')
#                 print description
    current_course = {
        'name':name,
        'description':description,
        'dept':dept,
        'prerequisite':prerequisite
    }
    all_courses.append(current_course)
        

In [6]:
# Find prerequisites

### Open Problem

Find the Regex to match COMP ISOM or MATH course codes, I couldn't figure it out myself


#### Sample examples:

```
COMP 2011 OR COMP 2012 OR COMP 2012H
(COMP 3711 OR COMP 3711H) AND (MATH 2111)
COMP 1002 (prior to 2013-14) OR COMP 1004 (prior to 2013-14) OR COMP 1021 OR COMP 1022Q OR ISOM 3230
COMP 1021 OR COMP 1022P OR COMP 1022Q OR ISOM 3230
COMP 2611 OR [ELEC 2300 AND (COMP 1002 (prior to 2013-14) OR COMP 1004 (prior to 2013-14) OR COMP 2011 OR COMP 2012H)]
```

#### Sample outputs:

```python
["COMP 2011","COMP 2012","COMP 2012H"]
["COMP 3711","COMP 3711H","MATH 2111"]
["COMP 1002","COMP 1004","COMP 1021","COMP 1022Q","ISOM 3230"]
["COMP 1021","COMP 1022P","COMP 1022Q","ISOM 3230"]
["COMP 2611","ELEC 2300","COMP 1002","COMP 1004","COMP 2011","COMP 2012H"]
```

### Never mind, found one

Regex: `\w\w\w+ \d\w+`  
Check [https://regex101.com](https://regex101.com) and [http://regex.inginf.units.it](http://regex.inginf.units.it)

In [7]:
import re
p = re.compile('\w\w\w+ \d\w+')
html = "<style>.regex_match{color:red}</style>"

for course in all_courses:
    text = course['prerequisite']
    prerequisite_list = p.findall(course['prerequisite'])
    course['prerequisite_list'] = prerequisite_list
    for highlight in prerequisite_list:
        text = re.sub(str(highlight),r'<span class="regex_match">\g<0></span>',text)
    html+=text+'<br>'
#     print text

from IPython.core.display import display, HTML
display(HTML(html))
# small error with Honors courses, they come from the display, issue with "COMP1022Q" which is not labelled as the others.

In [8]:
nodes = []
links = []

for course in all_courses:
    name = course['name'].split(' - ')[0]
    nodes.append(name)
    for highlight in course['prerequisite_list']:
        links.append((name,highlight))
        if highlight not in nodes:
            nodes.append(highlight)

In [9]:
nodes

['COMP 1001',
 'COMP 1021',
 '',
 '',
 'COMP 1022P',
 'COMP 1022Q',
 'ISOM 3230',
 'ISOM 3320',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'COMP 2011',
 '',
 '',
 'COMP 2012H',
 '',
 '',
 'MATH 1014',
 'MATH 1020',
 'MATH 1024',
 '',
 'COMP 2012',
 '',
 '',
 '',
 '',
 '',
 '',
 'COMP 2611',
 'ELEC 2300',
 '',
 '',
 'COMP 2711',
 'COMP 2711H',
 'MATH 2343',
 '',
 '',
 'COMP 3711',
 'COMP 3711H',
 'MATH 2111',
 'MATH 2121',
 'MATH 2350',
 '',
 '',
 'ELEC 2600',
 'IEDA 2510',
 'IEDA 2520',
 'IEDA 2540',
 'ISOM 2500',
 'LIFS 3150',
 'MATH 2411',
 'MATH 2421',
 'MATH 2431',
 '',
 '',
 '',
 'MATH 2011',
 'MATH 2351',
 'MATH 2352',
 '',
 '',
 '',
 'COMP 3511',
 '',
 '',
 'MATH 2131',
 '',
 '',
 '',
 'COMP 4901O',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'MATH 2033',
 'MATH 4432',
 '',
 '',
 'COMP 6911',
 '',
 '',
 '']

In [10]:
links

[('', 'COMP 1021'),
 ('', 'COMP 1022P'),
 ('', 'COMP 1022Q'),
 ('', 'ISOM 3230'),
 ('', 'ISOM 3320'),
 ('', 'COMP 1021'),
 ('', 'COMP 1022Q'),
 ('', 'ISOM 3230'),
 ('', 'COMP 1022P'),
 ('', 'COMP 1022Q'),
 ('', 'ISOM 3230'),
 ('', 'ISOM 3320'),
 ('', 'COMP 1021'),
 ('', 'COMP 1022P'),
 ('', 'ISOM 3320'),
 ('', 'COMP 1021'),
 ('', 'COMP 1022P'),
 ('', 'COMP 1022Q'),
 ('', 'ISOM 3230'),
 ('', 'COMP 2011'),
 ('', 'COMP 1021'),
 ('', 'COMP 1022P'),
 ('', 'COMP 1022Q'),
 ('', 'ISOM 3230'),
 ('', 'COMP 2011'),
 ('', 'COMP 2012H'),
 ('', 'MATH 1014'),
 ('', 'MATH 1020'),
 ('', 'MATH 1024'),
 ('', 'COMP 2012'),
 ('', 'COMP 2012H'),
 ('', 'COMP 2012'),
 ('', 'COMP 2012H'),
 ('', 'COMP 2012'),
 ('', 'COMP 2012H'),
 ('', 'COMP 2012'),
 ('', 'COMP 2012H'),
 ('', 'COMP 2011'),
 ('', 'COMP 2012'),
 ('', 'COMP 2012H'),
 ('', 'COMP 2011'),
 ('', 'COMP 2012'),
 ('', 'COMP 2012H'),
 ('', 'COMP 2611'),
 ('', 'ELEC 2300'),
 ('', 'COMP 2011'),
 ('', 'COMP 2012H'),
 ('', 'COMP 2012'),
 ('', 'COMP 2012H'),
 

In [11]:
# import matplotlib.pyplot as plt
# import networkx as nx

# G=nx.Graph()
# G.add_nodes_from(nodes)
# pos=nx.spring_layout(G) # positions for all nodes

# # nodes
# nx.draw_networkx_nodes(G,pos,
#                        nodelist=nodes,
#                        node_color='r',
#                        node_size=800,
#                         alpha=0.8)

# # edges
# nx.draw_networkx_edges(G,pos,width=1.0,alpha=0.5)
# nx.draw_networkx_edges(G,pos,
#                        edgelist=links,
#                        width=8,alpha=0.5,edge_color='r')


# # # some math labels
# # labels={}
# # labels[0]=r'$a$'
# # labels[1]=r'$b$'
# # labels[2]=r'$c$'
# # labels[3]=r'$d$'
# # labels[4]=r'$\alpha$'
# # labels[5]=r'$\beta$'
# # labels[6]=r'$\gamma$'
# # labels[7]=r'$\delta$'
# # nx.draw_networkx_labels(G,pos,labels,font_size=16)

# plt.axis('off')
# plt.savefig("labels_and_colors.png") # save as png
# plt.show() # display

In [16]:
# G = nx.connected_watts_strogatz_graph(30,5,.2)
# nodes = G.nodes()
# edges = G.edges()
# # define the initial positions of the nodes using networkx's spring_layout function, and add to the nodes_dict.
import networkx as nx
import matplotlib.pyplot as plt
import visJS2jupyter
print('loading visJS2jupyter')
from visJS2jupyter import visJS_module
print('loaded visJS2jupyter')
G=nx.Graph()
G.add_nodes_from(nodes)
pos=nx.spring_layout(G) # positions for all nodes
pos = nx.spring_layout(G)
nodes_dict = [{"id":n,
              "x":pos[n][0]*10,
              "y":pos[n][1]*10} for n in nodes]
node_map = dict(zip(nodes,range(len(nodes))))  # map to indices for source/target in edges

edges_dict = [{"source":node_map[links[i][0]], "target":node_map[links[i][1]], 
              "title":'test'} for i in range(len(links))]

visJS2jupyter.visJS_module.visjs_network(nodes_dict,edges_dict)

KeyboardInterrupt: 

The previous block of code render the graph below:

![alt text](images/graph.png "Graph")

In [18]:
import pandas as pd
print(len(all_courses))
df = pd.DataFrame.from_dict(all_courses)
df

58


Unnamed: 0,name,description,dept,prerequisite,prerequisite_list
0,COMP 1001 - Exploring Multimedia and Internet ...,This course is an introduction to computers an...,COMP,,[]
1,COMP 1021 - Introduction to Computer Science (...,This course introduces students to the world o...,COMP,,[]
2,,This course is designed to equip students with...,,,[]
3,,This course introduces the C programming langu...,,COMP 1021 OR COMP 1022P OR COMP 1022Q (prior t...,"[COMP 1021, COMP 1022P, COMP 1022Q, ISOM 3230,..."
4,,This course introduces the Java programming la...,,COMP 1021 OR COMP 1022Q (prior to 2020-21) OR ...,"[COMP 1021, COMP 1022Q, ISOM 3230]"
5,,This course introduces the Python programming ...,,COMP 1022P OR COMP 1022Q (prior to 2020-21) OR...,"[COMP 1022P, COMP 1022Q, ISOM 3230, ISOM 3320]"
6,,This course introduces the VBA programming lan...,,COMP 1021 OR COMP 1022P OR ISOM 3320,"[COMP 1021, COMP 1022P, ISOM 3320]"
7,,An experiential common core course in creative...,,,[]
8,,Full-time internship training for a period of ...,,,[]
9,,This course covers programming and data struct...,,COMP 1021 OR COMP 1022P OR COMP 1022Q (prior t...,"[COMP 1021, COMP 1022P, COMP 1022Q, ISOM 3230]"


In [19]:
nodes_json = []
links_json = []

for node in nodes:
    current_node = {
        'id':node,
        'group':node[5]
    }
    nodes_json.append(current_node)
    
for link in links:
    current_link = {
        'source':link[0],
        'target':link[1],
        'value':1
    }
    links_json.append(current_link)

IndexError: string index out of range

In [20]:
nodes_json

[{'id': 'COMP 1001', 'group': '1'}, {'id': 'COMP 1021', 'group': '1'}]

In [21]:
links_json

[]

In [22]:
import json
with open('comp_links.json', 'w') as outfile:
    json.dump({
        'nodes':nodes_json,
        'links':links_json    
    }, outfile)

# Generates

![alt text](images/js.png "Javascript")

In [23]:
print('closing driver...')
global display
display.stop()
driver.quit()
print('closed!')

closing driver...


AttributeError: 'function' object has no attribute 'stop'