In [1]:
import json
import re

from bs4 import BeautifulSoup
import pandas as pd
import requests

# References
- [re — Regular expression operations](https://docs.python.org/3/library/re.html)
- [Beautiful Soup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

# Free Courses

In [2]:
# Fetch the Udacity Georgia Tech page and collect the <li> entries of its course list.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() reports an HTTP error directly instead of letting an error page
# be parsed as if it were the course list.
response = requests.get('https://www.udacity.com/georgia-tech', timeout=30)
response.raise_for_status()
# NOTE(review): no parser is named here, so Beautiful Soup chooses among the parsers
# installed in the environment; consider pinning one for reproducible results.
soup = BeautifulSoup(response.text)
courses = soup.find('ul', class_='courses').find_all('li')

In [3]:
# Spot-check the scrape: display the whitespace-trimmed text of the first <li> found above.
courses[0].text.strip()

'Artificial Intelligence for Robotics'

# Specializations


Data Format:
```{json}
{
    "Computational Perception and Robotics": {
        "Core Courses": [
            {
                "Pick": 1,
                "Courses": [...]
            },
            {
                "Pick": 1,
                "Courses": [...]
            }
        ],
        "Electives": [
            {
                "Pick": 3,
                "Courses": [...]
            }
        ]
    }
}

```

Limitations:
- No information about constraints like "at least one from each sub-area" in Computational Perception and Robotics and Human-Computer Interaction
- No information about "OR" constraints in Computing Systems and Human-Computer Interaction


In [4]:
# Fetch the MS CS specializations page and locate the content container whose direct
# children are walked by the parsing cell below.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() reports an HTTP error directly instead of letting an error page
# be parsed as if it were the specialization list.
response = requests.get('https://www.cc.gatech.edu/ms-computer-science-specializations', timeout=30)
response.raise_for_status()
# NOTE(review): no parser is named here, so Beautiful Soup chooses among the parsers
# installed in the environment; the sibling-hopping logic that consumes `div` depends on
# how whitespace nodes are produced, so consider pinning one parser.
soup = BeautifulSoup(response.text)
div = soup.find('div', class_='gt-main-content').find('div', class_='field')

In [5]:
# Parse the specialization page into
#   {track name: {"Core Courses" | "Electives": [{"Pick": n, "Courses": [...]}, ...]}}
# by walking the direct children of `div` in document order:
#   <h4>  starts a new track;
#   <p>   is a course-type heading, a "pick/take <number>" instruction, or a lone "or";
#   <ul>  lists the courses that belong to the most recently opened group.
core_electives = re.compile('Core Courses|Electives', re.I)
pick_take = re.compile('Pick|Take', re.I)
num = re.compile('one|two|three', re.I)
word_to_num = {
    'one': 1,
    'two': 2,
    'three': 3,
}
PICK = 'Pick'
COURSES = 'Courses'
# Placeholder PICK value for a group that has no explicit "pick/take <number>"
# instruction; it is replaced by the number of list items once the group's <ul> is read.
TAKE_ALL = -1

tracks = {}
track = {}
course_subset = []
# Group currently being filled. Kept under its own name so this cell neither reads nor
# overwrites the unrelated `courses` variable used by other cells.
course_group = None

for child in div.children:
    text = child.text.strip()
    # A new track
    if child.name == 'h4':
        track = {}
        tracks[text] = track
    elif child.name == 'p':
        heading = core_electives.match(text)
        # A new course type (core or elective)
        if heading:
            course_type = heading[0].title()
            course_subset = []
            track[course_type] = course_subset
            # Look ahead at the next two elements. Each hop is two `next_sibling` steps,
            # which assumes exactly one text node sits between neighbouring elements
            # (TODO confirm against the parser in use).
            following = child.next_sibling.next_sibling
            after_following = following.next_sibling.next_sibling
            # If neither mentions "pick" or "take", open a default group whose PICK is
            # filled in from the size of its course list.
            if not pick_take.search(following.text) and not pick_take.search(after_following.text):
                course_group = {PICK: TAKE_ALL, COURSES: []}
                course_subset.append(course_group)
        # A new course subset
        elif pick_take.search(text):
            # Extract how many courses must be picked from the subset
            number_match = num.search(text)
            if number_match is None:
                # Fail with the offending text rather than an opaque subscript error
                raise ValueError(f'No supported course count (one/two/three) in: {text!r}')
            course_group = {PICK: word_to_num[number_match[0].lower()], COURSES: []}
            course_subset.append(course_group)
        # Edge case: a paragraph consisting only of "or" turns the current group into
        # pick-one (presumably the lists on either side are alternatives; TODO confirm).
        elif text.lower() == 'or':
            course_group[PICK] = 1

    # Course list
    elif child.name == 'ul':
        lis = child.find_all('li')
        for li in lis:
            course = re.sub(r'\s+', ' ', li.text.strip())
            # An "A OR B" item is recorded as separate courses; an item without ' OR '
            # splits into a one-element list, so it is added unchanged.
            course_group[COURSES].extend(course.split(' OR '))
        # If it does not say take a certain number of courses, all need to be taken
        if course_group[PICK] == TAKE_ALL:
            course_group[PICK] = len(lis)

# Drop two tracks from the result before saving (the reason is not recorded in this notebook).
del tracks['Human Centered Computing']
del tracks['Interactive Intelligence']
with open('data.json', 'w') as f:
    json.dump(tracks, f, indent=4)

# Sanity Check

In [6]:
def map_dict_value(d, func, *args):
    """Build a new dict with the keys of ``d`` and each value passed through ``func``.

    Args:
        d: Mapping to transform; it is not modified.
        func: Callable invoked as ``func(value, *args)`` for every value.
        *args: Extra positional arguments forwarded to ``func`` after the value.

    Returns:
        dict: ``{key: func(d[key], *args)}`` for every key of ``d``, in iteration order.
    """
    return {key: func(d[key], *args) for key in d}

def flatten(l):
    """Recursively flatten nested lists into a single flat list.

    Only ``list`` instances (including subclasses of ``list``) are unpacked; every
    other item, such as a tuple, string or dict, is kept as a single element.

    Args:
        l: Iterable whose items are plain values or arbitrarily nested lists.
            Any iterable works at the top level (for example ``dict.values()``).

    Returns:
        list: The non-list items in depth-first, left-to-right order.
    """
    flat = []
    for e in l:
        # isinstance, unlike an exact type comparison, also unpacks list subclasses
        if isinstance(e, list):
            flat.extend(flatten(e))
        else:
            flat.append(e)
    return flat

In [7]:
# Number of course-type entries recorded for each track
course_types_per_track = map_dict_value(tracks, len)
pd.Series(course_types_per_track)

Computational Perception and Robotics    2
Computer Graphics                        2
Computing Systems                        2
High Performance Computing               2
Human-Computer Interaction               2
Machine Learning                         2
Modeling and Simulations                 2
Scientific Computing                     2
Social Computing                         2
Visual Analytics                         2
dtype: int64

In [8]:
def _courses_in(groups):
    """Concatenate the course lists of a sequence of ``{PICK, COURSES}`` groups."""
    return flatten([group[COURSES] for group in groups])


# {track: {course type: flat list of the courses listed under that type}}
track_to_type_courses = map_dict_value(tracks, map_dict_value, _courses_in)

In [9]:
# Course counts with one row per track and one column per course type
type_course_counts = map_dict_value(track_to_type_courses, map_dict_value, len)
pd.DataFrame(type_course_counts).T

Unnamed: 0,Core Courses,Electives
Computational Perception and Robotics,9,13
Computer Graphics,5,8
Computing Systems,11,25
High Performance Computing,2,7
Human-Computer Interaction,3,19
Machine Learning,10,22
Modeling and Simulations,4,7
Scientific Computing,3,8
Social Computing,3,16
Visual Analytics,3,8


In [10]:
def _all_courses_of(track_groups):
    """Concatenate the courses of every group of one track, across all course types.

    Duplicates are kept: a course listed in more than one group appears more than once.
    """
    groups = flatten(track_groups.values())
    return flatten([group[COURSES] for group in groups])


# {track: flat list of every course listed for that track}
track_to_courses = map_dict_value(tracks, _all_courses_of)

In [11]:
# Total number of course listings per track, shown as a single-column table
total_course_counts = map_dict_value(track_to_courses, len)
pd.DataFrame(total_course_counts, index=['Total Courses']).T

Unnamed: 0,Total Courses
Computational Perception and Robotics,22
Computer Graphics,13
Computing Systems,36
High Performance Computing,9
Human-Computer Interaction,22
Machine Learning,32
Modeling and Simulations,11
Scientific Computing,11
Social Computing,19
Visual Analytics,11


In [12]:
# Every course listing from every track in one Series (a course shared by several
# tracks appears once per listing), followed by the total number of listings.
all_course_listings = flatten(track_to_courses.values())
courses = pd.Series(all_course_listings)
len(courses)

186

In [13]:
# The 20 most frequently listed courses across all tracks
courses.value_counts().head(20)

CS 6505 Computability, Algorithms, and Complexity                                 5
CSE 6220 High Performance Computing                                               5
CS 6515 Introduction to Graduate Algorithms                                       5
CSE 6140 Computational Science and Engineering Algorithms                         4
CS 7280 Network Science                                                           4
CS 7450 Information Visualization                                                 4
MATH 6640 Introduction to Numerical Methods for Partial Differential Equations    4
CS 7650 Natural Language                                                          3
CS 6750 Human-Computer Interaction                                                3
CS 6456 Principles of User Interface Software                                     3
CS 6730 Data Visualization: Principles & Applications                             3
CS 7451 Human-Centered Data Analysis                                        

In [14]:
# One [track_a, track_b, shared-course count] row for every ordered pair of tracks;
# a track paired with itself is recorded as 0.
course_sets = {name: set(listing) for name, listing in track_to_courses.items()}

l = []
for track_a, set_a in course_sets.items():
    for track_b, set_b in course_sets.items():
        common = len(set_a & set_b) if track_a != track_b else 0
        l.append([track_a, track_b, common])

In [15]:
# Reshape the long list of pairs into a track-by-track matrix of shared-course counts
overlap_long = pd.DataFrame(l, columns=['Track_a', 'Track_b', 'Intersections'])
overlap_long.pivot(index='Track_b', columns='Track_a', values='Intersections')

Track_a,Computational Perception and Robotics,Computer Graphics,Computing Systems,High Performance Computing,Human-Computer Interaction,Machine Learning,Modeling and Simulations,Scientific Computing,Social Computing,Visual Analytics
Track_b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Computational Perception and Robotics,0,4,3,1,0,11,0,1,3,1
Computer Graphics,4,0,2,0,2,3,0,0,3,1
Computing Systems,3,2,0,3,0,4,2,1,8,0
High Performance Computing,1,0,3,0,0,1,2,4,0,0
Human-Computer Interaction,0,2,0,0,0,0,0,0,8,7
Machine Learning,11,3,4,1,0,0,1,1,4,2
Modeling and Simulations,0,0,2,2,0,1,0,2,1,0
Scientific Computing,1,0,1,4,0,1,2,0,0,0
Social Computing,3,3,8,0,8,4,1,0,0,6
Visual Analytics,1,1,0,0,7,2,0,0,6,0
