In [8]:
import json
import os
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import requests

from util import load_json, to_json, map_dict_value, flatten

def to_json_in(data_dir, file_name, obj):
    """Pass ``obj`` to ``to_json`` with the path ``data_dir/file_name``.

    Args:
        data_dir: Directory part of the output path.
        file_name: File name joined onto ``data_dir``.
        obj: Object handed to ``to_json`` (presumably JSON-serializable;
            confirm against ``util.to_json``).
    """
    target_path = os.path.join(data_dir, file_name)
    to_json(target_path, obj)

# References


- [re — Regular expression operations](https://docs.python.org/3/library/re.html)
- [Beautiful Soup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

# Specializations

Data Format:
```{json}
{
    "Computational Perception and Robotics": {
        "Core Courses": [
            {
                "Pick": 1,
                "Courses": [...]
            },
            {
                "Pick": 1,
                "Courses": [...]
            }
        ],
        "Electives": [
            {
                "Pick": 3,
                "Courses": [...]
            }
        ]
    }
}

```

Limitations:
- No information about constraints like "at least one from each sub-area" in Computational Perception and Robotics and Human-Computer Interaction
- No information about "OR" constraints in Computing Systems and Human-Computer Interaction


In [9]:
# Download the specializations page. The timeout keeps the cell from hanging
# forever, and raise_for_status() stops an error page from being parsed as data.
response = requests.get('https://www.cc.gatech.edu/ms-computer-science-specializations', timeout=30)
response.raise_for_status()
# Name the parser explicitly so the tree does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(response.text, 'html.parser')
main_content = soup.find('div', class_='gt-main-content')
# Container whose direct children are walked by the parsing cell.
div = main_content.find('div', class_='field') if main_content is not None else None
if div is None:
    raise RuntimeError('Specializations page layout changed: content container not found')

In [10]:
# Keys of every course group written to the output structure.
PICK, COURSES = 'Pick', 'Courses'

# Paragraph that starts a course-type section (matched at the start of the text).
core_electives = re.compile('Core Courses|Electives', flags=re.IGNORECASE)
# Paragraph that announces how many courses to choose from the next list.
pick_take = re.compile('Pick|Take', flags=re.IGNORECASE)
# Spelled-out counts recognised in a "pick"/"take" paragraph, and their values.
num = re.compile('one|two|three', flags=re.IGNORECASE)
word_to_num = dict(one=1, two=2, three=3)

# Walk the direct children of the content container and build
# {track name: {course type: [{PICK: n, COURSES: [...]}, ...]}}.
tracks = {}
track = {}
course_subset = []
# Group that the next <ul> fills. Initialised explicitly so a fresh kernel does
# not hit a NameError and a re-run does not pick up a stale `courses` object
# left behind by a later cell.
courses = None
# Placeholder count meaning "every course in the following list is required".
TAKE_ALL = -1

for child in div.children:
    text = child.text.strip()
    # A new track
    if child.name == 'h4':
        track = {}
        tracks[text] = track
    elif child.name == 'p':
        # A new course type (core or elective)
        if header := core_electives.match(text):
            course_type = header[0].title()
            course_subset = []
            track[course_type] = course_subset
            # Look ahead at the next two tags (bare strings between tags are
            # skipped). If neither says "pick"/"take", the list that follows
            # is required in full, so open a group with the placeholder count.
            upcoming = [sib for sib in child.next_siblings if sib.name][:2]
            if not any(pick_take.search(sib.text) for sib in upcoming):
                courses = {PICK: TAKE_ALL, COURSES: []}
                course_subset.append(courses)
        # A new course subset
        elif pick_take.search(text):
            # Extract the number of courses to pick from the subset
            number_word = num.search(text)
            if number_word is None:
                raise ValueError(f'Cannot tell how many courses to pick from: {text!r}')
            courses = {PICK: word_to_num[number_word[0].lower()], COURSES: []}
            course_subset.append(courses)
        # Edge case: a standalone "or" paragraph between lists means only one
        # of the alternatives is needed.
        elif text.lower() == 'or' and courses is not None:
            courses[PICK] = 1

    # Course list; lists that appear before any group has been opened are ignored.
    elif child.name == 'ul' and courses is not None:
        items = child.find_all('li')
        for li in items:
            course = re.sub(r'\s+', ' ', li.text.strip())
            # "A OR B" in one bullet lists alternatives; record each of them.
            courses[COURSES].extend(course.split(' OR '))
        # No explicit count was given: every bullet has to be taken.
        if courses[PICK] == TAKE_ALL:
            courses[PICK] = len(items)

# Exclude two tracks from the saved data (presumably not part of the program
# being analysed; confirm). pop() with a default keeps the cell working if the
# page stops listing them, where `del` would raise KeyError.
for excluded_track in ('Human Centered Computing', 'Interactive Intelligence'):
    tracks.pop(excluded_track, None)
to_json_in('data', 'tracks.json', tracks)

# Free Courses

In [11]:
# Download the Udacity listing. The timeout keeps the cell from hanging forever,
# and raise_for_status() stops an error page from being parsed as data.
response = requests.get('https://www.udacity.com/georgia-tech', timeout=30)
response.raise_for_status()
# Name the parser explicitly so the tree does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(response.text, 'html.parser')
course_list = soup.find('ul', class_='courses')
if course_list is None:
    raise RuntimeError('Udacity page layout changed: course list not found')
# One <li> per listed course.
lis = course_list.find_all('li')

In [12]:

# Map the text of each item's second link to that link's absolute URL.
free_courses = {}
for li in lis:
    anchors = li.find_all('a')
    # The second anchor of each item is the one used; skip items without it
    # instead of failing with IndexError.
    if len(anchors) < 2:
        continue
    a = anchors[1]
    # urljoin copes with relative, root-relative and absolute hrefs; plain
    # string concatenation produced "//" for hrefs starting with "/".
    free_courses[a.text.strip()] = urljoin('https://www.udacity.com/', a['href'])

# Data Analysis

In [13]:
# Group keys, redefined with the same values the scraping cell uses.
PICK, COURSES = 'Pick', 'Courses'

# Read back the track data saved to data/tracks.json.
tracks = load_json(os.path.join('data', 'tracks.json'))

In [14]:
# track -> course type -> course names gathered from every group of that type.
# NOTE(review): assumes map_dict_value(d, f, *args) returns a dict with f(value, *args)
# applied to each value of d, and that flatten() concatenates one level of nesting;
# confirm both in util.
track_to_type_courses = map_dict_value(tracks, map_dict_value, lambda v: flatten([e[COURSES] for e in v]))
# track -> course names gathered from all groups of all course types.
track_to_courses = map_dict_value(tracks, lambda v: flatten([e[COURSES] for e in flatten(v.values())]))
# Every listed course name across all tracks, as one Series (repeats are kept).
courses = pd.Series(flatten(track_to_courses.values()))

Does each track have two course types (core and elective)?

In [15]:
# One row per track with len() of that track's entry in `tracks` (its number of course types).
pd.DataFrame(map_dict_value(tracks, len), index=['Course Types']).T

Unnamed: 0,Course Types
Computational Perception and Robotics,2
Computer Graphics,2
Computing Systems,2
High Performance Computing,2
Human-Computer Interaction,2
Machine Learning,2
Modeling and Simulations,2
Scientific Computing,2
Social Computing,2
Visual Analytics,2


How many courses can you choose from for each track?

In [16]:
# Rows are tracks: one column per course type with the length of its course list...
counts_by_type = pd.DataFrame(map_dict_value(track_to_type_courses, map_dict_value, len)).T
# ...and a 'Total' column with the length of the track's combined course list.
counts_total = pd.DataFrame(map_dict_value(track_to_courses, len), index=['Total']).T

# Align the two tables on the track name and save the result (transposed) as JSON.
track_type_course_count = counts_by_type.merge(counts_total, left_index=True, right_index=True)
track_type_course_count.T.to_json(os.path.join('data', 'track_type_course_count.json'), indent=4)
track_type_course_count

Unnamed: 0,Core Courses,Electives,Total
Computational Perception and Robotics,9,13,22
Computer Graphics,5,8,13
Computing Systems,11,25,36
High Performance Computing,2,7,9
Human-Computer Interaction,3,19,22
Machine Learning,10,22,32
Modeling and Simulations,4,7,11
Scientific Computing,3,8,11
Social Computing,3,16,19
Visual Analytics,3,8,11


How many unique courses are there in the whole program?

In [17]:
# Number of distinct course names across all tracks.
courses.nunique(dropna=False)

119

In [18]:
# The 20 course names that are listed most often, with their number of listings.
courses.value_counts().head(20)

CS 6505 Computability, Algorithms, and Complexity                                 5
CSE 6220 High Performance Computing                                               5
CS 6515 Introduction to Graduate Algorithms                                       5
CSE 6140 Computational Science and Engineering Algorithms                         4
CS 7280 Network Science                                                           4
CS 7450 Information Visualization                                                 4
MATH 6640 Introduction to Numerical Methods for Partial Differential Equations    4
CS 7650 Natural Language                                                          3
CS 6750 Human-Computer Interaction                                                3
CS 6456 Principles of User Interface Software                                     3
CS 6730 Data Visualization: Principles & Applications                             3
CS 7451 Human-Centered Data Analysis                                        

In [19]:
# Convert each track's course list to a set once instead of inside the nested loop.
track_course_sets = {name: set(names) for name, names in track_to_courses.items()}

# track A -> track B -> courses listed by both (empty when A and B are the same track).
# The overlap is sorted so the saved JSON is identical from run to run; iterating
# a set of strings gives an arbitrary order.
track_overlap_courses = {
    track_a: {
        track_b: sorted(courses_a & courses_b) if track_a != track_b else []
        for track_b, courses_b in track_course_sets.items()
    }
    for track_a, courses_a in track_course_sets.items()
}
to_json_in('data', 'track_overlap_courses.json', track_overlap_courses)

In [20]:
# Track-by-track table with the number of shared courses in each cell (0 on the diagonal).
pd.DataFrame(map_dict_value(track_overlap_courses, map_dict_value, len))

Unnamed: 0,Computational Perception and Robotics,Computer Graphics,Computing Systems,High Performance Computing,Human-Computer Interaction,Machine Learning,Modeling and Simulations,Scientific Computing,Social Computing,Visual Analytics
Computational Perception and Robotics,0,4,3,1,0,11,0,1,3,1
Computer Graphics,4,0,2,0,2,3,0,0,3,1
Computing Systems,3,2,0,3,0,4,2,1,8,0
High Performance Computing,1,0,3,0,0,1,2,4,0,0
Human-Computer Interaction,0,2,0,0,0,0,0,0,8,7
Machine Learning,11,3,4,1,0,0,1,1,4,2
Modeling and Simulations,0,0,2,2,0,1,0,2,1,0
Scientific Computing,1,0,1,4,0,1,2,0,0,0
Social Computing,3,3,8,0,8,4,1,0,0,6
Visual Analytics,1,1,0,0,7,2,0,0,6,0


In [21]:
# Program courses whose name contains the name of at least one free course.
# NOTE(review): this is a plain substring test, so a short free-course name
# also matches longer, different course names; check whether that is intended.
matched_courses = [
    course
    for course in set(courses)
    if any(free_course in course for free_course in free_courses)
]
for course in matched_courses:
    print(course)

count = len(matched_courses)
count

CS 7646 Machine Learning for Trading
CS 7545 Machine Learning Theory
CS 6300 Software Development Process
CS 7642 Reinforcement Learning and Decision Making
CS 6460 Educational Technology: Conceptual Foundations
CS 6035 Introduction to Information Security
CS 6750 Human-Computer Interaction
CS 6515 Introduction to Graduate Algorithms
CSE 6220 High Performance Computing
CS 7465 Educational Technology: Design and Evaluation
CS 6475 Computational Photography
CS 7641 Machine Learning
CS 6200 Graduate Introduction to Operating Systems
CS 7638 Artificial Intelligence Techniques for Robotics
CS 6601 Artificial Intelligence
CS 7644 Machine Learning for Robotics
CS 6210 Advanced Operating Systems
CS 6262 Network Security


18