In [1]:
import os
import re

from bs4 import BeautifulSoup
import pandas as pd
import requests

from util import load_json_in, to_json_in, map_dict_value, flatten

# References


- [re — Regular expression operations](https://docs.python.org/3/library/re.html)
- [Beautiful Soup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

# Specializations

Data Format:
```{json}
{
    "Computational Perception and Robotics": {
        "Core Courses": [
            {
                "Pick": 1,
                "Courses: [...]
            },
            {
                "Pick": 1,
                "Courses: [...]
            }
        ],
        "Electives": [
            {
                "Pick", 3,
                "Courses": [...]
            }
        ]
    }
}

```

Limitations:
- No information about constraints like "at least one from each sub-area" in Computational Perception and Robotics and Human-Computer Interaction
- No information about "OR" constraints in Computing Systems and Human-Computer Interaction


In [2]:
response = requests.get('https://www.cc.gatech.edu/ms-computer-science-specializations')
soup = BeautifulSoup(response.text)
div = soup.find('div', class_='gt-main-content').find('div', class_='field')

In [3]:
core_electives = re.compile('Core Courses|Electives', re.I)
pick_take = re.compile('Pick|Take', re.I)
num = re.compile('one|two|three', re.I)
word_to_num = {
    'one': 1,
    'two': 2,
    'three': 3,
}
PICK = 'Pick'
COURSES = 'Courses'

tracks = {}
track = {}
course_subset = []

for child in div.children:
    text = child.text.strip()
    # A new track
    if child.name == 'h4':
        track = {}
        tracks[text] = track
    elif child.name == 'p':
        # A new course type (core or elective)
        if match := re.match(core_electives, text):
            course_type = match[0].title()
            course_subset = []
            track[course_type] = course_subset
            # If the next or next next text do not contain "pick" or "take"
            next = child.next_sibling.next_sibling
            next_next = next.next_sibling.next_sibling
            if not re.search(pick_take, next.text) and not re.search(pick_take, next_next.text):
                courses = {PICK: -1, COURSES: []}
                course_subset.append(courses)
        # A new course subset
        elif match := re.search(pick_take, text):
            # Extract the number of courses need to pick from the subset
            number_in_word = re.search(num, text)[0].lower()
            pick = word_to_num[number_in_word]
            courses = {PICK: pick, COURSES: []}
            course_subset.append(courses)
        # Handle an edge case
        elif text.lower() == 'or':
            courses[PICK] = 1
        
    # Course List
    elif child.name == 'ul':
        lis = child.find_all('li')
        for li in child.find_all('li'):
            course = re.sub(r'\s+', ' ', li.text.strip())
            if ' OR ' in course:
                for c in course.split(' OR '):
                    courses[COURSES].append(c)
            else:
                courses[COURSES].append(course)
        # If it does not say take a certain number of courses, all need to be taken
        if courses[PICK] == -1:
            courses[PICK] = len(lis)

del tracks['Human Centered Computing']
del tracks['Interactive Intelligence']
to_json_in('data', 'tracks.json', tracks)

# Free Courses

In [4]:
response = requests.get('https://www.udacity.com/georgia-tech')
soup = BeautifulSoup(response.text)
lis = soup.find('ul', class_='courses').find_all('li')

In [5]:

free_course_to_link = {}
for li in lis:
    a = li.find_all('a')[1]
    href = a['href']
    free_course_to_link[a.text.strip()] = f'https://www.udacity.com/{href}'

# OMSCS Courses

In [6]:
response = requests.get('https://omscs.gatech.edu/current-courses')
soup = BeautifulSoup(response.text)
div = soup.find('div', class_='body')
lis = div.find_all('li')
as_ = [li.a for li in lis]

In [7]:
omscs_course_to_link = {}
for a in as_:
    course = re.sub(r'\s+', ' ', a.text.strip())
    course = re.sub(':', '', a.text.strip())
    omscs_course_to_link[course] = a['href']

In [8]:
len(omscs_course_to_link)

61

# Data Analysis

In [9]:
PICK = 'Pick'
COURSES = 'Courses'

tracks = load_json_in('data', 'tracks.json')

In [10]:
track_to_type_courses = map_dict_value(tracks, map_dict_value, lambda v: flatten([e[COURSES] for e in v]))
track_to_courses = map_dict_value(tracks, lambda v: flatten([e[COURSES] for e in flatten(v.values())]))


In [11]:
courses = flatten(track_to_courses.values())

all_courses = list(set(courses))
all_courses = [course for course in all_courses if not re.search('Any|Substitutions', course)]

Does each track have two course types (core and elective)?

In [12]:
pd.DataFrame(map_dict_value(tracks, len), index=['Course Types']).T

Unnamed: 0,Course Types
Computational Perception and Robotics,2
Computer Graphics,2
Computing Systems,2
High Performance Computing,2
Human-Computer Interaction,2
Machine Learning,2
Modeling and Simulations,2
Scientific Computing,2
Social Computing,2
Visual Analytics,2


How many courses can you choose from for each track?

In [13]:
track_type_course_count = pd.merge(
    pd.DataFrame(map_dict_value(track_to_type_courses, map_dict_value, len)).T,
    pd.DataFrame(map_dict_value(track_to_courses, len), index=['Total']).T,
    left_index=True,
    right_index=True,
)
track_type_course_count.T.to_json(os.path.join('data', 'track_type_course_count.json'), indent=4)
track_type_course_count

Unnamed: 0,Core Courses,Electives,Total
Computational Perception and Robotics,9,13,22
Computer Graphics,5,8,13
Computing Systems,11,25,36
High Performance Computing,2,7,9
Human-Computer Interaction,3,19,22
Machine Learning,10,22,32
Modeling and Simulations,4,7,11
Scientific Computing,3,8,11
Social Computing,3,16,19
Visual Analytics,3,8,11


How many unique courses in the whole program?

In [14]:
to_json_in('data', 'all_courses.json', sorted(all_courses))
len(all_courses)

116

What the number of overlapping courses between tracks?

In [15]:
track_overlap_courses = {}
for track_a, courses_a in track_to_courses.items():
    track_overlap_courses[track_a] = {}
    for track_b, courses_b in track_to_courses.items():
        overlap_courses = list(set(courses_a) & set(courses_b)) if track_a != track_b else []
        track_overlap_courses[track_a][track_b] = sorted(overlap_courses)

to_json_in('data', 'track_overlap_courses.json', track_overlap_courses)
pd.DataFrame(map_dict_value(track_overlap_courses, map_dict_value, len))

Unnamed: 0,Computational Perception and Robotics,Computer Graphics,Computing Systems,High Performance Computing,Human-Computer Interaction,Machine Learning,Modeling and Simulations,Scientific Computing,Social Computing,Visual Analytics
Computational Perception and Robotics,0,4,3,1,0,11,0,1,3,1
Computer Graphics,4,0,2,0,2,3,0,0,3,1
Computing Systems,3,2,0,3,0,4,2,1,8,0
High Performance Computing,1,0,3,0,0,1,2,4,0,0
Human-Computer Interaction,0,2,0,0,0,0,0,0,8,7
Machine Learning,11,3,4,1,0,0,1,1,4,2
Modeling and Simulations,0,0,2,2,0,1,0,2,1,0
Scientific Computing,1,0,1,4,0,1,2,0,0,0
Social Computing,3,3,8,0,8,4,1,0,0,6
Visual Analytics,1,1,0,0,7,2,0,0,6,0


In [16]:
course_to_free_course = {
    "CS 6200 Graduate Introduction to Operating Systems": "Introduction to Operating Systems",
    "CS 6300 Software Development Process": "Software Development Process",
    "CS 7642 Reinforcement Learning and Decision Making": "Reinforcement Learning",
    "CS 6250 Computer Networks": "Computer Networking",
    "CS 6515 Introduction to Graduate Algorithms": "Introduction to Graduate Algorithms",
    "CS 6505 Computability, Algorithms, and Complexity": "Computability, Complexity & Algorithms",
    "CS 6400 Database Systems Concepts and Designs": "Database Systems Concepts & Design",
    "CS 6460 Educational Technology: Conceptual Foundations": "Educational Technology",
    "CS 6750 Human-Computer Interaction": "Human-Computer Interaction",
    "CS 6476 Computer Vision": "Introduction to Computer Vision",
    "CSE 6220 High Performance Computing": "High Performance Computing",
    "CS 6601 Artificial Intelligence": "Artificial Intelligence",
    "CS 6210 Advanced Operating Systems": "Advanced Operating Systems",
    "CS 7638 Artificial Intelligence Techniques for Robotics": "Artificial Intelligence for Robotics",
    "CS 6310 Software Architecture and Design": "Software Architecture & Design",
    "CS 7641 Machine Learning": "Machine Learning",
    "CS 6475 Computational Photography": "Computational Photography",
    "CS 6262 Network Security": "Network Security",
    "CSE 6250 Big Data for Health": "Big Data Analytics in Healthcare",
    "CS 6290 High-Performance Computer Architecture": "High Performance Computer Architecture",
    "CS 6340 Software Analysis and Testing": "Software Analysis & Testing",
    "CS 7646 Machine Learning for Trading": "Machine Learning for Trading",
    "CS 6035 Introduction to Information Security": "Introduction to Information Security",
    "CSE 6242 Data and Visual Analytics": "Data Analysis and Visualization"
}
# to_json_in('data', 'course_to_free_course.json', course_to_free_course)

In [17]:
[
    'Knowledge-Based AI: Cognitive Systems',
    'Linear Algebra Refresher Course',
    'GT - Refresher - Advanced OS',
    'Compilers: Theory and Practice',
    'Embedded Systems',
    'Cyber-Physical Systems Security',
    'Machine Learning: Unsupervised Learning'
]

['Knowledge-Based AI: Cognitive Systems',
 'Linear Algebra Refresher Course',
 'GT - Refresher - Advanced OS',
 'Compilers: Theory and Practice',
 'Embedded Systems',
 'Cyber-Physical Systems Security',
 'Machine Learning: Unsupervised Learning']

In [18]:
course_to_free_course_info = pd.merge(
    pd.Series(course_to_free_course, name='Free Course'),
    pd.Series(free_course_to_link, name='Free Course Link'),
    left_on='Free Course',
    right_index=True,
)
course_to_free_course_info

Unnamed: 0,Free Course,Free Course Link
CS 6200 Graduate Introduction to Operating Systems,Introduction to Operating Systems,https://www.udacity.com/course/introduction-to...
CS 6300 Software Development Process,Software Development Process,https://www.udacity.com/course/software-develo...
CS 7642 Reinforcement Learning and Decision Making,Reinforcement Learning,https://www.udacity.com/course/reinforcement-l...
CS 6250 Computer Networks,Computer Networking,https://www.udacity.com/course/computer-networ...
CS 6515 Introduction to Graduate Algorithms,Introduction to Graduate Algorithms,https://www.udacity.com/course/introduction-to...
"CS 6505 Computability, Algorithms, and Complexity","Computability, Complexity & Algorithms",https://www.udacity.com/course/computability-c...
CS 6400 Database Systems Concepts and Designs,Database Systems Concepts & Design,https://www.udacity.com/course/database-system...
CS 6460 Educational Technology: Conceptual Foundations,Educational Technology,https://www.udacity.com/course/educational-tec...
CS 6750 Human-Computer Interaction,Human-Computer Interaction,https://www.udacity.com/course/human-computer-...
CS 6476 Computer Vision,Introduction to Computer Vision,https://www.udacity.com/course/introduction-to...


In [19]:
all_course_df = pd.DataFrame(all_courses, columns=['Course'])
omscs_course_to_link_df = pd.DataFrame(omscs_course_to_link, index=['OMSCS Course Link']).T
omscs_course_to_link_df = omscs_course_to_link_df.reset_index(names='Course')

In [20]:
all_course_df['Course Code'] = all_course_df['Course'].apply(lambda e: ' '.join(e.split(' ')[:2]))
all_course_df

Unnamed: 0,Course,Course Code
0,CSE 8803 Special Topics: Algorithms for Medica...,CSE 8803
1,CS 6400 Database Systems Concepts and Designs,CS 6400
2,CS 7470 Mobile and Ubiquitous Computing,CS 7470
3,CS 6475 Computational Photography,CS 6475
4,CS 7650 Natural Language,CS 7650
...,...,...
111,ISYE 6664 Stochastic Optimization,ISYE 6664
112,CS 6460 Educational Technology: Conceptual Fou...,CS 6460
113,CS 6422 Database System Implementation,CS 6422
114,CSE 6730 Modeling and Simulation: Foundations ...,CSE 6730


In [21]:
omscs_course_to_link_df['Course Code'] = omscs_course_to_link_df['Course'].apply(lambda e: ' '.join(e.split(' ')[:2]))
omscs_course_to_link_df

Unnamed: 0,Course,OMSCS Course Link,Course Code
0,CS 6035 Introduction to Information Security,https://omscs.gatech.edu/cs-6035-introduction-...,CS 6035
1,CS 6150 Computing for Good,https://omscs.gatech.edu/cs-6150-computing-good,CS 6150
2,CS 6200 Introduction to Operating Systems,https://omscs.gatech.edu/cs-6200-introduction-...,CS 6200
3,CS 6210 Advanced Operating Systems,https://omscs.gatech.edu/cs-6210-advanced-oper...,CS 6210
4,CS 6211 System Design for Cloud Computing,https://omscs.gatech.edu/cs-6211-system-design...,CS 6211
...,...,...,...
56,ISYE 6644 Simulation and Modeling for Engineer...,https://omscs.gatech.edu/isye-6644-simulation-...,ISYE 6644
57,ISYE 6669 Deterministic Optimization,https://omscs.gatech.edu/isye-6669-determinist...,ISYE 6669
58,ISYE 8803 Topics on High-Dimensional Data Anal...,https://omscs.gatech.edu/isye-8803-topics-high...,ISYE 8803
59,MGT 6311 Digital Marketing,https://omscs.gatech.edu/mgt-6311-digital-mark...,MGT 6311


In [22]:
course_to_link = pd.merge(
    all_course_df,
    omscs_course_to_link_df,
    left_on='Course Code',
    right_on='Course Code',
    how='left',
    suffixes=('', ' OMSCS')
)
course_to_link = course_to_link.query('`Course Code` != "CS 8803"')
course_to_link = course_to_link.set_index('Course').rename(columns={'Course OMSCS': 'OMSCS Course'})
course_to_link

Unnamed: 0_level_0,Course Code,OMSCS Course,OMSCS Course Link
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CSE 8803 Special Topics: Algorithms for Medical Imaging and Inverse Problems,CSE 8803,,
CS 6400 Database Systems Concepts and Designs,CS 6400,CS 6400 Database Systems Concepts and Design,https://omscs.gatech.edu/cs-6400-database-syst...
CS 7470 Mobile and Ubiquitous Computing,CS 7470,CS 7470 Mobile & Ubiquitous Computing,https://omscs.gatech.edu/cs-7470-mobile-ubiqui...
CS 6475 Computational Photography,CS 6475,CS 6475 Computational Photography,https://omscs.gatech.edu/cs-6475-computational...
CS 7650 Natural Language,CS 7650,CS 7650 Natural Language Processing,https://omscs.gatech.edu/cs-7650-natural-langu...
...,...,...,...
ISYE 6664 Stochastic Optimization,ISYE 6664,,
CS 6460 Educational Technology: Conceptual Foundations,CS 6460,CS 6460 Educational Technology,https://omscs.gatech.edu/cs-6460-educational-t...
CS 6422 Database System Implementation,CS 6422,,
CSE 6730 Modeling and Simulation: Foundations and Implementation,CSE 6730,,


In [23]:
course_to_link['OMSCS Course'].value_counts()

OMSCS Course
CS 6457 Video Game Design                                        2
CS 6400 Database Systems Concepts and Design                     1
CS 6675 Advanced Internet Systems and Applications               1
CS 6300 Software Development Process                             1
CS 6440 Intro to Health Informatics                              1
CS 6476 Computer Vision                                          1
CS 7638 Robotics AI Techniques                                   1
CS 7210 Distributed Computing                                    1
CS 6603 AI, Ethics, and Society                                  1
CS 6250 Computer Networks                                        1
CS 6262 Network Security                                         1
CS 6260 Applied Cryptography                                     1
CS 6515 Intro to Graduate Algorithms                             1
CS 7646 Machine Learning for Trading                             1
CS 6263 Intro to Cyber Physical Systems Security 

In [24]:
course_to_link.query('`Course Code` == "CS 6457"')

Unnamed: 0_level_0,Course Code,OMSCS Course,OMSCS Course Link
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CS 6457 Video Game Design and Programming,CS 6457,CS 6457 Video Game Design,https://omscs.gatech.edu/cs-6457-video-game-de...
CS 6457 Video Game Design,CS 6457,CS 6457 Video Game Design,https://omscs.gatech.edu/cs-6457-video-game-de...


In [25]:
course_to_links = pd.merge(
    course_to_link,
    course_to_free_course_info,
    left_index=True,
    right_index=True,
    how='left',
)
course_to_links.T.to_json(os.path.join('data', 'course_to_links.json'), indent=4)
course_to_links

Unnamed: 0_level_0,Course Code,OMSCS Course,OMSCS Course Link,Free Course,Free Course Link
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CSE 8803 Special Topics: Algorithms for Medical Imaging and Inverse Problems,CSE 8803,,,,
CS 6400 Database Systems Concepts and Designs,CS 6400,CS 6400 Database Systems Concepts and Design,https://omscs.gatech.edu/cs-6400-database-syst...,Database Systems Concepts & Design,https://www.udacity.com/course/database-system...
CS 7470 Mobile and Ubiquitous Computing,CS 7470,CS 7470 Mobile & Ubiquitous Computing,https://omscs.gatech.edu/cs-7470-mobile-ubiqui...,,
CS 6475 Computational Photography,CS 6475,CS 6475 Computational Photography,https://omscs.gatech.edu/cs-6475-computational...,Computational Photography,https://www.udacity.com/course/computational-p...
CS 7650 Natural Language,CS 7650,CS 7650 Natural Language Processing,https://omscs.gatech.edu/cs-7650-natural-langu...,,
...,...,...,...,...,...
ISYE 6664 Stochastic Optimization,ISYE 6664,,,,
CS 6460 Educational Technology: Conceptual Foundations,CS 6460,CS 6460 Educational Technology,https://omscs.gatech.edu/cs-6460-educational-t...,Educational Technology,https://www.udacity.com/course/educational-tec...
CS 6422 Database System Implementation,CS 6422,,,,
CSE 6730 Modeling and Simulation: Foundations and Implementation,CSE 6730,,,,


In [26]:
set(course_to_free_course) - set(course_to_links.dropna().index)

{'CS 6505 Computability, Algorithms, and Complexity'}

In [27]:
len(course_to_links.dropna(subset=['OMSCS Course Link']))

40