In [34]:
import os
import re

from bs4 import BeautifulSoup
import pandas as pd
import requests

from util import load_json_in, to_json_in, map_dict_value, flatten

# References


- [re — Regular expression operations](https://docs.python.org/3/library/re.html)
- [Beautiful Soup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

# Specializations

Data Format:
```{json}
{
    "Computational Perception and Robotics": {
        "Core Courses": [
            {
                "Pick": 1,
                "Courses: [...]
            },
            {
                "Pick": 1,
                "Courses: [...]
            }
        ],
        "Electives": [
            {
                "Pick", 3,
                "Courses": [...]
            }
        ]
    }
}

```

In [35]:
response = requests.get('https://www.cc.gatech.edu/ms-computer-science-specializations')
soup = BeautifulSoup(response.text)
div = soup.find('div', class_='gt-main-content').find('div', class_='field')

In [36]:
core_electives = re.compile('core courses|electives', re.I)
pick_take = re.compile('pick|take|from', re.I)
at_least = re.compile('at least (\w+)', re.I)
num = re.compile('one|two|three', re.I)
word_to_num = {
    'one': 1,
    'two': 2,
    'three': 3,
}
PICK = 'Pick'
COURSES = 'Courses'
SUBAREA = 'Sub-area'
SUBAREA_CONDITION = 'Sub-area Condition'

tracks = {}
is_electives = False

for child in div.children:
    text = child.text.strip()
    # A new track
    if child.name == 'h4':
        track = {}
        tracks[text] = track
        is_electives = False
        subarea = ''
    elif child.name == 'p':
        # A new course type (core or elective)
        if match := re.match(core_electives, text):
            course_type = match[0].title()
            is_electives = (course_type == 'Electives')
            course_subset = []
            track[course_type] = course_subset
            # If the next or next next text do not contain "pick" or "take"
            next = child.next_sibling.next_sibling
            next_next = next.next_sibling.next_sibling
            if not re.search(pick_take, next.text) and not re.search(pick_take, next_next.text):
                courses = {PICK: -1, COURSES: []}
                course_subset.append(courses)
        # A new course subset
        elif match := re.search(pick_take, text):
            # Extract the number of courses need to pick from the subset
            number_in_word = re.search(num, text)[0].lower()
            pick = word_to_num[number_in_word]
            courses = {PICK: pick, COURSES: []}
            course_subset.append(courses)
            if match := re.search(at_least, text):
                at_least_num = word_to_num[match[1]]
                courses[SUBAREA_CONDITION] = f'at least {at_least_num} from each sub-area'
        # Sub-area
        elif is_electives and child.next_sibling.next_sibling.name == 'ul':
            subarea = child.text.replace('Sub-area: ', '')
        # Handle an edge case
        elif text.lower() == 'or':
            courses[PICK] = 1
        elif text.lower() == 'and':
            courses[PICK] += 1
    # Course List
    elif child.name == 'ul':
        lis = child.find_all('li')
        if subarea and not courses.get(SUBAREA):
            courses[SUBAREA] = []
        for li in child.find_all('li'):
            course = re.sub(r'\s+', ' ', li.text.strip())
            # Add XOR courses
            if ' OR ' in course:
                xor_courses = list(course.split(' OR '))
                courses['XOR'] = {}
                for c in xor_courses:
                    courses[COURSES].append(c)
                    courses['XOR'][c] = sorted(list(set(xor_courses) - set([c])))
            else:
                courses[COURSES].append(course)
                # Add sub-area
                if subarea:
                    courses[SUBAREA].append(subarea)
        # If it does not say take a certain number of courses, all need to be taken
        if courses[PICK] == -1:
            courses[PICK] = len(lis)

# del tracks['Human Centered Computing']
# del tracks['Interactive Intelligence']
to_json_in('data', 'tracks.json', tracks)

# Free Courses

In [37]:
response = requests.get('https://www.udacity.com/georgia-tech')
soup = BeautifulSoup(response.text)
lis = soup.find('ul', class_='courses').find_all('li')

In [38]:

free_course_to_link = {}
for li in lis:
    a = li.find_all('a')[1]
    href = a['href']
    free_course_to_link[a.text.strip()] = f'https://www.udacity.com/{href}'

In [39]:
len(free_course_to_link)

31

# OMSCS Courses

In [40]:
response = requests.get('https://omscs.gatech.edu/current-courses')
soup = BeautifulSoup(response.text)
div = soup.find('div', class_='body')
lis = div.find_all('li')
as_ = [li.a for li in lis]

In [41]:
omscs_course_to_link = {}
for a in as_:
    course = re.sub(r'\s+', ' ', a.text.strip())
    course = re.sub(':', '', course)
    omscs_course_to_link[course] = a['href']

In [42]:
len(omscs_course_to_link)

61

# Data Analysis

In [43]:
track_to_type_courses = map_dict_value(tracks, map_dict_value, lambda v: flatten([e[COURSES] for e in v]))
track_to_courses = map_dict_value(tracks, lambda v: flatten([e[COURSES] for e in flatten(v.values())]))

to_json_in('data', 'track_to_courses.json', track_to_courses)

Does each track have two course types (core and elective)?

In [44]:
pd.DataFrame(map_dict_value(tracks, len), index=['Course Types']).T

Unnamed: 0,Course Types
Computational Perception and Robotics,2
Computer Graphics,2
Computing Systems,2
High Performance Computing,2
Human Centered Computing,2
Human-Computer Interaction,2
Interactive Intelligence,2
Machine Learning,2
Modeling and Simulations,2
Scientific Computing,2


How many courses can you choose from for each track?

In [45]:
track_type_course_count = pd.merge(
    pd.DataFrame(map_dict_value(track_to_type_courses, map_dict_value, len)).T,
    pd.DataFrame(map_dict_value(track_to_courses, len), index=['Total']).T,
    left_index=True,
    right_index=True,
).sort_index()
track_type_course_count.T.to_json(os.path.join('data', 'track_type_course_count.json'), indent=4)
track_type_course_count

Unnamed: 0,Core Courses,Electives,Total
Computational Perception and Robotics,9,13,22
Computer Graphics,5,8,13
Computing Systems,11,25,36
High Performance Computing,2,7,9
Human Centered Computing,3,30,33
Human-Computer Interaction,3,19,22
Interactive Intelligence,8,19,27
Machine Learning,10,22,32
Modeling and Simulations,4,7,11
Scientific Computing,3,8,11


What the number of overlapping courses between tracks?

In [46]:
track_overlap_courses = {}
for track_a, courses_a in track_to_courses.items():
    track_overlap_courses[track_a] = {}
    for track_b, courses_b in track_to_courses.items():
        overlap_courses = list(set(courses_a) & set(courses_b)) if track_a != track_b else []
        track_overlap_courses[track_a][track_b] = sorted(overlap_courses)

to_json_in('data', 'track_overlap_courses.json', track_overlap_courses)
pd.DataFrame(map_dict_value(track_overlap_courses, map_dict_value, len))

Unnamed: 0,Computational Perception and Robotics,Computer Graphics,Computing Systems,High Performance Computing,Human Centered Computing,Human-Computer Interaction,Interactive Intelligence,Machine Learning,Modeling and Simulations,Scientific Computing,Social Computing,Visual Analytics
Computational Perception and Robotics,0,4,3,1,3,0,8,11,0,1,3,1
Computer Graphics,4,0,2,0,2,2,3,3,0,0,3,1
Computing Systems,3,2,0,3,0,0,4,4,2,1,8,0
High Performance Computing,1,0,3,0,0,0,1,1,2,4,0,0
Human Centered Computing,3,2,0,0,0,12,11,2,0,0,9,6
Human-Computer Interaction,0,2,0,0,12,0,5,0,0,0,8,7
Interactive Intelligence,8,3,4,1,11,5,0,8,0,1,6,4
Machine Learning,11,3,4,1,2,0,8,0,1,1,4,2
Modeling and Simulations,0,0,2,2,0,0,0,1,0,2,1,0
Scientific Computing,1,0,1,4,0,0,1,1,2,0,0,0


Find courses that are available on Udacity and offered at OMSCS

In [47]:
# Manual mapping
course_to_free_course = {
    "CS 6200 Graduate Introduction to Operating Systems": "Introduction to Operating Systems",
    "CS 6300 Software Development Process": "Software Development Process",
    "CS 7642 Reinforcement Learning and Decision Making": "Reinforcement Learning",
    "CS 6250 Computer Networks": "Computer Networking",
    "CS 6515 Introduction to Graduate Algorithms": "Introduction to Graduate Algorithms",
    "CS 6505 Computability, Algorithms, and Complexity": "Computability, Complexity & Algorithms",
    "CS 6400 Database Systems Concepts and Designs": "Database Systems Concepts & Design",
    "CS 6460 Educational Technology: Conceptual Foundations": "Educational Technology",
    "CS 6750 Human-Computer Interaction": "Human-Computer Interaction",
    "CS 6476 Computer Vision": "Introduction to Computer Vision",
    "CSE 6220 High Performance Computing": "High Performance Computing",
    "CS 6601 Artificial Intelligence": "Artificial Intelligence",
    "CS 6210 Advanced Operating Systems": "Advanced Operating Systems",
    "CS 7638 Artificial Intelligence Techniques for Robotics": "Artificial Intelligence for Robotics",
    "CS 6310 Software Architecture and Design": "Software Architecture & Design",
    "CS 7641 Machine Learning": "Machine Learning",
    "CS 6475 Computational Photography": "Computational Photography",
    "CS 6262 Network Security": "Network Security",
    "CSE 6250 Big Data for Health": "Big Data Analytics in Healthcare",
    "CS 6290 High-Performance Computer Architecture": "High Performance Computer Architecture",
    "CS 6340 Software Analysis and Testing": "Software Analysis & Testing",
    "CS 7646 Machine Learning for Trading": "Machine Learning for Trading",
    "CS 6035 Introduction to Information Security": "Introduction to Information Security",
    "CSE 6242 Data and Visual Analytics": "Data Analysis and Visualization",
    "CS 7637 Knowledge-Based AI": "Knowledge-Based AI: Cognitive Systems",
}

In [48]:
# Udacity courses that are not in MSCS but some are in OMSCS
[
    'Linear Algebra Refresher Course',
    'GT - Refresher - Advanced OS',
    'Embedded Systems', # CS 6291 Embedded Software Optimization
    'Compilers: Theory and Practice', # CS 8803 O08: Compilers - Theory and Practice
    'Cyber-Physical Systems Security',
    'Machine Learning: Unsupervised Learning'
]

['Linear Algebra Refresher Course',
 'GT - Refresher - Advanced OS',
 'Embedded Systems',
 'Compilers: Theory and Practice',
 'Cyber-Physical Systems Security',
 'Machine Learning: Unsupervised Learning']

In [49]:
# Map courses to Udacity courses and links
course_to_free_course_info = pd.merge(
    pd.Series(course_to_free_course, name='Free Course'),
    pd.Series(free_course_to_link, name='Free Course Link'),
    left_on='Free Course',
    right_index=True,
)
# Verify manual mapping is correct
assert len(course_to_free_course_info) == len(course_to_free_course)

course_to_free_course_info.head()

Unnamed: 0,Free Course,Free Course Link
CS 6200 Graduate Introduction to Operating Systems,Introduction to Operating Systems,https://www.udacity.com/course/introduction-to...
CS 6300 Software Development Process,Software Development Process,https://www.udacity.com/course/software-develo...
CS 7642 Reinforcement Learning and Decision Making,Reinforcement Learning,https://www.udacity.com/course/reinforcement-l...
CS 6250 Computer Networks,Computer Networking,https://www.udacity.com/course/computer-networ...
CS 6515 Introduction to Graduate Algorithms,Introduction to Graduate Algorithms,https://www.udacity.com/course/introduction-to...


In [50]:
# All courses (with some duplciates)
courses = flatten(track_to_courses.values())

all_courses = list(set(courses))
all_courses = [course for course in all_courses if not re.search('Any|Substitutions', course)]

In [51]:
all_course_df = pd.DataFrame(all_courses, columns=['Course'])
omscs_course_to_link_df = pd.DataFrame(omscs_course_to_link, index=['OMSCS Course Link']).T
omscs_course_to_link_df = omscs_course_to_link_df.reset_index(names='Course')

In [52]:
# Get the course codes for all the courses
get_course_code = lambda e: ' '.join(e.split(' ')[:2])
all_course_df['Course Code'] = all_course_df['Course'].apply(get_course_code)
omscs_course_to_link_df['Course Code'] = omscs_course_to_link_df['Course'].apply(get_course_code)

In [53]:
# Join the courses with omscs courses by course codes
course_to_omscs_course_info = pd.merge(
    all_course_df,
    omscs_course_to_link_df,
    left_on='Course Code',
    right_on='Course Code',
    how='left',
    suffixes=('', ' OMSCS')
)
# Remove 8803 courses
course_to_omscs_course_info = course_to_omscs_course_info.query('`Course Code` != "CS 8803"').query('`Course Code` != "CSE 8803"')
course_to_omscs_course_info = course_to_omscs_course_info.set_index('Course').rename(columns={'Course OMSCS': 'OMSCS Course'})
course_to_omscs_course_info.head()

Unnamed: 0_level_0,Course Code,OMSCS Course,OMSCS Course Link
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CS 7643 Deep Learning,CS 7643,CS 7643 Deep Learning,https://omscs.gatech.edu/cs-7643-deep-learning
CS 6474 Social Computing,CS 6474,,
CS/CSE 8803 Special Topics: Parallel Numerical Algorithms,CS/CSE 8803,,
CS 7467 Computer-Supported Collaborative Learning,CS 7467,,
CS 6730 Data Visualization Principles,CS 6730,,


In [54]:
# Check for duplicate course codes
course_code_counts = course_to_omscs_course_info['Course Code'].value_counts()
dup_course_codes = course_code_counts[course_code_counts > 1]
dup_course_codes

Course Code
CS 6457     2
CS 7633     2
CS 6220     2
CS 6491     2
CS 6730     2
CSE 6740    2
CS 6456     2
CS 7631     2
Name: count, dtype: int64

Duplicate courses

In [55]:
course_to_omscs_course_info.query(f'`Course Code` in {dup_course_codes.index.to_list()}').sort_values('Course Code')

Unnamed: 0_level_0,Course Code,OMSCS Course,OMSCS Course Link
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CS 6220 Big Data Systems & Analysis,CS 6220,,
CS 6220 Big Data Systems and Analytics,CS 6220,,
CS 6456 User Interface Software,CS 6456,,
CS 6456 Principles of User Interface Software,CS 6456,,
CS 6457 Video Game Design and Programming,CS 6457,CS 6457 Video Game Design,https://omscs.gatech.edu/cs-6457-video-game-de...
CS 6457 Video Game Design,CS 6457,CS 6457 Video Game Design,https://omscs.gatech.edu/cs-6457-video-game-de...
CS 6491 Foundations of Computer Graphics,CS 6491,,
CS 6491 Computer Graphics,CS 6491,,
CS 6730 Data Visualization Principles,CS 6730,,
CS 6730 Data Visualization: Principles & Applications,CS 6730,,


In [56]:
# Join courses and OMSCS info with Udacity course info
course_to_links = pd.merge(
    course_to_omscs_course_info,
    course_to_free_course_info,
    left_index=True,
    right_index=True,
    how='left',
).sort_index()
course_to_links.T.to_json(os.path.join('data', 'course_to_links.json'), indent=4)
course_to_links.head()

Unnamed: 0_level_0,Course Code,OMSCS Course,OMSCS Course Link,Free Course,Free Course Link
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CS 6010 Principles of Design,CS 6010,,,,
CS 6035 Introduction to Information Security,CS 6035,CS 6035 Introduction to Information Security,https://omscs.gatech.edu/cs-6035-introduction-...,Introduction to Information Security,https://www.udacity.com/course/intro-to-inform...
CS 6200 Graduate Introduction to Operating Systems,CS 6200,CS 6200 Introduction to Operating Systems,https://omscs.gatech.edu/cs-6200-introduction-...,Introduction to Operating Systems,https://www.udacity.com/course/introduction-to...
CS 6210 Advanced Operating Systems,CS 6210,CS 6210 Advanced Operating Systems,https://omscs.gatech.edu/cs-6210-advanced-oper...,Advanced Operating Systems,https://www.udacity.com/course/advanced-operat...
CS 6220 Big Data Systems & Analysis,CS 6220,,,,


Number of offered in MSCS

In [57]:
all_courses_dedup = course_to_links.drop_duplicates(subset='Course Code').index

to_json_in('data', 'all_courses.json', sorted(all_courses_dedup))
len(all_courses_dedup)

121

Which Udacity courses are not offered at MSCS?

In [58]:
set(course_to_free_course) - set(course_to_links.dropna().index)

{'CS 6505 Computability, Algorithms, and Complexity'}

Number of OMSCS courses offered in MSCS

In [59]:
len(course_to_links.dropna(subset=['OMSCS Course Link']))

42