## courses + new changes
**july 14: don't use this version, use the reverted one in the same folder**


- run entire notebook, the very last function is called `do_everything()`. to get, say, ECON course changes, just run `do_everything('ECON')`

the following are hard-coded (will need to be updated in future years): 
- the base URL that everything begins from
- typing 20182019 and 20192020 (eg next year change to 20192020 and 20202021)

june 26, 2019


In [12]:
# autumn + winter + spring courses for 2018-2019
BASE_PAGE_URL = 'https://explorecourses.stanford.edu/search?q=LINGUIST&view=catalog&filter-term-Winter=on&filter-departmentcode-LINGUIST=on&filter-catalognumber-LINGUIST=on&academicYear=20182019&filter-term-Autumn=on&filter-term-Spring=on&page=0&filter-coursestatus-Active=on&collapse='

In [2]:
from bs4 import BeautifulSoup
from subprocess import check_output

In [3]:
def get_n_pages(pgs):
    lst = []
    for x in pgs:
        try:
            lst.append(int(x))
        except:
            pass
    return max(lst)

def get_search_results_on_page(url, year, ith):
    print('Parsing page', ith + 1)
    new_url = url.replace('page=0', 'page=' + str(ith))
    new_src = check_output([
        'wget',
        '-qO-',
        new_url
    ])
    parse = BeautifulSoup(new_src)
    return parse.body.find(
        'div',
        attrs={'id': 'searchResults'}
    )
    
def get_results_list(url, year='20182019'):
    src = check_output([
        'wget',
        '-qO-',
        url
    ])
    parse = BeautifulSoup(src)
    n_pages = get_n_pages(parse.body.find(
        'div',
        attrs={'id': 'pagination'}
    ).text.split())
    print('Need to parse', n_pages, 'pages')
    to_return = [get_search_results_on_page(url, year, ith) for ith in range(n_pages)]
    print('Done parsing!')
    return to_return

In [4]:
results_list = get_results_list(BASE_PAGE_URL)

Need to parse 7 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Done parsing!


In [5]:
legal_fields = {
    'Terms',
    'Units',
    'UG Reqs',
    'Grading'
}

def extract_data(fields):
    relevant_name_to_val = {}
    for item in fields:
        idx = item.find(':')
        field_name = item[:idx]
        if field_name in legal_fields:
            field_val = item[idx + 1:].strip()
            relevant_name_to_val[field_name] = field_val
    return relevant_name_to_val

def fetch_info(course, mp):
    code = course.find(
        'span',
        attrs={'class': 'courseNumber'}
    ).text.strip(':')
    mp[code] = {}
    name = course.find(
        'span',
        attrs={'class': 'courseTitle'}
    ).text
    mp[code]['Title'] = name
    
    attribs = course.find(
        'div',
        attrs={'class': 'courseAttributes'}
    ).text
    fields = [x.strip() for x in ' '.join(attribs.split()).split('|')]
    
    attribs_all = course.findAll(
        'div',
        attrs={'class': 'courseAttributes'}
    )
    
    instructors_set = None
    if len(attribs_all) > 1:
        fields = attribs_all[1].text.strip().split('\n')
        instructors_set = set([f.strip() for f in fields if 'PI' in f])
    
    extracted = extract_data(fields)    
    for field in legal_fields:
        if field in extracted:
            mp[code][field] = extracted[field]
        else:
            mp[code][field] = None
    
    mp[code]['Instructors'] = instructors_set
    
    desc = course.find(
        'div',
        attrs={'class': 'courseDescription'}
    ).text
    mp[code]['Desc.'] = desc

def parse_results_list(results_list):
    course_to_field_to_info_map_map = {}
    print('Parsing results html...')
    for results in results_list:
        all_courses = results.findAll(
            'div',
            attrs={'class': 'courseInfo'}
        )
        for course in all_courses:
            fetch_info(course, course_to_field_to_info_map_map)
    print('Done parsing!')
    return course_to_field_to_info_map_map

In [6]:
ling_20182019 = parse_results_list(results_list)

Parsing results html...
Done parsing!


In [7]:
url_20192020 = BASE_PAGE_URL.replace('20182019', '20192020')
ling_20192020 = parse_results_list(get_results_list(url_20192020))

Need to parse 7 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Done parsing!
Parsing results html...
Done parsing!


In [8]:
import re
from termcolor import cprint

fields = [
    'Title',
    'Terms',
    'Units',
    'UG Reqs',
    'Grading'
]

# stackoverflow Mark Byers
def natural_sort(l): 
    convert = lambda text: int(text) if text.isdigit() else text.lower() 
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return sorted(l, key = alphanum_key)

def find_differences(A, B, a, b):
    print('Courses in ' + a + ' but not in ' + b + ':')
    for course in A:
        if course not in B:
            cprint(' - ' + course + '\n   - ' + A[course]['Title'], attrs=['bold'])
    print('\n')
    print('Courses in ' + b + ' but not in ' + a + ':')
    for course in B:
        if course not in A:
#             print(course, B[course]['Title'], B[course]['Terms'])
            cprint(' - ' + course + '\n   - ' + B[course]['Title'] + ' (' + str(B[course]['Terms']) + ')', attrs=['bold'])
    print('\nCourses Retained:')
    overlap = A.keys() & B.keys()
    for course in natural_sort(overlap):
        cprint(' - ' + course, attrs=['bold'])
        for f in fields:
            print('    ' + f + ': ', end=' ')
            if f not in A[course] or f not in B[course]:
                if f not in A[course]:
                    cprint('None', 'red', attrs=['bold'])
                else:
                    cprint(A[course], 'red', attrs=['bold'])
                if f not in B[course]:
                    cprint('None', 'green', attrs=['bold'])
                else:
                    cprint(B[course], 'green', attrs=['bold'])
            elif A[course][f] != B[course][f]:
                cprint(A[course][f], 'red', end=' ', attrs=['bold'])
                cprint(B[course][f], 'green', attrs=['bold'])
            else:
                print(A[course][f])

In [9]:
find_differences(ling_20182019, ling_20192020, 'Ling 18-19', 'Ling 19-20')

Courses in Ling 18-19 but not in Ling 19-20:
[1m - LINGUIST 55N
   - Language in the City[0m
[1m - LINGUIST 65
   - African American Vernacular English (AFRICAAM 21, CSRE 21, LINGUIST 265)[0m
[1m - LINGUIST 83Q
   - Translation[0m
[1m - LINGUIST 121A
   - The Syntax of English[0m
[1m - LINGUIST 152
   - Sociolinguistics and Pidgin Creole Studies (LINGUIST 252)[0m
[1m - LINGUIST 157
   - Sociophonetics (LINGUIST 257)[0m
[1m - LINGUIST 160
   - Introduction to Language Change[0m
[1m - LINGUIST 200
   - Foundations of Linguistic Theory[0m
[1m - LINGUIST 207A
   - Advanced Phonetics[0m
[1m - LINGUIST 211
   - Metrics[0m
[1m - LINGUIST 225
   - Seminar in Syntax: Distributed Morphology[0m
[1m - LINGUIST 236
   - Seminar in Semantics: Causation[0m
[1m - LINGUIST 247
   - Seminar in Psycholinguistics: Advanced Topics (PSYCH 227)[0m
[1m - LINGUIST 250
   - Sociolinguistic Theory and Analysis[0m
[1m - LINGUIST 252
   - Sociolinguistics and Pidgin Creole Studies (LIN

In [10]:
def do_everything(dept_code):
    url = BASE_PAGE_URL.replace('LINGUIST', dept_code)
    url_2 = url.replace('20182019', '20192020')
    a = parse_results_list(get_results_list(url))
    b = parse_results_list(get_results_list(url_2))
    find_differences(a, b, dept_code + ' 18-19', dept_code + ' 19-20')

In [11]:
do_everything('CS')

Need to parse 21 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Parsing page 8
Parsing page 9
Parsing page 10
Parsing page 11
Parsing page 12
Parsing page 13
Parsing page 14
Parsing page 15
Parsing page 16
Parsing page 17
Parsing page 18
Parsing page 19
Parsing page 20
Parsing page 21
Done parsing!
Parsing results html...
Done parsing!
Need to parse 20 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Parsing page 8
Parsing page 9
Parsing page 10
Parsing page 11
Parsing page 12
Parsing page 13
Parsing page 14
Parsing page 15
Parsing page 16
Parsing page 17
Parsing page 18
Parsing page 19
Parsing page 20
Done parsing!
Parsing results html...
Done parsing!
Courses in CS 18-19 but not in CS 19-20:
[1m - CS 11SI
   - How to Make VR: Introduction to Virtual Reality Design and Development[0m
[1m - CS 17SI
   - Frontiers in Reproductive Technology[0m
[1m - CS 18SI


In [13]:
do_everything('MATH')

Need to parse 9 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Parsing page 8
Parsing page 9
Done parsing!
Parsing results html...
Done parsing!
Need to parse 9 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Parsing page 8
Parsing page 9
Done parsing!
Parsing results html...
Done parsing!
Courses in MATH 18-19 but not in MATH 19-20:
[1m - MATH 114
   - Introduction to Scientific Computing (CME 108)[0m
[1m - MATH 137
   - Mathematical Methods of Classical Mechanics[0m
[1m - MATH 145
   - Algebraic Geometry[0m
[1m - MATH 148
   - Algebraic Topology[0m
[1m - MATH 154
   - Algebraic Number Theory[0m
[1m - MATH 161
   - Set Theory[0m
[1m - MATH 234
   - Large Deviations Theory (STATS 374)[0m
[1m - MATH 235A
   - Topics in combinatorics[0m
[1m - MATH 237A
   - Topics in Financial Math: Market microstructure and trading algorithms[0m
[1m - MATH 249

In [14]:
do_everything('PWR')

Need to parse 12 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Parsing page 8
Parsing page 9
Parsing page 10
Parsing page 11
Parsing page 12
Done parsing!
Parsing results html...
Done parsing!
Need to parse 14 pages
Parsing page 1
Parsing page 2
Parsing page 3
Parsing page 4
Parsing page 5
Parsing page 6
Parsing page 7
Parsing page 8
Parsing page 9
Parsing page 10
Parsing page 11
Parsing page 12
Parsing page 13
Parsing page 14
Done parsing!
Parsing results html...
Done parsing!
Courses in PWR 18-19 but not in PWR 19-20:
[1m - PWR 1IF
   - Writing & Rhetoric 1: The Rhetoric of Language and Social Identity in America[0m
[1m - PWR 1RL
   - Writing & Rhetoric 1: The Rhetoric of Happiness[0m
[1m - PWR 2BRB
   - Writing & Rhetoric 2: Eurekas and Epiphanies: The Rhetoric of Inspiration[0m
[1m - PWR 91EE
   - Intermediate Writing: Saving Lives with Picture Books[0m
[1m - PWR 91KD
   - Intermediate Writing: Scripting Ente

    UG Reqs:  None
    Grading:  None
[1m - PWR 2GME[0m
    Title:  Writing & Rhetoric 2: Our America: Conviction, Passion, Paranoia
    Terms:  None
    Units:  None
    UG Reqs:  None
    Grading:  None
[1m - PWR 2IY[0m
    Title:  Writing & Rhetoric 2: Many Faces of Sherlock: Race, Gender, Power, and the Rhetoric of the Detective
    Terms:  None
    Units:  None
    UG Reqs:  None
    Grading:  None
[1m - PWR 2JDC[0m
    Title:  Writing & Rhetoric 2: The Rhetoric of Collaboration: From Fandoms to Entrepreneurs
    Terms:  None
    Units:  None
    UG Reqs:  None
    Grading:  None
[1m - PWR 2JJ[0m
    Title:  Writing & Rhetoric 2: The Rhetoric of Language, Identity and Power
    Terms:  None
    Units:  None
    UG Reqs:  None
    Grading:  None
[1m - PWR 2JPA[0m
    Title:  Writing & Rhetoric 2: How We Got Schooled: The Rhetoric of Literacy and Education
    Terms:  None
    Units:  None
    UG Reqs:  None
    Grading:  None
[1m - PWR 2JS[0m
    Title:  Writing & Rheto