In [5]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import csv
import datetime
import time
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options


## Scraping site using Selenium

In [36]:
# # Setting up Web Driver
# options = Options()
# options.add_argument("--headless=new")
# driver = webdriver.Chrome(options=options)
# driver.get('https://guide.wisc.edu/courses/')

# # Initializing empty dict for class dict 
# major_class_dict = {}

# course_list = driver.find_element(By.ID, "atozindex")
# subjects = course_list.find_elements(By.TAG_NAME, "a")

# # Gather all the URLs first
# links = [link.get_attribute('href') for link in subjects]
# names = [subject.text for subject in subjects]

# num_subjects = len(subjects)

# # Now visit each URL directly
# for i in range(num_subjects):
#     class_list = []
#     name = names[i].split(' (')[0]
#     print(name)
#     driver.get(links[i])
#     classes = WebDriverWait(driver, 20, ignored_exceptions=(NoSuchElementException, StaleElementReferenceException)).until(EC.presence_of_element_located((By.CLASS_NAME,  "sc_sccoursedescs")))
#     titles = classes.find_elements(By.CLASS_NAME, "courseblockcode")
#     credits = classes.find_elements(By.CLASS_NAME, "courseblockcredits")
#     courses = classes.find_elements(By.CLASS_NAME,  "courseblock")
#     num_courses = len(courses)
#     for j in range(num_courses):
#         class_dict = {}
#         class_dict['class'] = titles[j].text
#         class_dict['credits'] = int(credits[j].text[:1])

#         details = courses[j].find_elements(By.CLASS_NAME, "courseblockextra")
#         num_details = len(details)
#         class_details = {}
#         for k in range(num_details):
#             inner_html = details[k].get_attribute('innerHTML')
#             soup = BeautifulSoup(inner_html, 'html.parser')
#             text = soup.get_text().split(': ')
#             label = text[0]
#             info = text[1]
#             if re.search('\u200b|\xa0', info):
#                 info = info.replace('\u200b', '').replace('\xa0', ' ')
#             class_details[label] = info
#         class_dict['class info'] = class_details
#         class_list.append(class_dict)
#     major_class_dict[name] = class_list

# driver.quit()

In [35]:
# data = major_class_dict
# # Open a new CSV file for writing
# with open('courses.csv', 'w', newline='') as file:
#     writer = csv.writer(file)

#     # Write the header
#     writer.writerow(['Department', 'Class', 'Credits', 'Requisites',  'Course Designation', 'Repeatable for Credit', 'Last Taught'])

#     # Iterate over the departments and their courses
#     for department, courses in data.items():
#         for course in courses:
#             class_info = course['class info']
#             # Extract data, providing default values if any key is missing
#             requisites = class_info.get('Requisites', '')
#             repeatable = class_info.get('Repeatable for Credit', '')
#             last_taught = class_info.get('Last Taught', '')
#             course_designation = class_info.get('Course Designation', '')

#             # Write the data to the CSV file
#             writer.writerow([department, course['class'], course['credits'], requisites, course_designation, repeatable, last_taught])


## Scraping site using Requests

In [32]:
# Base URL
base_url = 'https://guide.wisc.edu'
start_url = 'https://guide.wisc.edu/courses/'

# Fetch main page content
response = requests.get(start_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Initializing empty dict for class dict 
major_class_dict = {}

# Extract subjects
subjects = soup.select('#atozindex a')

# Loop through subjects
for subject in subjects:
    class_list = []
    subject_url = base_url + subject['href']
    subject_name = subject.text.split(' (')[0]
    
    print(subject_name)
    
    # Get subject page content
    subject_response = requests.get(subject_url)
    subject_soup = BeautifulSoup(subject_response.content, 'html.parser')
    
    # Extract class details
    courses = subject_soup.select('.courseblock')
    
    for course in courses:
        class_dict = {}
        title = course.select_one('.courseblockcode').text
        if re.search('\u200b|\xa0', title):
            title = title.replace('\u200b', '').replace('\xa0', ' ')
        credit = course.select_one('.courseblockcredits').text
        
        class_dict['class'] = title
        class_dict['credits'] = int(credit[:1])
        
        details = course.select('.courseblockextra')
        class_details = {}
        
        for detail in details:
            text = detail.text.split(': ')
            label = text[0]
            info = text[1]
            
            if re.search('\u200b|\xa0', info):
                info = info.replace('\u200b', '').replace('\xa0', ' ')
            class_details[label] = info
        
        class_dict['class info'] = class_details
        class_list.append(class_dict)
    
    major_class_dict[subject_name] = class_list


Accounting and Information Systems
Actuarial Science
African Cultural Studies
Afro-American Studies
Agricultural and Applied Economics
Agroecology
Agronomy
Air Force Aerospace Studies
American Indian Studies
Anatomy
Anatomy & Physiology
Anesthesiology
Animal Sciences
Anthropology
Applied Biotechnology
Art Department
Art Education
Art History
Asian American Studies
Asian Languages and Cultures
Asian Languages and Cultures: Languages
Astronomy
Atmospheric and Oceanic Sciences
Biochemistry
Biological Systems Engineering
Biology
Biology Core Curriculum
Biomedical Engineering
Biomolecular Chemistry
Biostatistics and Medical Informatics
Botany
Cell and Regenerative Biology
Chemical and Biological Engineering
Chemistry
Chicana/o and Latina/o Studies
Civil and Environmental Engineering
Civil Society and Community Studies
Classics
Collaborative Nursing Program
Communication Arts
Communication Sciences and Disorders
Community and Environmental Sociology
Comparative Biosciences
Comparative Litera

## Writing dictionary to CSV

In [103]:
data = major_class_dict
# Open a new CSV file for writing
with open('allcourses.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(['Department', 'Class', 'Credits', 'Requisites',  'Course Designation', 'Repeatable for Credit', 'Last Taught'])

    # Iterate over the departments and their courses
    for department, courses in data.items():
        for course in courses:
            class_info = course['class info']
            # Extract data, providing default values if any key is missing
            requisites = class_info.get('Requisites', '')
            repeatable = class_info.get('Repeatable for Credit', '')
            last_taught = class_info.get('Last Taught', '')
            course_designation = class_info.get('Course Designation', '')

            # Write the data to the CSV file
            writer.writerow([department, course['class'], course['credits'], requisites, course_designation, repeatable, last_taught])


In [67]:
full_df = pd.read_csv('allcourses.csv')
requisites = full_df['Requisites']
for requisite in requisites:
    try:
        print(requisite)
        reqs = re.split(r'[,.;]', requisite)
        reqs = [req.strip() for req in reqs]
        print(reqs)
    except TypeError as e:
        continue

Not open to students with credit for ACCT I S 300
['Not open to students with credit for ACCT I S 300']
ACCT I S 100 or declared in undergraduate Business Exchange program
['ACCT I S 100 or declared in undergraduate Business Exchange program']
Satisfied Quantitative Reasoning (QR) A requirement. Not open to students with credit for ACCT I S 100.
['Satisfied Quantitative Reasoning (QR) A requirement', 'Not open to students with credit for ACCT I S 100', '']
ACCT I S 100 or declared in undergraduate Business Exchange program
['ACCT I S 100 or declared in undergraduate Business Exchange program']
ACCT I S 301 or declared in undergraduate Business Exchange program
['ACCT I S 301 or declared in undergraduate Business Exchange program']
ACCT I S 211 or declared in undergraduate Business Exchange program
['ACCT I S 211 or declared in undergraduate Business Exchange program']
ACCT I S 100, 300, or LAW 811. Not open to students declared in Business
['ACCT I S 100', '300', 'or LAW 811', 'Not ope

In [42]:
full_df = pd.read_csv('allcourses.csv')
df = full_df[full_df['Last Taught'].str.slice(-4) >= str(datetime.datetime.now().year-3)]
df = df.reset_index(drop=True)
df

Unnamed: 0,Department,Class,Credits,Requisites,Course Designation,Repeatable for Credit,Last Taught
0,Accounting and Information Systems,ACCT I S 100,3,Not open to students with credit for ACCT I S 300,,No,Fall 2023
1,Accounting and Information Systems,ACCT I S 211,3,ACCT I S 100 or declared in undergraduate Busi...,,No,Fall 2023
2,Accounting and Information Systems,ACCT I S 300,3,Satisfied Quantitative Reasoning (QR) A requir...,Gen Ed - Quantitative Reasoning Part B,No,Fall 2023
3,Accounting and Information Systems,ACCT I S 301,3,ACCT I S 100 or declared in undergraduate Busi...,,No,Fall 2023
4,Accounting and Information Systems,ACCT I S 302,3,ACCT I S 301 or declared in undergraduate Busi...,,No,Fall 2023
...,...,...,...,...,...,...,...
8768,Zoology,ZOOLOGY 956,1,Graduate/professional standing,Grad 50% - Counts toward 50% graduate coursewo...,"Yes, unlimited number of completions",Fall 2022
8769,Zoology,ZOOLOGY 957,1,Graduate/professional standing,Grad 50% - Counts toward 50% graduate coursewo...,"Yes, unlimited number of completions",Fall 2023
8770,Zoology,ZOOLOGY 960,1,Graduate/professional standing,Grad 50% - Counts toward 50% graduate coursewo...,"Yes, unlimited number of completions",Spring 2023
8771,Zoology,ZOOLOGY 962,1,Graduate/professional standing,Grad 50% - Counts toward 50% graduate coursewo...,"Yes, unlimited number of completions",Spring 2023


In [88]:
start_url = 'https://guide.wisc.edu/explore-majors/'
base_url = 'https://guide.wisc.edu'

response = requests.get(start_url)

soup = BeautifulSoup(response.content, 'html.parser')
items = soup.select_one('#filter-items')
programs = items.select('li a')

for program in programs:
    title = program.select_one('.title').text
#     print(title)
    # testing with only BS degrees first
    if not re.search(r'Genetics and Genomics, B.S.', title):
#     if not re.search(r'\s*B\.S\.\s*', title):
        continue
    program_url = base_url + program['href']
    program_response = requests.get(program_url)
    extract_content = BeautifulSoup(program_response.content, 'html.parser')
    requirement_link = extract_content.select_one('#requirementstexttab a')['href']
    requirements_url = program_url + requirement_link
    requirement_response = requests.get(requirements_url)
    requirement_content = BeautifulSoup(requirement_response.content, 'html.parser')
    print(title)
    course_list = requirement_content.select('.sc_courselist')
    for course in course_list:
        rows = course.select('tbody tr')
        for row in rows:
            credit = re.match(r'(.*)\((\d+(?:-\d+)?)\)', row.text)
            if credit:
                print(credit.group(1))


Genetics and Genomics, B.S.


In [76]:
base_url = 'https://guide.wisc.edu'
start_url = 'https://guide.wisc.edu/undergraduate'

response = requests.get(start_url)

soup = BeautifulSoup(response.content, 'html.parser')
items = soup.select_one('#schoolsandcollegestextcontainer')
schools = items.select('li a')

schools_dict = {}
for school in schools:
    programs_list = []
    name = school.text
    school_url = base_url + school['href']
    school_response = requests.get(school_url)
    extract_content = BeautifulSoup(school_response.content, 'html.parser')
    requirement_link = extract_content.select_one('#degreesmajorscertificatestexttab a')['href']
    requirements_url = school_url + requirement_link
    requirement_response = requests.get(requirements_url)
    requirement_content = BeautifulSoup(requirement_response.content, 'html.parser')
    school_links = requirement_content.select('.visual-sitemap a')
    for link in school_links:
        if not re.search(r'\s*B\.S\.\s*', link.text):
            continue
        programs_list.append((link.text).split(',')[0])
    schools_dict[name] = programs_list
# schools_dict
    

In [77]:
df2 = pd.DataFrame(list(schools_dict.items()), columns=['School', 'Program'])
df2 = df2.explode('Program', ignore_index=True)
df2

Unnamed: 0,School,Program
0,College of Agricultural and Life Sciences,Agricultural and Applied Economics
1,College of Agricultural and Life Sciences,Agricultural Business Management
2,College of Agricultural and Life Sciences,Agronomy
3,College of Agricultural and Life Sciences,Animal and Veterinary Biosciences
4,College of Agricultural and Life Sciences,Animal Sciences
...,...,...
117,School of Human Ecology,Personal Finance
118,School of Human Ecology,Textiles and Fashion Design
119,School of Nursing,
120,School of Pharmacy,Pharmaceutical Sciences


In [2]:
from graphviz import Graph