In [1]:
import pandas as pd

from bs4 import BeautifulSoup
import requests

from keybert import KeyBERT
from tqdm import tqdm
import re

In [2]:
ucsc_url = 'https://registrar.ucsc.edu/catalog/archive/11-12/programs-courses/course-descriptions/cmpscourses.html'
req = requests.get(ucsc_url)
soup = BeautifulSoup(req.text)

In [3]:
course_blocks = soup.find_all("p")

class_id = []
class_title = []
desc = []
upper = []
def clean_description(text):
    text = text.replace('(2 credits)','')
    text = text.split("Students cannot")[0]
    text = text.split("Prerequisite(s):")[0] 
    text = text.split("(General Education Codes(s):")[0]
    text = text.replace("F,W,S","").replace("F,W","").replace("W,S","").replace("F,S","").replace("*","")
    return text.strip()

for i in range(len(course_blocks)):
    course_text = course_blocks[i].get_text(strip=True)
    if re.match(r"^\d+[A-Z]?\.", course_text):
        split_text = course_text.split(".", 1)
        course_code = split_text[0].strip() 
        course_name = split_text[1].strip() 

        course_number = int("".join(filter(str.isdigit, course_code)))

        if course_number < 200:
            class_id.append(course_code)
            class_title.append(clean_description(course_name).split('.')[0])
            desc_element = course_blocks[i + 1].get_text(strip=True) if i + 1 < len(course_blocks) else ""
            desc.append(clean_description(desc_element).split('.')[2])
            upper.append(course_number >= 100)



df = pd.DataFrame({
    "Course ID": class_id,
    "Course Title": class_title,
    "Upper": upper,
    "Skills": desc
})
df

Unnamed: 0,Course ID,Course Title,Upper,Skills
0,2,Computer Literacy,False,Introductory programming for students who have...
1,5C,Introduction to Programming in C/C++,False,Introductory programming for School of Engine...
2,5J,Introduction to Programming in Java,False,SIntroduction to programming for engineering ...
3,5P,Introduction to Programming in Python,False,"An overview of the theory, foundations, and p..."
4,10,Introduction to Computer Science,False,SContinuation of course 5J
...,...,...,...,...
72,195F,Senior Thesis Research,True,Students submit petition to sponsoring agency
73,198,Individual Study or Research,True,Intended for majors
74,198F,Individual Study or Research,True,For fourth-year students majoring in computer...
75,199,Tutorial,True,For fourth-year students majoring in computer...


In [4]:
def keyword_wrapper(doc):
    kw_model = KeyBERT() #instantiate model
    
    stop_words = ["cs", "prerequisite", "grade", "requirement", 
    "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors",
    "approach", "aspects", "awarded",
    "concepts", "course", "courses", "credit", "design", "fields",
    "foundation", "fundamental", "fundamentals", "introduction", "issues", "level",
    "lower", "major", "methods", "none", "overview", "perspectives",
    "practice", "practices", "principles", "process", "processes",
    "programs", "related", "required", "requirement", "role",
    "skills", "study", "techniques", "tools", "topics", "understanding",
    "upper", "various", "work", "department", "resources", "requisite", "requisites", "enforced", "lecture", "hours"] #dont consider these words

    return [i[0] for i in kw_model.extract_keywords(doc, stop_words=stop_words, top_n=10)] #top 10 keywords

df['Skills'] = df['Skills'].apply(keyword_wrapper) #apply functon
df

Unnamed: 0,Course ID,Course Title,Upper,Skills
0,2,Computer Literacy,False,"[programming, introductory, students, experien..."
1,5C,Introduction to Programming in C/C++,False,"[programming, introductory, engineering, schoo..."
2,5J,Introduction to Programming in Java,False,"[programming, engineering, sintroduction, stud..."
3,5P,Introduction to Programming in Python,False,"[computers, theory, future, science, foundatio..."
4,10,Introduction to Computer Science,False,"[5j, scontinuation, of]"
...,...,...,...,...
72,195F,Senior Thesis Research,True,"[petition, sponsoring, submit, students, agenc..."
73,198,Individual Study or Research,True,"[intended, for]"
74,198F,Individual Study or Research,True,"[students, majoring, fourth, science, year, in..."
75,199,Tutorial,True,"[students, majoring, fourth, science, year, in..."


In [5]:
df.to_csv('UCSC.csv', index=False)