In [1]:
import pandas as pd

from bs4 import BeautifulSoup
import requests

from keybert import KeyBERT
from tqdm import tqdm

In [2]:
# Download the UCI Computer Science catalogue page and collect every
# <div class="courseblock">; the parsing cell treats each one as a course entry.
uci_url = 'https://catalogue.uci.edu/allcourses/compsci/'
req = requests.get(uci_url, timeout=30)  # bound the wait instead of hanging indefinitely
req.raise_for_status()  # fail loudly on 4xx/5xx rather than parsing an error page
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so the parse tree can differ between machines.
soup = BeautifulSoup(req.text, "html.parser")
divs = soup.find_all("div", class_="courseblock")

In [3]:
# Column buffers for the course table. They are appended to in lockstep, so
# index i of every list describes the same course.
class_id = []
class_title = []
desc = []
upper = []

for div in divs:
    # Extract Course ID and Title
    title_element = div.find("p", class_="courseblocktitle")
    if not title_element:
        continue

    full_title = title_element.get_text(strip=True)
    # partition() never raises, whereas unpacking split('.', 1) crashes with a
    # ValueError on a heading that contains no period.
    course_code, dot, course_name = full_title.partition('.')
    digits = "".join(filter(str.isdigit, course_code))
    if not dot or not digits:
        # Heading is not of the form "<code with a number>.<title>..."; skip it
        # instead of aborting the whole scrape.
        continue

    course_number = int(digits)
    if course_number >= 200:
        # Only courses numbered below 200 are kept.
        continue

    class_id.append(course_code.strip())
    # The title is the text up to the next period.
    # NOTE: a title that itself contains a period is cut at that period.
    class_title.append(course_name.split('.')[0].strip())

    # Join all description paragraphs into one string; a course without a
    # description block gets an empty string so the columns stay aligned.
    desc_element = div.find("div", class_="courseblockdesc")
    paragraphs = desc_element.find_all("p") if desc_element else []
    desc.append(" ".join(p.get_text(strip=True) for p in paragraphs))

    # Courses numbered 100-199 are flagged True; those below 100 are False.
    upper.append(course_number >= 100)

# "Skills" initially holds the raw description text; later cells clean it and
# replace it with extracted keywords.
df = pd.DataFrame({
    "Course ID": class_id,
    "Course Title": class_title,
    "Upper": upper,
    "Skills": desc
})

In [4]:
def clean_description(text, markers=("Prerequisite:", "Restriction:")):
    """Return the part of a course description that precedes any marker.

    Args:
        text: Raw description string.
        markers: Section labels that introduce trailing text to discard. The
            text is cut at the first occurrence of each marker in turn, so the
            result ends before whichever marker appears earliest. Defaults to
            the "Prerequisite:" and "Restriction:" labels.

    Returns:
        The truncated description with surrounding whitespace removed. Text
        containing none of the markers is returned stripped but otherwise
        unchanged.
    """
    for marker in markers:
        # Keep only what comes before the first occurrence of this marker.
        text = text.partition(marker)[0]
    return text.strip()

In [5]:
# Cut each description at its "Prerequisite:" / "Restriction:" section so only
# the descriptive text is passed on to keyword extraction.
df["Skills"] = df["Skills"].apply(clean_description)

In [6]:
# Shared KeyBERT instance, created on first use. Constructing KeyBERT() loads
# its embedding model, so building one per call repeats that work for every row.
_kw_model = None


def keyword_wrapper(doc, top_n=10):
    """Extract the top keywords from a piece of text with KeyBERT.

    Args:
        doc: Text to extract keywords from.
        top_n: Maximum number of keywords to request from KeyBERT (default 10).

    Returns:
        A list of keyword strings in the order KeyBERT returns them; the
        scores that accompany each keyword are dropped.
    """
    global _kw_model
    if _kw_model is None:
        _kw_model = KeyBERT()  # instantiate the model once and reuse it

    # Words excluded from consideration as keywords.
    stop_words = ["cs", "prerequisite", "grade", "requirement",
                  "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors", "students"]

    scored_keywords = _kw_model.extract_keywords(doc, stop_words=stop_words, top_n=top_n)
    # Each result's first element is the keyword itself.
    return [scored[0] for scored in scored_keywords]

# Apply the keyword extractor to every description, replacing the text in
# "Skills" with its list of keywords, then display the resulting table.
# NOTE: this overwrites the column in place, so re-running this cell alone
# would pass keyword lists (not text) back into keyword_wrapper.
df["Skills"] = df["Skills"].apply(keyword_wrapper)
df

Unnamed: 0,Course ID,Course Title,Upper,Skills
0,COMPSCI 103,Advanced Programming and Problem Solving with C++,True,"[programming, advanced, language, software, sk..."
1,COMPSCI 111,Digital Image Processing,True,"[multimedia, digital, segmentation, graphics, ..."
2,COMPSCI 112,Computer Graphics,True,"[3d, graphics, polygonal, illumination, textur..."
3,COMPSCI 113,Computer Game Development,True,"[2d, interactive, 3d, graphics, game, developm..."
4,COMPSCI 114,Projects in Advanced 3D Computer Graphics,True,"[3d, illumination, graphics, modeling, surface..."
5,COMPSCI 115,Computer Simulation,True,"[simulation, stochastic, basic, distributions,..."
6,COMPSCI 116,Computational Photography and Vision,True,"[photographs, photography, stitching, panorami..."
7,COMPSCI 117,Project in Computer Vision,True,"[3d, photographs, models, vision, tracking, ro..."
8,COMPSCI 118,Introduction to Virtual Reality,True,"[vr, virtual, 3d, software, graphics, platform..."
9,COMPSCI 121,Information Retrieval,True,"[indexing, retrieval, clustering, classifying,..."


In [7]:
# Persist the course table next to the notebook, without the row index.
# "Skills" holds the lists returned by keyword_wrapper at this point, so they
# are presumably written as their string form - verify before reloading.
df.to_csv("UCI.csv", index=False)