In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import time
from tqdm import tqdm

from bs4 import BeautifulSoup
import requests
import re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager


from keybert import KeyBERT

In [60]:
url = "https://registrar.ucla.edu/academics/course-descriptions?search=COM+SCI"


driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

driver.get(url)

time.sleep(1)

rendered_html = driver.page_source

driver.quit()

soup = BeautifulSoup(rendered_html, "html.parser")


upper_div_section = soup.find('div', {'aria-labelledby': 'upper-division-courses-51-1'})
    
lower_div_section = soup.find('div', {'aria-labelledby': 'lower-division-courses-10-1'})

undergrad_courses = []

if lower_div_section:
        course_records = lower_div_section.find_all('div', class_='course-record')
        for record in course_records:
            title_element = record.find('h3')
            description_paragraphs = record.find_all('p')

            if title_element and description_paragraphs:
                full_title = title_element.text.strip()
                parts = full_title.split('.', 1)
                if len(parts) == 2:
                    course_code = "CS " + parts[0].strip()
                    course_title = parts[1].strip()
                else:
                    course_code = "CS Unknown"
                    course_title = full_title
                description = '\n'.join(p.text.strip() for p in description_paragraphs[1:])
                undergrad_courses.append([course_code, course_title, description])

if upper_div_section:
    course_records = upper_div_section.find_all('div', class_='course-record')
    for record in course_records:
        title_element = record.find('h3')
        description_paragraphs = record.find_all('p')

        if title_element and description_paragraphs:
            full_title = title_element.text.strip()
            parts = full_title.split('.', 1)
            if len(parts) == 2:
                course_code = "CS " + parts[0].strip()
                course_title = parts[1].strip()
            else:
                course_code = "CS Unknown"
                course_title = full_title
            description = '\n'.join(p.text.strip() for p in description_paragraphs[1:])
            undergrad_courses.append([course_code, course_title, description])


df = pd.DataFrame(columns = ['Course ID', 'Course Title', 'Description'], data = undergrad_courses)
df.head()


Unnamed: 0,Course ID,Course Title,Description
0,CS 1,Freshman Computer Science Seminar,"Seminar, one hour. Introduction to department ..."
1,CS 19,Fiat Lux Freshman Seminars,"Seminar, one hour. Discussion of and critical ..."
2,CS 30,Principles and Practices of Computing,"Lecture, four hours; discussion, two hours; ou..."
3,CS 31,Introduction to Computer Science I,"Lecture, four hours; discussion, two hours; ou..."
4,CS 32,Introduction to Computer Science II,"Lecture, four hours; discussion, two hours; ou..."


In [61]:
def extract_description(text):
    try:
        parts = text.split('.', 1) 
        if len(parts) < 2: 
          return text
        description_start = parts[1].strip()

        if "grading" in description_start.lower():
            description_parts = description_start.split("grading")
            core_description = description_parts[0].strip()
        else:
            core_description = description_start

        return core_description

    except:
        return text

df['Description'] = df['Description'].apply(extract_description)

In [62]:
df['Upper Div'] = df['Course ID'].str.extract(r'(\d+)')[0].astype(int).apply(lambda x: x >= 100) #Upper div class is 100-199 class

In [63]:
df.head()

Unnamed: 0,Course ID,Course Title,Description,Upper Div
0,CS 1,Freshman Computer Science Seminar,Introduction to department resources and princ...,False
1,CS 19,Fiat Lux Freshman Seminars,Discussion of and critical thinking about topi...,False
2,CS 30,Principles and Practices of Computing,Designed for students in computer science and ...,False
3,CS 31,Introduction to Computer Science I,"Introduction to computer science via theory, a...",False
4,CS 32,Introduction to Computer Science II,Enforced requisite: course 31. Object-oriented...,False


In [64]:
start = time.time() #time it
tqdm.pandas() #time it

def keyword_wrapper(doc):
    kw_model = KeyBERT() #instantiate model
    
    stop_words = ["cs", "prerequisite", "grade", "requirement", 
    "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors",
    "approach", "aspects", "awarded",
    "concepts", "course", "courses", "credit", "design", "fields",
    "foundation", "fundamental", "fundamentals", "introduction", "issues", "level",
    "lower", "major", "methods", "none", "overview", "perspectives",
    "practice", "practices", "principles", "process", "processes",
    "programs", "related", "required", "requirement", "role",
    "skills", "study", "techniques", "tools", "topics", "understanding",
    "upper", "various", "work", "department", "resources", "requisite", "requisites", "enforced", "lecture", "hours"] #dont consider these words

    return [i[0] for i in kw_model.extract_keywords(doc, stop_words=stop_words, top_n=10)] #top 10 keywords

df['keywords'] = df['Description'].progress_apply(keyword_wrapper) #apply functon
df.head()
end = time.time()
print('Time:', end - start) #print time

100%|██████████| 61/61 [01:18<00:00,  1.28s/it]

Time: 78.36068916320801





In [65]:
df.head()

Unnamed: 0,Course ID,Course Title,Description,Upper Div,keywords
0,CS 1,Freshman Computer Science Seminar,Introduction to department resources and princ...,False,"[seminar, engineering, students, principal, ta..."
1,CS 19,Fiat Lux Freshman Seminars,Discussion of and critical thinking about topi...,False,"[intellectual, discovery, thinking, discussion..."
2,CS 30,Principles and Practices of Computing,Designed for students in computer science and ...,False,"[programming, introductory, computers, lists, ..."
3,CS 31,Introduction to Computer Science I,"Introduction to computer science via theory, a...",False,"[programming, software, basic, pointers, abstr..."
4,CS 32,Introduction to Computer Science II,Enforced requisite: course 31. Object-oriented...,False,"[algorithms, software, algorithm, lists, sorti..."


In [66]:
df = df.drop(columns=['Description'])

In [67]:
df['Skills'] = df['keywords']
df = df.drop(columns=['keywords'])

In [68]:
df.to_csv("UCLA.csv", index = False)
