In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import time
from tqdm import tqdm

from bs4 import BeautifulSoup
import requests
import re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager


from keybert import KeyBERT

In [17]:
url = "https://cs.ucsb.edu/education/courses/course-descriptions"

response = requests.get(url)

if response.status_code == 200:
    
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find('table', class_ = "table table-hover table-striped").find_all('td', class_ = "views-field views-field-title")


    urls = ["https://cs.ucsb.edu"+i.find('a')['href'] for i in table]

else:
    print('response failed:', response.status_code)

In [54]:
metadata_keys = ["Prerequisite", "Enrollment Comments", "Repeat Comments"]

data = []

def clean_paragraph(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    cleaned_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if any(sentence.startswith(key) for key in metadata_keys):
            continue
        if sentence:  
            cleaned_sentences.append(sentence)
    return " ".join(cleaned_sentences)

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    under_grad = soup.find_all('div', class_="field--item")
    
    if under_grad[3].text == "Undergraduate":
        course = under_grad[1].text
        title = soup.find('h1', class_="page-header").text.strip()
        desc_tags = soup.find('div', class_="field field--name-field-course-des field--type-text-long field--label-above") \
                        .find('div', class_="field--item") \
                        .find_all("p")
        
        cleaned_descs = []
        for p in desc_tags:
            text = p.get_text(" ", strip=True)
            if any(text.startswith(key) for key in metadata_keys):
                cleaned_text = clean_paragraph(text)
            else:
                cleaned_text = text
            if cleaned_text:
                cleaned_descs.append(cleaned_text)
                
        final_desc = " ".join(cleaned_descs)

        result = [course, title, final_desc]
        data.append(result)

In [56]:
df = pd.DataFrame(columns = ['Course ID', 'Course Title', 'Description'], data=data)

df['Upper Div'] = df['Course ID'].str.extract(r'(\d+)')[0].astype(int).apply(lambda x: x >= 100) #Upper div class is 100-199 class

df.head()

Unnamed: 0,Course ID,Course Title,Description,Upper Div
0,CMPSC 5B,Introduction to Data Science 2,"Students explore the data science lifecycle, i...",False
1,CMPSC 5A,Introduction to Data Science 1,Introduction to data science methods and Pytho...,False
2,CMPSC 8/W8,Introduction to Computer Science,Introduction to computer program development f...,False
3,CMPSC 9,Intermediate Python Programming,Intermediate topics in Computer Science using ...,False
4,CMPSC 16,Problem Solving with Computers I,Fundamental building blocks for solving proble...,False


In [57]:
start = time.time() #time it
tqdm.pandas() #time it

def keyword_wrapper(doc):
    kw_model = KeyBERT() #instantiate model
    
    stop_words = ["cs", "prerequisite", "grade", "requirement", 
    "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors",
    "approach", "aspects", "awarded",
    "concepts", "course", "courses", "credit", "design", "fields",
    "foundation", "fundamental", "fundamentals", "introduction", "issues", "level",
    "lower", "major", "methods", "none", "overview", "perspectives",
    "practice", "practices", "principles", "process", "processes",
    "programs", "related", "required", "requirement", "role",
    "skills", "study", "techniques", "tools", "topics", "understanding",
    "upper", "various", "work"] #dont consider these words

    return [i[0] for i in kw_model.extract_keywords(doc, stop_words=stop_words, top_n=10)] #top 10 keywords

df['keywords'] = df['Description'].progress_apply(keyword_wrapper) #apply functon
df.head()
end = time.time()
print('Time:', end - start) #print time

100%|██████████| 50/50 [01:20<00:00,  1.62s/it]

Time: 80.96031427383423





In [58]:
df = df.drop(columns=['Description'])
df['Skills'] = df['keywords']
df = df.drop(columns=['keywords'])

In [62]:
df.to_csv('UCSB.csv', index =False)