In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from keybert import KeyBERT

tqdm.pandas()  # registers Series.progress_apply, used when extracting keywords

# Course-catalog page that lists the COMPSCI courses to scrape.
berk = "https://guide.berkeley.edu/courses/compsci/"

# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() fails loudly on an HTTP error instead of quietly parsing
# an error page into an empty table.
berk_req = requests.get(berk, timeout=30)
berk_req.raise_for_status()
soup = BeautifulSoup(berk_req.text, "html.parser")

# Each course is rendered inside its own <div class="courseblock">.
divs = soup.find_all("div", class_="courseblock")

# Parallel lists, one entry per undergraduate course, in page order.
class_id = []
class_title = []
desc = []
upper = []

for div in divs:
    # Blocks without a heading paragraph are not course entries.
    if not div.find("p", class_="course-heading"):
        continue

    course_code = div.find("span", class_="code")
    course_title = div.find("span", class_="title")
    course_desc = div.find(class_="courseblockdesc")

    # Skip malformed blocks rather than crashing on `None.text`.
    if course_code is None or course_title is None or course_desc is None:
        continue

    # Keep only courses whose detail paragraphs mention "Undergraduate".
    if not any("Undergraduate" in p.text for p in div.find_all("p")):
        continue

    course_id_text = course_code.text.strip()

    # The description is taken from the second line of the block's text
    # (presumably the first line is empty markup padding -- TODO confirm).
    # Fall back to the only line when the text contains no newline.
    desc_lines = course_desc.text.split("\n")
    desc_text = desc_lines[1] if len(desc_lines) > 1 else desc_lines[0]

    # Concatenate every digit in the code ("COMPSCI 61BL" -> 61); a course
    # counts as upper division when that number is at least 100. A code with
    # no digits cannot be classified by number and is treated as lower division.
    digits = "".join(filter(str.isdigit, course_id_text))
    is_upper = bool(digits) and int(digits) >= 100

    class_id.append(course_id_text)
    class_title.append(course_title.text.strip())
    desc.append(desc_text)
    upper.append(is_upper)


df = pd.DataFrame({
    "Course ID": class_id,
    "Course Title": class_title,
    "Course Description": desc,
    "Upper Div": upper,
})


# Lazily created KeyBERT instance shared by every keyword_wrapper() call.
# Constructing KeyBERT() loads its embedding model, so doing that once instead
# of once per row avoids repeating the most expensive step for each document.
_KW_MODEL = None

# Words that are never allowed as keywords (catalog boilerplate).
# NOTE(review): passing an explicit list replaces KeyBERT's default "english"
# stop-word setting rather than extending it -- confirm that is intended.
_KW_STOP_WORDS = ["cs", "prerequisite", "grade", "requirement",
                  "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors"]


def keyword_wrapper(doc):
    """Return up to ten KeyBERT keywords for *doc*.

    Args:
        doc: Text to extract keywords from (here, a course description).

    Returns:
        A list of keyword strings in the order KeyBERT returns them, with the
        scores dropped.
    """
    global _KW_MODEL
    if _KW_MODEL is None:
        _KW_MODEL = KeyBERT()

    scored_keywords = _KW_MODEL.extract_keywords(doc, stop_words=_KW_STOP_WORDS, top_n=10)
    # Each result is a (keyword, score) pair; only the keyword text is kept.
    return [pair[0] for pair in scored_keywords]


# Extract keywords for every course description; progress_apply behaves like
# Series.apply but shows a tqdm progress bar while it runs.
course_descriptions = df['Course Description']
df['keywords'] = course_descriptions.progress_apply(keyword_wrapper)

# Last expression of the cell, so the notebook renders the resulting table.
df


  0%|          | 0/48 [00:00<?, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 48/48 [00:58<00:00,  1.22s/it]

        Course ID                                       Course Title  \
0      COMPSCI C8                        Foundations of Data Science   
1      COMPSCI 10                    The Beauty and Joy of Computing   
2     COMPSCI W10                    The Beauty and Joy of Computing   
3      COMPSCI 36  CS Scholars Seminar: The Educational Climate i...   
4      COMPSCI 39                         Freshman/Sophomore Seminar   
5     COMPSCI 47A         Completion of Work in Computer Science 61A   
6     COMPSCI 47B         Completion of Work in Computer Science 61B   
7     COMPSCI 47C         Completion of Work in Computer Science 61C   
8     COMPSCI 61A  The Structure and Interpretation of Computer P...   
9     COMPSCI 61B                                    Data Structures   
10   COMPSCI 61BL        Data Structures and Programming Methodology   
11    COMPSCI 61C  Great Ideas of Computer Architecture (Machine ...   
12     COMPSCI 70        Discrete Mathematics and Probability Th




In [None]:
import os
import tempfile

# Write the scraped table to a temp directory. "/tmp" is kept whenever it
# exists, so the output location is unchanged on macOS/Linux. On platforms
# without it (e.g. a default Windows setup) the system temp directory is used
# instead of failing on the missing folder.
csv_dir = "/tmp" if os.path.isdir("/tmp") else tempfile.gettempdir()
csv_path = os.path.join(csv_dir, "berkeley_cs_courses.csv")
df.to_csv(csv_path, index=False)  # index=False: omit the row-number column
