In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import time
from tqdm import tqdm

from bs4 import BeautifulSoup
import requests
import re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager


from keybert import KeyBERT

In [29]:
UCM_home_url = "https://catalog.ucmerced.edu/content.php?filter%5B27%5D=CSE&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=23&expand=&navoid=2517&search_database=Filter#acalog_template_course_filter"


response = requests.get(UCM_home_url)

if response.status_code == 200:

    soup = BeautifulSoup(response.text, "html.parser")

    links = [soup.find_all("td", class_="width")[i].find_all('a')[0] for i in range(len(soup.find_all("td", class_="width")))]


    coids = [links[i]['href'][-5:] for i in range(len(links))]

    full_links = [f'https://catalog.ucmerced.edu/preview_course_nopop.php?catoid=23&coid={course}' for course in coids]
    
else:
    print('Request failed:', response.status_code)

In [30]:
data = []
for url in full_links:
    resposne = requests.get(url)
    if response.status_code == 200:

        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

        driver.get(url)

        time.sleep(1)

        rendered_html = driver.page_source
        
        driver.quit()

        soup = BeautifulSoup(rendered_html, "html.parser")


        soup = soup.find('td', class_ = 'block_content')

        header = soup.find("h1", id="course_preview_title")

        header_text = header.get_text(strip=True)
        course_code, course_title = [part.strip() for part in header_text.split(":", 1)]

        course_description = ""
        for br in soup.find_all("br"):
            next_text = br.next_sibling
            if next_text and isinstance(next_text, str):
                cleaned = next_text.strip()
                if cleaned and "Unit" not in cleaned:
                    course_description = cleaned
                    break

        result = [course_code, course_title, course_description]
        data.append(result)

    
    else:
        print('Request failed:', response.status_code)

df = pd.DataFrame(columns = ['Course ID', 'Course Title', 'Description'], data=data)
df.head()

Unnamed: 0,Course ID,Course Title,Description
0,CSE 005,Introduction to Computer Applications,Presents the use of computers to control infor...
1,CSE 015,Discrete Mathematics,Explores basic concepts of discrete mathematic...
2,CSE 019,Introduction to Computing,Presents the basics of programming to a studen...
3,CSE 022,Introduction to Programming,"Introduces students to programming, computatio..."
4,CSE 024,Advanced Programming,Intended for students with basic to intermedia...


In [None]:
df['Upper Div'] = df['Course ID'].str.extract(r'(\d+)')[0].astype(int).apply(lambda x: x >= 100) #Upper div class is 100-199 class
df.head()

Unnamed: 0,Course ID,Course Title,Description,Upper Div
0,CSE 005,Introduction to Computer Applications,Presents the use of computers to control infor...,False
1,CSE 015,Discrete Mathematics,Explores basic concepts of discrete mathematic...,False
2,CSE 019,Introduction to Computing,Presents the basics of programming to a studen...,False
3,CSE 022,Introduction to Programming,"Introduces students to programming, computatio...",False
4,CSE 024,Advanced Programming,Intended for students with basic to intermedia...,False
5,CSE 030,Data Structures,"Focuses on the design, analysis, and implement...",False
6,CSE 031,Computer Organization and Assembly Language,Exposes students to the underlying structure o...,False
7,CSE 095,Lower Division Undergraduate Research,Supervised research.,False
8,CSE 098,Lower Division Directed Group Study,Repeats Allowed for Credit: 99,False
9,CSE 099,Lower Division Individual Study,Repeats Allowed for Credit: 99,False


In [33]:
start = time.time() #time it
tqdm.pandas() #time it

def keyword_wrapper(doc):
    kw_model = KeyBERT() #instantiate model
    
    stop_words = ["cs", "prerequisite", "grade", "requirement", 
    "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors",
    "approach", "aspects", "awarded",
    "concepts", "course", "courses", "credit", "design", "fields",
    "foundation", "fundamental", "fundamentals", "introduction", "issues", "level",
    "lower", "major", "methods", "none", "overview", "perspectives",
    "practice", "practices", "principles", "process", "processes",
    "programs", "related", "required", "requirement", "role",
    "skills", "study", "techniques", "tools", "topics", "understanding",
    "upper", "various", "work"] #dont consider these words

    return [i[0] for i in kw_model.extract_keywords(doc, stop_words=stop_words, top_n=10)] #top 10 keywords

df['keywords'] = df['Description'].progress_apply(keyword_wrapper) #apply functon
df.head()
end = time.time()
print('Time:', end - start) #print time

100%|██████████| 43/43 [00:56<00:00,  1.32s/it]

Time: 56.921117067337036





In [34]:
df= df[['Course ID',	'Course Title',	'Upper Div',	'keywords']]
df.columns = ['Course ID', 'Course Title', 'Upper', 'Skills']
df.head()

Unnamed: 0,Course ID,Course Title,Upper,Skills
0,CSE 005,Introduction to Computer Applications,False,"[programming, computers, data, security, infor..."
1,CSE 015,Discrete Mathematics,False,"[mathematics, discrete, disciplines, basic, gr..."
2,CSE 019,Introduction to Computing,False,"[programming, basics, arrays, variables, stude..."
3,CSE 022,Introduction to Programming,False,"[programming, python, students, teaching, comp..."
4,CSE 024,Advanced Programming,False,"[programming, pointers, basic, students, langu..."


In [35]:
df.to_csv('UC_Merced.csv', index=False)