In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import time
from tqdm import tqdm

from bs4 import BeautifulSoup
import requests
import re


from keybert import KeyBERT

In [59]:
UCR_url = "https://catalog.ucsd.edu/courses/CSE.html"

response = requests.get(UCR_url)

if response.status_code == 200:
    
    soup = BeautifulSoup(response.text, "html.parser")

    div = soup.find('div', class_ = "col-md-12 blank-slate")

    courses = [i.text.strip().split('.') for i in div.find_all('p', class_ = 'course-name')]
    courses = [course for course in courses if int(re.search(r'\d+', course[0]).group()) < 200]

    for course in courses:
        course[1] = re.sub(r'\s*\(\d+.*$', '', course[1])
    
    descriptions = [i.text for i in div.find_all('p', class_ = 'course-descriptions')]

    filtered_descriptions = [
    description.split('Prerequisites')[0]
    for name, description in zip(courses, descriptions)
    if int(re.search(r'\d+', name[0]).group()) < 200
]

    data = [courses[i] +  [filtered_descriptions[i]] for i in range(len(courses))]

    
    df = pd.DataFrame(columns = ['Course ID', 'Course Title', 'Description'], data=data)
    display(df)

else:
    print('response failed:', response.status_code)

Unnamed: 0,Course ID,Course Title,Description
0,CSE 3,Fluency in Information Technology,Introduces the concepts and skills necessary t...
1,CSE 4GS,Mathematical Beauty in Rome,Exploration of topics in mathematics and engin...
2,CSE 6GS,Mathematical Beauty in Rome Lab,Companion course to CSE 4GS where theory is ap...
3,CSE 6R,Introduction to Computer Science and Object-O...,An introduction to computer science and progra...
4,CSE 8A,Introduction to Programming and Computational...,Introductory course for students interested in...
...,...,...,...
90,CSE 197,Field Study in Computer Science and Engineering,Directed study accompanying full-time on-site ...
91,CSE 197C,Cooperative Practicum in Computer Science and...,Directed study accompanying full-time on-site ...
92,CSE 198,Directed Group Study,Computer science and engineering topics whose ...
93,CSE 199,Independent Study for Undergraduates,Independent reading or research by special arr...


In [60]:
df['Upper Div'] = df['Course ID'].str.extract(r'(\d+)')[0].astype(int).apply(lambda x: x >= 100) #Upper div class is 100-199 class
df.head()

Unnamed: 0,Course ID,Course Title,Description,Upper Div
0,CSE 3,Fluency in Information Technology,Introduces the concepts and skills necessary t...,False
1,CSE 4GS,Mathematical Beauty in Rome,Exploration of topics in mathematics and engin...,False
2,CSE 6GS,Mathematical Beauty in Rome Lab,Companion course to CSE 4GS where theory is ap...,False
3,CSE 6R,Introduction to Computer Science and Object-O...,An introduction to computer science and progra...,False
4,CSE 8A,Introduction to Programming and Computational...,Introductory course for students interested in...,False


In [61]:
start = time.time() #time it
tqdm.pandas() #time it

def keyword_wrapper(doc):
    kw_model = KeyBERT() #instantiate model
    
    stop_words = ["cs", "prerequisite", "grade", "requirement", 
    "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors",
    "approach", "aspects", "awarded",
    "concepts", "course", "courses", "credit", "design", "fields",
    "foundation", "fundamental", "fundamentals", "introduction", "issues", "level",
    "lower", "major", "methods", "none", "overview", "perspectives",
    "practice", "practices", "principles", "process", "processes",
    "programs", "related", "required", "requirement", "role",
    "skills", "study", "techniques", "tools", "topics", "understanding",
    "upper", "various", "work"] #dont consider these words

    return [i[0] for i in kw_model.extract_keywords(doc, stop_words=stop_words, top_n=10)] #top 10 keywords

df['keywords'] = df['Description'].progress_apply(keyword_wrapper) #apply functon
df.head()
end = time.time()
print('Time:', end - start) #print time

100%|██████████| 95/95 [02:17<00:00,  1.44s/it]

Time: 137.26705980300903





In [62]:
df = df.drop(columns=['Description'])
df['Skills'] = df['keywords']
df = df.drop(columns=['keywords'])

In [63]:
df.to_csv('UCSD.csv', index =False)