## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import time
from tqdm import tqdm

from bs4 import BeautifulSoup
import requests
import re


from keybert import KeyBERT

## Web scrape data

In [2]:
UCR_url = "https://www1.cs.ucr.edu/undergraduate/course-descriptions"

response = requests.get(UCR_url)

if response.status_code == 200:
    
    soup = BeautifulSoup(response.text, "html.parser")

    tables = soup.find_all("table", class_="ui yellow definition striped table")

    header = tables[0].find_all('tr')[0].text.split('\n')[1:4]

    data = [tables[0].find_all('tr')[i].text.split('\n')[1:4] for i in range(1, len(tables[0].find_all('tr')))]

    df = pd.DataFrame(data, columns = header)

    
    display(df.head())

else:
    print('response failed:', response.status_code)

Unnamed: 0,Course,Course Title,Description
0,ENGR 001,Professional Development and Mentoring,"1 Unit, Activity, 30 hours per quarter. Provid..."
1,ENGR 101,Professional Development and Mentoring,"1 Unit, Activity, 30 hours per quarter. Prereq..."
2,ENGR 180W,Technical Communications,"4 Units, Lecture, 3 hours; workshop, 3 hours. ..."
3,CS 005,Introduction to Computer Programming,"4 Units, Lecture, 3 hours; laboratory,2 hours;..."
4,CS 006,Effective Use of the World Wide Web,"4 Units, Lecture, 3 hours; laboratory, 3 hours..."


## Remove non-CS courses and add Upper Div column. Remove prerequisite sentence

In [3]:
df = df[df['Course'].str[:2]==('CS')].reset_index().drop(columns = ['index']) #remove non-cs courses

df['Upper Div'] = df['Course'].str.extract(r'(\d+)')[0].astype(int).apply(lambda x: x >= 100) #Upper div class is 100-199 class


def remove_prereq(doc):
    doc = re.sub(r'^.*?Prerequisite\(s\):.*?(\.|\n)', '', doc, flags=re.DOTALL).strip() #remove prerequisites
    
    #for first class
    doc = re.sub(r'4 Units, Lecture, 3 hours; laboratory,2 hours; individual study, 1 hour\.', '', doc).strip()
    
    return doc



df['Description'] = df['Description'].apply(remove_prereq)

df.head()

Unnamed: 0,Course,Course Title,Description,Upper Div
0,CS 005,Introduction to Computer Programming,An introduction to computer programming for no...,False
1,CS 006,Effective Use of the World Wide Web,Adetailed introduction to the Internet fornon-...,False
2,CS 008,Introduction to Computing,Includes operating system basics (Windows and ...,False
3,CS 009A,Data-Oriented Introduction to Computing I,"Covers computational thinking, problem-solving...",False
4,CS 009B,Data Oriented Introduction to Computing II,Covers advanced programming concepts and algor...,False


## Keyword extraction, timed. Remove irrelevant words and take top 10 keywords

In [4]:
start = time.time() #time it
tqdm.pandas() #time it

def keyword_wrapper(doc):
    kw_model = KeyBERT() #instantiate model
    
    stop_words = ["cs", "prerequisite", "grade", "requirement", 
                  "courses", "instructor", "faculty", "computer", "student", "concurrently", "majors"] #dont consider these words

    return [i[0] for i in kw_model.extract_keywords(doc, stop_words=stop_words, top_n=10)] #top 10 keywords

df['keywords'] = df['Description'].progress_apply(keyword_wrapper) #apply functon
df.head()
end = time.time()
print('Time:', end - start) #print time

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57/57 [02:01<00:00,  2.12s/it]

Time: 121.15577292442322





## No longer need description column

In [5]:
df = df.drop(columns = ['Description'])

## Rearrange columns and save csv

In [6]:
df.head()

Unnamed: 0,Course,Course Title,Upper Div,keywords
0,CS 005,Introduction to Computer Programming,False,"[programming, computing, basic, 010a, introduc..."
1,CS 006,Effective Use of the World Wide Web,False,"[web, internet, engineering, tools, searching,..."
2,CS 008,Introduction to Computing,False,"[credit, 008, 010a, unix, databases, web, basi..."
3,CS 009A,Data-Oriented Introduction to Computing I,False,"[computational, software, cs009m, application,..."
4,CS 009B,Data Oriented Introduction to Computing II,False,"[programming, programs, software, algorithms, ..."


In [7]:
df.columns = ['Course ID', 'Course Title', 'Upper', 'Skills']
df.head()

Unnamed: 0,Course ID,Course Title,Upper,Skills
0,CS 005,Introduction to Computer Programming,False,"[programming, computing, basic, 010a, introduc..."
1,CS 006,Effective Use of the World Wide Web,False,"[web, internet, engineering, tools, searching,..."
2,CS 008,Introduction to Computing,False,"[credit, 008, 010a, unix, databases, web, basi..."
3,CS 009A,Data-Oriented Introduction to Computing I,False,"[computational, software, cs009m, application,..."
4,CS 009B,Data Oriented Introduction to Computing II,False,"[programming, programs, software, algorithms, ..."


In [8]:
df.to_csv('UCR.csv', index=False)