In [None]:
! pip install beautifulsoup4
! pip install lxml
! pip install requests

In [119]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 

# **Parse from Coursicle**

In [None]:
! pip install my_fake_useragent
import my_fake_useragent as ua

In [121]:
# Get overall course information page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting it surface
# later as an IndexError on an empty `container`.
page = requests.get("https://www.coursicle.com/vanderbilt/courses/CS/", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, "lxml")
# Course tiles are the <a class="tileElement"> links inside #tileContainer.
container = soup.find_all(id="tileContainer")
courses = container[0].find_all("a", class_="tileElement")

In [122]:
# Build one row per course tile: the whitespace-split span text, followed by
# the div text and the absolute URL of the course's own page.
# The last tile is left out (slice [:-1]).
base_url = "https://www.coursicle.com/vanderbilt/courses/CS/"
course_list = [
  tile.span.text.split() + [tile.div.text, base_url + tile["href"]]
  for tile in courses[:-1]
]

print(course_list[2])

['CS', '1101', 'Programming and Problem Solving', 'https://www.coursicle.com/vanderbilt/courses/CS/1101/']


In [123]:
# Set user_agent generator
# (module-level object; generate_headers() calls user_agent.random() on it)
user_agent = ua.UserAgent()

In [124]:
# function to generate headers
def generate_headers():
  """Return a new dict of browser-like HTTP request headers.

  Every entry is a fixed string except "User-Agent", which is taken from
  ``user_agent.random()`` (the module-level generator) on each call and
  stripped of surrounding whitespace.

  NOTE(review): "Accept-Encoding" advertises "br"; presumably the HTTP client
  used with these headers can decode Brotli responses -- confirm.
  """
  agent = user_agent.random().strip()
  return {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en,zh-CN;q=0.9,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.coursicle.com",
    "Referer": "https://colab.research.google.com/",
    "sec-ch-ua": "\"Chromium\";v=\"106\", \"Google Chrome\";v=\"106\", \"Not;A=Brand\";v=\"99\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": agent,
  }

In [None]:
# Parse course detail from every course page.
# Each output row is the course row plus exactly two extra fields, in this
# order: a list of professor names (empty when the page lists none) and the
# semesters text (None when the page has none). Keeping the row length fixed
# stops the semesters text from sliding into the professors position when a
# page has no professor section.
new_course_list = []
for item in course_list:
  # The timeout keeps one stalled page from blocking the whole scrape.
  page_in = requests.get(item[3], headers=generate_headers(), timeout=30).text
  soup2 = BeautifulSoup(page_in, "lxml")
  detail_container = soup2.body.find_all("div", id="subItemContainer")
  detail_courses = detail_container[0].find_all("div", class_="subItemLabel")
  professors_found = []
  terms = None
  for each in detail_courses:
    # Labels are matched without their first letter so that both the
    # capitalised and lower-case spellings are accepted.
    if "rofessor" in each.text:
      professors_found = [
        link.text for link in each.parent.find_all("a", class_="professorLink")
      ]
    if "emester" in each.text:
      terms = each.parent.find_all("div", class_="subItemContent")[0].text
  new_course_list.append(item + [professors_found, terms])

In [None]:
# Preview the first ten scraped rows
new_course_list[:10]

In [None]:
# Tabulate the scraped rows; the detail-page link is not kept as a column.
course_columns = ["Subject", "Number", "Name", "Link", "Professors", "Frequency"]
df = pd.DataFrame(new_course_list, columns=course_columns)
df = df.drop(columns="Link")

In [None]:
# Special topics from Fall 2021 to Spring 2023
# Hand-entered rows, one per special-topics offering, laid out as
# [subject, number, title, [professors], terms] so they line up with the
# columns of the course DataFrame they are later appended to.
# NOTE(review): the six-digit numbers look like a two-digit prefix placed in
# front of 3891/3892 to keep each topic distinct -- confirm the numbering scheme.
special = [
    ['CS', '103891', 'Special Topics - Numerical Methods for CS', ['David Hyde'], 'Fall 2022'],
    ['CS', '113891', 'Special Topics - Scalable Microservices', ['Douglas Schmidt'], 'Spring 2023'],
    ['CS', '123891', 'Special Topics - The Algorithms of Robotics', ['Jie Ying Wu'], 'Spring 2023, Spring 2022'],
    ['CS', '133891', 'Special Topics - Computational Creativity', ['Douglas Fisher'], 'Spring 2023'],
    ['CS', '143891', 'Special Topics - Reinforcement Learning', ['Gautam Biswas'], 'Fall 2022, Fall 2021'],
    ['CS', '153891', 'Special Topics - Mach Lrn / Nat Lang Proc Hlthc', ['Zhijun Yin'], 'Spring 2023, Spring 2022'],
    ['CS', '163891', 'Special Topics - Reverse Engineering', ['Daniel Balasubramanian'], 'Spring 2022'],
    ['CS', '173891', 'Special Topics - Computing and the Environment', ['Douglas Fisher'], 'Fall 2021'],
    ['CS', '183891', 'Special Topics - Fndatns Human/Computer Intract', ['Shilo Anders'], 'Fall 2021'],
    ['CS', '193891', 'Special Topics - Network Analysis in Healthcare', ['You Chen'], 'Fall 2022, Fall 2021'],
    ['CS', '123892', 'Special Topics - Projects in Machine Learning', ['Ipek Oguz'], 'Spring 2023, Spring 2022'],
    ['CS', '143892', 'Special Topics - Autonomous Vehicles', ['Laine Forrest'], 'Spring 2023, Spring 2022'],
]

In [None]:
# Keep only the rows whose Number sorts below "5000" (a string comparison) and
# that are not the scraped 3891/3892 special-topics entries, whose course info
# is incorrect.
is_special_topic = df["Number"].isin(["3891", "3892"])
is_below_5000 = df["Number"] < "5000"
df_new = df.loc[~is_special_topic & is_below_5000]
df_new

In [None]:
# Append correct course info of special topics.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; concatenating the two frames yields the same rows.
special_df = pd.DataFrame(special, columns=df_new.columns.values.tolist())
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# "index" column, which is then also written to Course.csv -- confirm that
# this column is wanted.
df_new_full = pd.concat([df_new, special_df]).reset_index()
df_new_full

In [None]:
# Write the course table to Course.csv, leaving out the Professors column.
course_table = df_new_full.drop(columns="Professors")
course_table.to_csv("Course.csv", index=False)

In [None]:
# Collect the distinct professor names found across every scraped course row
# (position 4 of each row holds that course's professor list).
professors = {name for row in new_course_list for name in row[4]}

all_professor = [*professors]
len(all_professor)

70

In [None]:
# Write the professor names as a single "Name" column, without an index column
df2 = pd.DataFrame({"Name": all_professor})
df2.to_csv("Professor_name.csv", index=False)

In [None]:
# Generate teaching information of undergraduate courses:
# one [subject, number, professor] row per (course, professor) pair.
teaching = []
for c in new_course_list:
  # Skip the scraped special-topics rows and graduate courses. ">=" mirrors
  # the `Number < "5000"` filter applied to the course table, so a course
  # numbered exactly 5000 is excluded in both places.
  if c[1] in ('3891', '3892') or c[1] >= '5000':
    continue
  # The loop variable is deliberately not named `prof`, so re-running this
  # cell cannot overwrite the `prof` DataFrame loaded later in the notebook.
  for prof_name in c[4]:
    teaching.append(c[:2] + [prof_name])

teaching[:10]

In [None]:
# Add special topics in teaching: one [subject, number, professor] row for
# every professor listed on a topic, so a topic with several professors still
# produces three-element rows.
for each in special:
  for special_prof in each[3]:
    row = each[:2] + [special_prof]
    # Skip rows that are already present so re-running this cell does not
    # append the special topics a second time.
    if row not in teaching:
      teaching.append(row)
len(teaching)

146

In [None]:
# Write the teaching rows (still keyed by professor name) to Teaching_name.csv
teaching_columns = ["Subject", "Course_no", "Professor"]
df3 = pd.DataFrame(teaching, columns=teaching_columns)
df3.to_csv("Teaching_name.csv", index=False)

# **Parse from RateMyProfessors (RMP)**

In [None]:
! pip install selenium
! pip install packaging
! pip install webdriver_manager

import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

In [None]:
# Load the professor names to look up.
# NOTE(review): this reads ./Data/Professor_name.csv, while the cell that
# generates the list writes Professor_name.csv to the working directory --
# presumably the file is moved into ./Data by hand; confirm.
prof = pd.read_csv("./Data/Professor_name.csv")

In [63]:
# Start a Chrome session: the browser gets the 'lang=en' argument, and
# webdriver_manager supplies the chromedriver executable path.
option = webdriver.ChromeOptions()
option.add_argument('lang=en')
service = ChromeService(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=option)

In [None]:
# Get RMP scores; might have university mismatch; need inspection afterward.
# For each professor, the first <a> whose class contains "Card" is recorded as
# [href, text fragment, text fragment, ...]. Professors for whom no such card
# is found are left out of rmp_scores.
rmp_scores = []
for person in prof.Name:
    # Whitespace in the name is replaced with %20 for the query string.
    xp = re.sub("\\s+", "%20", person)
    url = "https://www.ratemyprofessors.com/search/teachers?query=" + xp + "&sid=4002"
    driver.get(url)
    sleep(3)  # presumably gives the page time to finish rendering -- confirm

    lxml_soup = BeautifulSoup(driver.page_source, 'lxml')
    body = lxml_soup.find("div", id="root")
    if body is None:
        # find() returns None when the element is missing; skip this
        # professor instead of crashing the whole scrape.
        continue

    for card in body.find_all("a"):
        classes = card.get_attribute_list("class")
        is_card = any(cla is not None and "Card" in cla for cla in classes)
        # A matching link without a <div> child has no text to collect.
        if not is_card or card.div is None:
            continue
        temp = [card["href"]]
        temp.extend(text.strip() for text in card.div.strings if text.strip() != "")
        print(person)
        rmp_scores.append(temp)
        break  # only the first matching card is kept

In [93]:
# Clean the parsed data: each cleaned row is the part of the href after its
# first "=" followed by the text fragments at positions 2, 4, 6 and 9.
prof_rmp_meta_clean = [
    [raw[0][raw[0].index("=") + 1:]] + [raw[pos] for pos in (2, 4, 6, 9)]
    for raw in rmp_scores
]

In [96]:
# Column names for the cleaned RMP rows, in the order the cleaned rows hold them
score_schema = ["Tid", "Over_rate", "Name", "college", "Diff_rate"]
df_score_2 = pd.DataFrame(prof_rmp_meta_clean, columns=score_schema)
df_score_2.to_csv("Prof_clean1.csv", index=False)

In [None]:
# Reload the cleaned RMP rows from disk and show the professor names in them
ratings1 = pd.read_csv("Prof_clean1.csv")
ratings1.Name

In [None]:
# Find those professors without RMP card: names in `prof` that do not occur
# in the Name column of the reloaded ratings.
see = set(ratings1.Name)
not_appear = [name for name in prof.Name if name not in see]
not_appear

In [107]:
# Generate professor csv: professors without an RMP card get a placeholder row
# (empty Tid, zero ratings) in the score_schema column order.
no_app = [["", 0, each, "Vanderbilt University", 0] for each in not_appear]

# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; concatenating the two frames yields the same rows.
placeholder_rows = pd.DataFrame(no_app, columns=score_schema)
df_score_new = pd.concat([ratings1, placeholder_rows]).reset_index(drop=True)
df_score_new = df_score_new.drop("college", axis=1)
df_score_new

Unnamed: 0,Tid,Over_rate,Name,Diff_rate
0,198017.0,4.1,Xenofon Koutsoukos,2.0
1,2357581.0,1.3,Maithilee Kunda,3.3
2,2750807.0,5.0,Jonathan Sprinkle,2.0
3,2522286.0,4.6,Waseem Abbas,2.6
4,2827524.0,0.0,Mudassir Shabbir,0.0
...,...,...,...,...
66,,0.0,Edward Stringfellow,0.0
67,,0.0,Janos Sztipanovits,0.0
68,,0.0,Bradley Malin,0.0
69,,0.0,Meiyi Ma,0.0


In [108]:
# Write the final professor table to Professor.csv without the row index.
# NOTE(review): the notebook's displayed df_score_new output includes an
# "Unnamed: 0" column, which index=False does not remove -- confirm whether
# that column belongs in Professor.csv.
df_score_new.to_csv("Professor.csv", index=False)

# **Build the Teaching Table**

In [114]:
# Reload the professor and teaching tables and start an empty name -> Tid map.
# NOTE(review): both files are read from ./Data/, while the cells that create
# Professor.csv and Teaching_name.csv write to the working directory --
# presumably the files are moved into ./Data by hand; confirm.
new_prof = pd.read_csv("./Data/Professor.csv")
new_teaching = pd.read_csv("./Data/Teaching_name.csv")
prof_tid = dict()

In [None]:
# Map every professor name to its Tid (a later duplicate name overwrites an
# earlier one).
prof_tid.update(zip(new_prof["Name"], new_prof["Tid"]))
prof_tid

In [None]:
# Replace the professor name in each teaching row with that professor's Tid;
# a name missing from prof_tid raises KeyError.
teaching_no_name = [
    [subject, course_no, prof_tid[professor]]
    for subject, course_no, professor in zip(
        new_teaching["Subject"], new_teaching["Course_no"], new_teaching["Professor"]
    )
]
teaching_no_name

In [117]:
# Write the (Subject, Course_no, Tid) teaching table to Teaching.csv
teaching_id_columns = ["Subject", "Course_no", "Tid"]
xx = pd.DataFrame(teaching_no_name, columns=teaching_id_columns)
xx.to_csv("Teaching.csv", index=False)