# Working Notebook
Aim: a function that takes in a module code and returns all reviews for that module

In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd
from datetime import datetime
import time

chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")


In [2]:
# Set up Chrome Driver
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [3]:
# Access sample NUSMods course page
base_url = "https://nusmods.com/courses/"
sample_course_code = "CS2040S"
reviews = "#reviews"
sample_url = base_url + sample_course_code + reviews
driver.get(sample_url)

In [5]:
# Retrieve course name
course_name = driver.find_element(By.CSS_SELECTOR, "h1").text.split("\n")[1]

In [6]:
# Find the review iframe and switch to it
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'iframe')))

iframes = driver.find_elements(By.TAG_NAME, 'iframe')

if len(iframes) >= 2:
    # Get the second iframe, rest are for ads
    second_iframe = iframes[1]
    # Print the second iframe's HTML
    print(second_iframe.get_attribute('outerHTML'))
else:
    print("Review Iframe was not found.")

driver._switch_to.frame(second_iframe)

<iframe id="dsq-app711" name="dsq-app711" allowtransparency="true" frameborder="0" scrolling="no" tabindex="0" title="Disqus" width="100%" src="https://disqus.com/embed/comments/?base=default&amp;f=nusmods-prod&amp;t_i=CS2040S&amp;t_u=https%3A%2F%2Fnusmods.com%2Fcourses%2FCS2040S%2Freviews&amp;t_e=CS2040S%20Data%20Structures%20and%20Algorithms&amp;t_d=CS2040SData%20Structures%20and%20Algorithms&amp;t_t=CS2040S%20Data%20Structures%20and%20Algorithms&amp;s_o=default#version=96fa2ba331081de9a1f11b9a1eeb8f14" style="width: 1px !important; min-width: 100% !important; border: none !important; overflow: hidden !important; height: 27406px !important;" horizontalscrolling="no" verticalscrolling="no"></iframe>


In [7]:
# Use BeautifulSoup to parse the HTML
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.prettify())

<html class="js no-touch localstorage sessionstorage contenteditable use-opacity-transitions embed-refresh embed-refresh-v2" dir="ltr" lang="en" style="--publisher-color: rgb(255,81,56); --publisher-color-safe: rgb(255,81,56);">
 <!--<![endif]-->
 <head>
  <title>
   Disqus Comments
  </title>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" name="viewport"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <style>
            border-radius: 3px;
            padding: 10px 15px;
            margin-bottom: 10px;
            background-color: #FFE070;
            color: #A47703;
        }

            color: #A47703;
            font-weight: bold;
        }

        .alert--error p,
            margin-top: 5px;
            margin-bottom: 5px;
        }
  </style>
  <style>
   html {
            overflow: hidden;
        }
        

        #error {
            display: none;
        }

        .clearfix:after {
            content: "";
   

In [8]:
# Create dataframe to store reviews
reviews_df = pd.DataFrame(columns=['Course Code','Course Name','Date','Name','Content'])

In [9]:
# Access post list and individual posts, store data
post_list = soup.find("ul", class_="post-list")
for post in post_list:
    author = post.find("span", class_="author").text

    date_str = post.find("a", class_="time-ago").get('title')
    date = datetime.strptime(date_str, "%A, %B %d, %Y %I:%M %p")

    post_message = post.find("div", class_="post-message").text

    temp_df = pd.DataFrame([[sample_course_code, course_name, date, author, post_message]], 
                           columns=['Course Code','Course Name','Date','Name','Content'])
    
    reviews_df = pd.concat([reviews_df, temp_df], ignore_index=True)

  reviews_df = pd.concat([reviews_df, temp_df], ignore_index=True)


In [10]:
# Export to CSV
reviews_df.to_csv(f"data/{sample_course_code} reviews {datetime.now().date()}.csv", index=False)

# Functions

In [11]:
# Global variables
nusmods_base_url = "https://nusmods.com/courses/"
reviews = "#reviews"

In [12]:
# Set up Chrome Driver
def start_driver() -> webdriver.Chrome:
    service = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    # wait = WebDriverWait(driver, 10)
    # wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'h1')))
    return driver

In [13]:
# Extract course name 
def get_course_name(course_code: str, driver: webdriver.Chrome) -> str:
    url = nusmods_base_url + course_code + reviews
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h1')))
    course_name = driver.find_element(By.CSS_SELECTOR, "h1").text.split("\n")[1]
    return course_name  

In [25]:
# Access given NUSMods course page
# Return course name and driver in correct iframe
def access_webpage(driver: webdriver.Chrome):
    # Find the review iframe and switch to it
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'iframe')))

    iframes = driver.find_elements(By.TAG_NAME, 'iframe')

    if len(iframes) >= 2:
        # Get the second iframe, rest are for ads
        second_iframe = iframes[1]
    else:
        print("Review Iframe was not found.")

    driver._switch_to.frame(second_iframe)
    time.sleep(1) # Wait for iframe to load
    
    return driver

In [42]:
def extract_reviews(driver: webdriver.Chrome) -> pd.DataFrame:
    # Use BeautifulSoup to parse the HTML
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Create dataframe to store reviews
    reviews_df = pd.DataFrame(columns=['Date','Author','Content'])

    post_list = soup.find("ul", class_="post-list")
    for post in post_list:
        author = post.find("span", class_="author").text

        date_str = post.find("a", class_="time-ago").get('title')
        date = datetime.strptime(date_str, "%A, %B %d, %Y %I:%M %p")

        post_message = post.find("div", class_="post-message").text

        temp_df = pd.DataFrame([[date, author, post_message]], 
                            columns=['Date','Author','Content'])
        
        reviews_df = pd.concat([reviews_df, temp_df], ignore_index=True)

    return reviews_df

In [35]:
# Overall function to scrape and export reviews
def scrape_reviews(course_code: str) -> pd.DataFrame:
    driver = start_driver()
    course_name = get_course_name(course_code, driver)
    driver = access_webpage(driver=driver)
    reviews_df = extract_reviews(driver=driver)
    reviews_df.insert(0, 'Course Name', course_name)
    reviews_df.insert(0, 'Course Code', course_code)
    reviews_df.to_csv(f"data/{course_code} Reviews {datetime.now().date()}.csv", index=False)
    driver.quit()
    return reviews_df

In [61]:
scrape_reviews("GEA1000")

  reviews_df = pd.concat([reviews_df, temp_df], ignore_index=True)


Unnamed: 0,Course Code,Course Name,Date,Author,Content
0,HSA1000,Asian Interconnections,2023-12-13 16:40:00,meower,HSA1000 Asian Interconnectionsfun but tiring f...
1,HSA1000,Asian Interconnections,2021-12-21 15:34:00,cookieslushies,this module was an absolute waste of time and ...
2,HSA1000,Asian Interconnections,2024-01-01 21:03:00,George Teo,This review is primarily for non-CHS majors.Wh...
3,HSA1000,Asian Interconnections,2021-12-23 00:24:00,yesclaws,Year taken: AY21/22 Module coordinator : Dr Ra...
4,HSA1000,Asian Interconnections,2023-06-25 00:49:00,soup,AY22/23 Sem 2This module is potentially very f...
5,HSA1000,Asian Interconnections,2023-06-06 21:15:00,កម្ពុជា,Taken in 2022/2023 sem 2 TA: BenjaminI had an ...
6,HSA1000,Asian Interconnections,2023-02-09 14:25:00,Callan Wang,Sem 1 AY22-23Lecture: Prof Clay EatonTutor: Be...
7,HSA1000,Asian Interconnections,2022-11-29 18:52:00,meese,year taken: AY21/22 sem 2tutor: takahiro kamis...
8,HSA1000,Asian Interconnections,2021-12-15 23:37:00,an_anonymous_penguin,This module is pre-allocated to all CHS studen...
9,HSA1000,Asian Interconnections,2023-12-29 11:14:00,secret0825,Taken in: AY 2022/2023 Sem 1Tutor: BenjaminAss...
