In [22]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

In [23]:
# Browser session setup. The driver binary is referenced by a bare relative
# file name, so it is presumably resolved from the directory the notebook is
# run in (or PATH) -- TODO confirm for your environment.
service = Service(executable_path="chromedriver.exe")
# One Chrome session, shared as a module-level global by every function below.
driver = webdriver.Chrome(service=service)
# Shared 10-second explicit wait. NOTE(review): no active (non-commented) code
# in the visible cells uses this object; wait_for_manual_login() builds its own.
wait = WebDriverWait(driver, 10)  # Define globally for reuse

In [24]:
def wait_for_manual_login(timeout=60):
    """Block until the user has finished logging in by hand.

    Login is considered complete once an element with id ``rcboTerm_Input``
    is present on the page currently loaded in the global ``driver``.

    Args:
        timeout: Maximum number of seconds to wait for that element.
            Defaults to 60, the value that used to be hard-coded.

    Raises:
        SystemExit: If the element does not appear within ``timeout``. The
            browser is closed first, and the exit status is 1.
    """
    print("Please log in manually and complete the Microsoft Authenticator process.")
    # Use a dedicated local name so the module-level ``wait`` is not shadowed.
    login_wait = WebDriverWait(driver, timeout)
    try:
        login_wait.until(EC.presence_of_element_located((By.ID, "rcboTerm_Input")))
    except TimeoutException:
        print("Login failed. Could not detect the required page element.")
        driver.quit()
        # ``exit()`` is an interactive-shell helper: it is not guaranteed to
        # exist outside an interactive session and, in a notebook, it can shut
        # the kernel down. Raising SystemExit directly is the portable form and
        # reports a non-zero status for the failure.
        raise SystemExit(1)
    print("Login successful! Proceeding...")

In [25]:
# # [Disabled cell] Smoke test: scrape one ClassDetails page and save it to TestClassDetails.csv
# def test_scrape_class_details():
#     test_url = "https://boss.intranet.smu.edu.sg/ClassDetails.aspx?SelectedAcadTerm=2420&SelectedClassNumber=1580"
#     csv_filename = "TestClassDetails.csv"
#     print("Starting test scrape...")

#     # Open CSV file for writing
#     with open(csv_filename, "w", newline="", encoding="utf-8") as file:
#         writer = csv.writer(file)
#         headers = ["Term", "Course Code", "Section", "Description", "Grading Basis"]
#         for i in range(1, 4):  # Dynamic columns for up to 3 classes
#             headers.extend([f"class{i}_day", f"class{i}_starttime", f"class{i}_venue"])
#         writer.writerow(headers)

#         driver.get(test_url)
#         time.sleep(2)  # Allow time for page load

#         try:
#             # Extract key elements
#             wait = WebDriverWait(driver, 10)
#             course_header = wait.until(EC.presence_of_element_located((By.ID, "lblClassInfoHeader"))).text
#             description = driver.find_element(By.ID, "lblClassSection").text
#             term = driver.find_element(By.ID, "lblClassInfoSubHeader").text
#             grading_basis = driver.find_element(By.ID, "lblGradingBasis").text

#             # Split course code and section
#             course_code, section = [item.strip() for item in course_header.split('-')]

#             # Extract meeting details
#             class_details = []
#             rows = driver.find_elements(By.CSS_SELECTOR, "#RadGrid_MeetingInfo_ctl00 tr.rgRow, #RadGrid_MeetingInfo_ctl00 tr.rgAltRow")
#             for row in rows:
#                 cells = row.find_elements(By.TAG_NAME, "td")
#                 if cells and cells[0].text == "CLASS":
#                     class_details.append({
#                         "day": cells[3].text,
#                         "start_time": cells[4].text,
#                         "venue": cells[6].text
#                     })

#             # Prepare row data
#             row_data = [term, course_code, section, description, grading_basis]
#             for detail in class_details[:3]:  # Include up to 3 classes
#                 row_data.extend([detail["day"], detail["start_time"], detail["venue"]])

#             # Pad missing columns
#             for _ in range(len(class_details), 3):
#                 row_data.extend(["", "", ""])

#             # Write to CSV
#             writer.writerow(row_data)
#             print(f"Test data successfully written to {csv_filename}!")

#         except Exception as e:
#             print(f"Error occurred: {e}")

In [26]:
# # [Disabled cell] Main execution for the smoke test: log in, run test_scrape_class_details(), quit the browser
# try:
#     # Step 1: Navigate and wait for manual login
#     driver.get("https://boss.intranet.smu.edu.sg/OverallResults.aspx")
#     wait_for_manual_login()

#     # Step 2: Run the test scrape function
#     test_scrape_class_details()

# finally:
#     driver.quit()
#     print("Test completed!")

In [27]:
def _split_course_header(course_header):
    """Split a class header into ``(course_code, section)`` at its last hyphen."""
    code, hyphen, section = course_header.rpartition("-")
    if not hyphen:
        # Same outcome as before for a header without a separator: the caller's
        # ``except`` reports it and the record is skipped.
        raise ValueError(f"Unexpected class header format: {course_header!r}")
    return code.strip(), section.strip()


def scrape_class_details(ay, term_code, class_number, csv_writer):
    """Scrape one ClassDetails page and append it to ``csv_writer`` as one row.

    The page is loaded in the global ``driver``. The row layout is:
    class number, ``f"{ay}{term_code}"``, term label, course code, section,
    description, grading basis, then three (day, start time, venue) triples
    for CLASS meetings (blank-padded), then exam start date, day and start time.

    Args:
        ay: Academic-year prefix placed before ``term_code`` in the URL.
        term_code: Term suffix appended to ``ay`` to form ``SelectedAcadTerm``.
        class_number: Integer class number; zero-padded to 4 digits in the URL.
        csv_writer: Object with a ``writerow`` method (e.g. ``csv.writer``).

    Returns:
        ``True`` if a row was written, ``None`` if the page source contains
        "No record found", and ``False`` if scraping raised an exception
        (the error is printed, not re-raised).
    """
    url = f"https://boss.intranet.smu.edu.sg/ClassDetails.aspx?SelectedClassNumber={class_number:04}&SelectedAcadTerm={ay}{term_code}&SelectedAcadCareer=UGRD"
    driver.get(url)

    # Cheap check on the raw page source before touching any element.
    if "No record found" in driver.page_source:
        return None

    max_class_slots = 3  # the CSV has columns for at most three CLASS meetings

    try:
        # Extract course details
        course_header = driver.find_element(By.ID, "lblClassInfoHeader").text
        description = driver.find_element(By.ID, "lblClassSection").text
        term = driver.find_element(By.ID, "lblClassInfoSubHeader").text
        grading_basis = driver.find_element(By.ID, "lblGradingBasis").text

        # A plain split('-') unpacked into two names fails as soon as the
        # header holds more than one hyphen, which silently dropped the whole
        # record. Splitting at the last hyphen keeps hyphenated course codes.
        # NOTE(review): assumes the section part itself contains no hyphen.
        course_code, section = _split_course_header(course_header)

        # Extract meeting and exam details
        class_details = []
        exam_details = {"exam_startdate": "", "exam_day": "", "exam_starttime": ""}

        rows = driver.find_elements(By.CSS_SELECTOR, "#RadGrid_MeetingInfo_ctl00 tr.rgRow, #RadGrid_MeetingInfo_ctl00 tr.rgAltRow")
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            if not cells:
                continue
            row_type = cells[0].text  # read once; decides how the row is used
            if row_type == "CLASS":
                class_details.append({
                    "day": cells[3].text,
                    "start_time": cells[4].text,
                    "venue": cells[6].text
                })
            elif row_type == "EXAM":
                # If several EXAM rows exist, the last one wins.
                exam_details["exam_startdate"] = cells[1].text
                exam_details["exam_day"] = cells[3].text
                exam_details["exam_starttime"] = cells[4].text

        # Identification columns first: class number and SelectedAcadTerm.
        row_data = [class_number, f"{ay}{term_code}", term, course_code, section, description, grading_basis]

        # Class meetings: keep the first three, pad the rest with blanks so
        # every row has the same number of columns.
        for detail in class_details[:max_class_slots]:
            row_data.extend([detail["day"], detail["start_time"], detail["venue"]])
        for _ in range(len(class_details), max_class_slots):
            row_data.extend(["", "", ""])

        # Exam details
        row_data.extend([
            exam_details["exam_startdate"],
            exam_details["exam_day"],
            exam_details["exam_starttime"]
        ])

        csv_writer.writerow(row_data)
        print(f"Scraped: AY{ay}, Term {term_code}, Class Number {class_number:04}")
        return True

    except Exception as e:
        # Best effort: report and let the caller move on to the next class.
        print(f"Error scraping Class Number {class_number:04}, AY{ay}, Term {term_code}: {e}")
        return False

In [28]:
def main(
    ay_list=range(21, 25),  # AY2021 to AY2024
    already_scraped=frozenset({(21, "10"), (21, "20"), (21, "31"), (21, "32")}),
    max_consecutive_misses=300,
    first_class_number=1000,
):
    """Scrape every class of every term into one CSV file per term.

    For each ``(ay, term_code)`` pair not listed in ``already_scraped``, a file
    named ``20{ay}-20{ay+1}_{term}AddedInfo.csv`` is opened in write mode (an
    existing file of that name is overwritten), a header row is written, and
    class numbers are tried one after another starting at
    ``first_class_number``. A term ends once ``scrape_class_details`` has
    returned a falsy result ``max_consecutive_misses`` times in a row; both
    "No record found" pages and scraping errors count as a miss.

    The global ``driver`` is always closed when this function exits, whether
    it finishes normally or an exception propagates.

    Args:
        ay_list: Iterable of two-digit academic-year prefixes.
        already_scraped: Collection of ``(ay, term_code)`` pairs to skip.
        max_consecutive_misses: Misses in a row that end a term.
        first_class_number: Class number each term starts from.
    """
    term_mapping = {"10": "T1", "20": "T2", "31": "T3A", "32": "T3B"}

    headers = ["SelectedClassNumber", "SelectedAcadTerm", "Term", "Course Code", "Section", "Description", "Grading Basis"]
    for i in range(1, 4):
        headers.extend([f"class{i}_day", f"class{i}_starttime", f"class{i}_venue"])
    headers.extend(["exam_startdate", "exam_day", "exam_starttime"])

    try:
        for ay in ay_list:
            for term_code, term_name in term_mapping.items():
                # One lookup replaces the previous copy-pasted skip blocks.
                if (ay, term_code) in already_scraped:
                    print(f"Skipping AY{ay}, Term {term_code} as it has already been scraped.")
                    continue

                filename = f"20{ay}-20{ay+1}_{term_name}AddedInfo.csv"
                print(f"Starting scraping for file: {filename}")

                with open(filename, "w", newline="", encoding="utf-8") as file:
                    writer = csv.writer(file)
                    writer.writerow(headers)

                    # The default start is 1000 (an older comment claimed 0001,
                    # but the code has always started at 1000).
                    class_number = first_class_number
                    no_record_count = 0  # consecutive falsy results

                    while no_record_count < max_consecutive_misses:
                        if scrape_class_details(ay, term_code, class_number, writer):
                            no_record_count = 0  # a hit resets the streak
                        else:
                            no_record_count += 1
                        class_number += 1

                    print(f"{max_consecutive_misses} consecutive 'No record found' reached. Moving to next term.")
    finally:
        # Close the browser even when scraping or file I/O fails part-way.
        driver.quit()

    print("Scraping completed!")

# Entry point for the whole scrape. The captured output below this cell shows
# the guard is satisfied when the cell is executed in the notebook.
if __name__ == "__main__":
    # Load a page of the site first so the user can log in by hand in the
    # opened browser window.
    driver.get("https://boss.intranet.smu.edu.sg/OverallResults.aspx")
    # Blocks until login is detected; on timeout it closes the browser and exits.
    wait_for_manual_login()
    # Runs the per-term scrape; main() closes the browser itself when done.
    main()

Please log in manually and complete the Microsoft Authenticator process.
Login successful! Proceeding...
Skipping AY21, Term 10 as it has already been scraped.
Skipping AY21, Term 20 as it has already been scraped.
Skipping AY21, Term 31 as it has already been scraped.
Skipping AY21, Term 32 as it has already been scraped.
Starting scraping for file: 2022-2023_T1AddedInfo.csv
Scraped: AY22, Term 10, Class Number 1002
Scraped: AY22, Term 10, Class Number 1003
Scraped: AY22, Term 10, Class Number 1004
Scraped: AY22, Term 10, Class Number 1005
Scraped: AY22, Term 10, Class Number 1006
Scraped: AY22, Term 10, Class Number 1007
Scraped: AY22, Term 10, Class Number 1008
Scraped: AY22, Term 10, Class Number 1009
Scraped: AY22, Term 10, Class Number 1010
Scraped: AY22, Term 10, Class Number 1011
Scraped: AY22, Term 10, Class Number 1012
Scraped: AY22, Term 10, Class Number 1013
Scraped: AY22, Term 10, Class Number 1014
Scraped: AY22, Term 10, Class Number 1015
Scraped: AY22, Term 10, Class Num