In [None]:
import selenium # Main Selenium library (though often 'from selenium import webdriver' is enough)
from selenium import webdriver # The main Selenium WebDriver module
from selenium.webdriver.common.by import By # To locate elements by various strategies (e.g., ID, CSS_SELECTOR, XPATH)
from selenium.webdriver.support.ui import WebDriverWait # To wait for specific conditions on the webpage
from selenium.webdriver.support import expected_conditions as EC # Predefined conditions for WebDriverWait
from selenium.common.exceptions import TimeoutException # Specific exception for wait timeouts
from bs4 import BeautifulSoup # Beautiful Soup for parsing HTML content
import csv # For writing extracted data to CSV files
import time # For time-related functions (e.g., delays)
import traceback # For detailed error reporting
import os # Import os for operating system-related functionalities (e.g., path manipulation, directory creation)

In [None]:
# --- Configuration ---
# URL template for a Times Higher Education (THE) subject-ranking page.
# The two "{}" placeholders are filled positionally: first the ranking year,
# then the subject slug. Everything after "#!" is the fragment copied verbatim
# from the site URL (it names length, sort_by, sort_order and cols settings).
base_url = (
    "https://www.timeshighereducation.com/world-university-rankings/{}"
    "/subject-ranking/{}"
    "#!/length/-1/sort_by/rank/sort_order/asc/cols/scores"
)

# Ranking years to scrape. range() excludes its stop value, so this is 2025 only.
years = range(2025, 2026)

# Subject slugs to scrape. Only "arts-and-humanities" is listed for demonstration;
# append further slugs (e.g. "engineering-and-it", "computer-science") to this
# list to scrape several subjects in the same run.
subjects = [
    "arts-and-humanities",
]

# --- Selenium WebDriver Setup ---
# Build the Chrome options used to launch the browser for this scraping session.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run Chrome without a visible window (no GUI is needed for scraping).

# Start the Chrome session. Only `options` is passed: no Service object or
# executable path is given, so locating the chromedriver binary is left to
# Selenium itself.
# NOTE(review): presumably this relies on chromedriver being on PATH, or on
# Selenium Manager (Selenium >= 4.6) fetching a matching driver -- confirm
# against the installed Selenium version.
# `driver` is a module-level object that is not closed in this section; the
# scraping section further down calls driver.quit() in its `finally` clause.
driver = webdriver.Chrome(options=options)
print("Chrome WebDriver initialized.")

# --- Main Scraping Logic ---
# Small private helpers come first; the year/subject loop that drives them is
# at the bottom, wrapped in try/except/finally so the browser is always closed.

# DOM id of the cookie-consent dialog that can cover the page.
_COOKIE_DIALOG_ID = "CybotCookiebotDialog"

# JavaScript that deletes the dialog. The null check keeps a dialog that has
# already disappeared from raising a JavaScript error.
_REMOVE_COOKIE_DIALOG_JS = (
    "var dialog = document.getElementById('CybotCookiebotDialog');"
    " if (dialog) { dialog.remove(); }"
)

# Column titles written as the first line of every CSV file.
_CSV_HEADER = [
    'Rank', 'Name', 'Country/Region', 'Overall', 'Research Quality',
    'Industry Income', 'International Outlook', 'Research Environment', 'Teaching',
]

# CSS classes of the score cells in "institution-disabled" rows, listed in the
# same order as the six score columns of _CSV_HEADER.
_DISABLED_SCORE_CLASSES = (
    "scores overall-score",
    "scores citations-score",
    "scores industry_income-score",
    "scores international_outlook-score",
    "scores research-score",
    "scores teaching-score",
)

# Folder (relative to the working directory) that receives the CSV files.
_OUTPUT_FOLDER = "THE_rankings_raw_data"


def _text_of(node):
    """Return the stripped text of a parsed HTML node, or "" when node is None."""
    return node.text.strip() if node is not None else ""


def _dismiss_cookie_overlay(browser, wait):
    """Remove the cookie-consent dialog from the page if it shows up.

    Args:
        browser: the Selenium WebDriver showing the page.
        wait: a WebDriverWait bound to ``browser``; its timeout bounds how
            long the dialog is waited for.

    Never raises: a missing dialog or any error while removing it is only
    reported with print(), so scraping continues either way.
    """
    try:
        # Raises TimeoutException when no dialog appears within the timeout.
        wait.until(EC.presence_of_element_located((By.ID, _COOKIE_DIALOG_ID)))

        browser.execute_script(_REMOVE_COOKIE_DIALOG_JS)
        print("Cookie overlay removed using JavaScript.")
        # Give the page a moment in case it re-renders the dialog.
        time.sleep(2)

        # find_elements returns immediately with a (possibly empty) list.
        # Waiting for presence here would block for the whole timeout whenever
        # the removal worked, and then wrongly report the overlay as
        # "not found".
        if browser.find_elements(By.ID, _COOKIE_DIALOG_ID):
            print("Overlay still present after first attempt. Trying removal again...")
            browser.execute_script(_REMOVE_COOKIE_DIALOG_JS)
    except TimeoutException:
        print("Cookie overlay not found. Continuing scraping.")
    except Exception as e:
        # Best effort only: overlay problems must not abort the scrape.
        print(f"Error handling cookie overlay: {e}")


def _parse_row(row):
    """Extract the nine CSV fields from one ranking-table <tr>.

    Args:
        row: a BeautifulSoup <tr> tag taken from the ranking table body.

    Returns:
        A list of nine strings ordered like _CSV_HEADER, or None when the row
        does not contain enough <td> cells to be read.
    """
    cells = row.find_all("td")

    if "institution-disabled" in row.get("class", []):
        # Disabled rows use a different layout; values are found by CSS class.
        if not cells:
            return None  # no rank cell at all -> nothing usable in this row
        fields = [
            cells[0].text.strip(),  # Rank
            _text_of(row.find("div", class_="ranking-institution-title")),  # Name
            _text_of(row.find("div", class_="ranking-institution__disabled-location")),  # Country/Region
        ]
        fields.extend(
            _text_of(row.find("td", class_=score_class))
            for score_class in _DISABLED_SCORE_CLASSES
        )
        return fields

    # Regular rows: values are read by cell position, which needs 8 cells.
    if len(cells) < 8:
        return None

    name_cell = cells[1]
    fields = [
        cells[0].text.strip(),  # Rank
        # A missing <a> yields "" (as in disabled rows) instead of raising
        # AttributeError and aborting the whole run mid-file.
        _text_of(name_cell.find("a")),  # Name
        _text_of(name_cell.find("div", class_="location")),  # Country/Region
    ]
    # cells[2..7]: Overall, Research Quality, Industry Income,
    # International Outlook, Research Environment, Teaching.
    fields.extend(cell.text.strip() for cell in cells[2:8])
    return fields


def _export_rankings(page_html, subject, year):
    """Parse the rendered page and write its ranking table to a CSV file.

    Args:
        page_html: full HTML source of the rendered ranking page.
        subject: subject slug, used in the output file name.
        year: ranking year, used in the output file name.

    The file is written to THE_rankings_raw_data/<subject>_rankings_<year>.csv
    (the folder is created on demand). When the table, its <tbody>, or its
    rows cannot be found, an error is printed and no file is written.
    """
    soup = BeautifulSoup(page_html, "html.parser")

    table = soup.find("table", id="datatable-1")
    if table is None:
        print("Error: Could not find the main ranking table (id='datatable-1').")
        return

    table_body = table.find("tbody")
    if table_body is None:
        print("Error: Could not find the table body (tbody).")
        return

    # Do not create an empty CSV when the body holds no rows at all.
    if table_body.find("tr") is None:
        print("Error: Could not find any data rows/header in the table body.")
        return

    if not os.path.exists(_OUTPUT_FOLDER):
        # exist_ok guards against the folder appearing between check and create.
        os.makedirs(_OUTPUT_FOLDER, exist_ok=True)
        print(f"Created output directory: {_OUTPUT_FOLDER}")
    output_filepath = os.path.join(_OUTPUT_FOLDER, f'{subject}_rankings_{year}.csv')

    with open(output_filepath, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(_CSV_HEADER)
        for row in table_body.find_all("tr"):
            fields = _parse_row(row)
            if fields is None:
                print(f"Skipping row with insufficient data cells: {row.text.strip()}")
                continue
            writer.writerow(fields)


# The try/except/finally guarantees the WebDriver is closed even if errors occur.
try:
    # One page is scraped per (year, subject) combination.
    for year in years:
        for subject in subjects:
            url = base_url.format(year, subject)
            print(f"\nScraping data for year: {year}, subject: {subject} from URL: {url}")

            # Load the page in the browser so its JavaScript renders the table.
            driver.get(url)

            # Wait (up to 15 seconds per condition) for the basic page structure.
            wait = WebDriverWait(driver, 15)
            wait.until(EC.presence_of_element_located((By.ID, "main-content")))
            print("Main content loaded.")

            # The cookie banner can intercept clicks; get it out of the way first.
            _dismiss_cookie_overlay(driver, wait)

            # Switch to the 'Scores' view so the score columns are in the table.
            scores_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//label[@for='scores']")))
            scores_button.click()
            print("Clicked 'Scores' tab.")

            # The table reloads after the click; a fixed pause lets it re-render
            # before its presence is checked.
            time.sleep(5)
            wait.until(EC.presence_of_element_located((By.ID, 'datatable-1')))
            print("Ranking table found.")

            # Second fixed pause so dynamic content settles before the snapshot.
            time.sleep(5)

            _export_rankings(driver.page_source, subject, year)

            # Short pause between pages to be polite to the website.
            time.sleep(2)

except Exception as e:
    # Top-level boundary: report anything unexpected with its full traceback.
    print(f"An unhandled error occurred during scraping: {e}")
    print("Traceback details:")
    print(traceback.format_exc()) # Print full traceback for debugging
finally:
    # Close the browser and end the WebDriver session, whether or not errors occurred.
    if 'driver' in locals() and driver: # Check if driver was successfully initialized
        driver.quit()
        print("Selenium WebDriver closed.")