# In [None]:
"""
Scrapy Scraper to extract QS World University Rankings data.

This spider integrates Selenium WebDriver to handle JavaScript rendering and dynamic content
loading on the QS World University Rankings website. It navigates through different
subjects and years, extracts university ranking data, and saves it to CSV files.

Note on Asynchronous Execution & Twisted Compatibility:
Running Scrapy (which utilizes Twisted for asynchronous operations) within interactive
environments like Jupyter Notebooks/Lab can lead to a "RuntimeError: Event loop is already running".
This is because Jupyter already has an active asyncio event loop. The `nest_asyncio.apply()`
patch is used at the start of this script to allow for nested event loops, resolving this specific issue.

This spider has been successfully tested in environments with Twisted version 23.x.x.
However, if you encounter other unexpected runtime or compatibility errors while running this spider,
**particularly on macOS or Linux environments**, it may be related to specific Twisted versions.
As a troubleshooting step, you might consider trying Twisted==22.10.0, as this version has historically
shown strong stability with Scrapy on these operating systems (Windows environments typically experience
fewer such issues).
"""
import scrapy
from scrapy.selector import Selector        # For parsing HTML content using CSS/XPath selectors
from scrapy.crawler import CrawlerProcess   # To run Scrapy spiders programmatically
from selenium import webdriver              # The main Selenium WebDriver module
from selenium.webdriver.chrome.options import Options   # To configure Chrome browser options
from selenium.webdriver.common.by import By # To locate elements by various strategies (e.g., ID, CSS_SELECTOR, XPATH)
from selenium.webdriver.support.ui import WebDriverWait # To wait for specific conditions on the webpage
from selenium.webdriver.common.action_chains import ActionChains # For performing complex user interactions (e.g., mouse hovers, clicks)
from selenium.webdriver.support import expected_conditions as EC # Predefined conditions for WebDriverWait
from selenium.webdriver.chrome.service import Service   # To manage the ChromeDriver executable
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException, ElementClickInterceptedException # Common Selenium exceptions for robust error handling
import csv   # For writing extracted data to CSV files
import time  # For time-related functions (e.g., sleeps, though generally avoided in Scrapy)
import os    # For operating system-related functions (e.g., path manipulation, checking OS type)
import random        # For generating random numbers (e.g., for delays, user agents) - not explicitly used here but good for anti-bot
import nest_asyncio  # To allow nested asyncio event loops, crucial for running Scrapy/Twisted in Jupyter/IPython
nest_asyncio.apply() # Apply the patch to allow nested event loops. This resolves RuntimeErrors in Jupyter.

# In [None]:
class UniversityRankingSpider(scrapy.Spider):
    """
    Scrape QS World University Rankings (by subject) into one CSV per subject/year.

    Scrapy schedules the ranking URLs; the actual page content is rendered with a
    single Selenium-driven headless Chrome instance owned by the spider, because
    the ranking table is built by JavaScript. The body of the Scrapy response is
    not used: ``parse`` re-loads ``response.url`` in the browser and reads the
    rendered DOM from there.

    Output files are written to ``<cwd>/rankings_csv/QS_Rankings_<subject>-<year>.csv``.
    """

    name = "university_ranking"  # Unique name for the spider, used to run it from the command line

    # URL template for one subject/year ranking page (indicator tab, sorted by rank).
    RANKING_URL_TEMPLATE = (
        "https://www.topuniversities.com/university-subject-rankings/{subject}/{year}"
        "?items_per_page=150&tab=indicators&sort_by=rank&order_by=asc"
    )

    # CSV columns, in output order.
    CSV_HEADER = [
        'Rank', 'Name', 'Location', 'Employer Reputation',
        'H-index Citations', 'Citations per Paper',
        'Academic Reputation', 'Global Engagement'
    ]

    # XPath (relative to one indicator row) for each score column.
    # NB: several scores share the 'trade_ranking_col_no_1' class fragment and are
    # told apart by position ([1], [2], [3]); the mapping of position to indicator
    # may differ between subjects/years, so verify it when adding new subjects.
    SCORE_XPATHS = {
        'Employer Reputation':
            ".//div[contains(@class, 'trade_ranking_col_no_0')]/div/div/span/div/div/text()",
        'H-index Citations':
            ".//div[contains(@class, 'trade_ranking_col_no_1')][3]/div/div/span/div/div/text()",
        'Citations per Paper':
            ".//div[contains(@class, 'trade_ranking_col_no_1')][2]/div/div/span/div/div/text()",
        'Academic Reputation':
            ".//div[contains(@class, 'trade_ranking_col_no_1')][1]/div/div/span/div/div/text()",
        'Global Engagement':
            ".//div[contains(@class, 'trade_ranking_col_no_2')]/div/div/span/div/div/text()",
    }

    def __init__(self, *args, **kwargs):
        """
        Initialize the spider and start the headless Chrome WebDriver.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Keyword (spider) arguments forwarded to ``scrapy.Spider.__init__``.
                Accepting and forwarding them keeps the spider usable when it is
                started with spider arguments.

        Raises:
            OSError: If ``os.name`` is neither ``'nt'`` nor ``'posix'``.
        """
        super().__init__(*args, **kwargs)

        self.driver = webdriver.Chrome(
            service=self._build_driver_service(),
            options=self._build_chrome_options(),
        )
        self.driver.maximize_window()  # Request the largest window (can affect rendering)
        self.base_url = self.RANKING_URL_TEMPLATE  # Kept as an instance attribute for compatibility

    @staticmethod
    def _build_chrome_options():
        """Return the Chrome options used for headless, low-noise scraping."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")    # Run Chrome without a visible GUI
        chrome_options.add_argument("--disable-gpu") # Recommended for headless mode, especially on Windows
        chrome_options.add_argument("--no-sandbox")  # Needed in some containerized environments (e.g., Docker)
        chrome_options.add_argument("--disable-dev-shm-usage") # Avoid /dev/shm exhaustion in constrained environments

        # Present a realistic desktop browser user agent.
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36")

        # Minimum log level: 0=INFO, 1=WARNING, 2=ERROR, 3=FATAL (least console output).
        chrome_options.add_argument("--log-level=3")

        # Drop the switches/extension that advertise browser automation.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        return chrome_options

    @staticmethod
    def _build_driver_service():
        """
        Return the Selenium ``Service`` used to launch ChromeDriver.

        A ``chromedriver`` binary in the current working directory is preferred
        (the original behaviour). When no such file exists, a default ``Service()``
        is returned so Selenium can locate the driver itself instead of failing
        on a non-existent path.

        Raises:
            OSError: If ``os.name`` is neither ``'nt'`` nor ``'posix'``.
        """
        if os.name == 'nt':  # Windows
            chrome_driver_executable = 'chromedriver.exe'
        elif os.name == 'posix':  # macOS or Linux
            chrome_driver_executable = 'chromedriver'
        else:
            raise OSError(f"Unsupported operating system: {os.name}")

        chrome_driver_path = os.path.join(os.getcwd(), chrome_driver_executable)
        if os.path.isfile(chrome_driver_path):
            return Service(executable_path=chrome_driver_path)
        return Service()

    def start_requests(self):
        """
        Yield one Scrapy request per (subject, year) ranking page.

        The subject slug and year are passed to ``parse`` through ``meta`` so the
        callback can name the output file.
        """
        # Subjects to scrape. Commented-out slugs can be enabled to expand the crawl;
        # keeping the list short during development helps avoid rate limits.
        subjects = [ # works best for two subjects at a time
            # "arts-and-humanities",
            # "engineering-technology",
            # "life-sciences-medicine",
            # "natural-sciences",
            "theology-divinity-religious-studies"
            # "veterinary-science",
        ]
        years = range(2024, 2025)  # Half-open range: 2024 only

        for subject in subjects:
            for year in years:
                yield scrapy.Request(
                    url=self.base_url.format(subject=subject, year=year),
                    callback=self.parse,
                    meta={'subject': subject, 'year': year}
                )

    def parse(self, response):
        """
        Render one ranking listing with Selenium and write all its pages to a CSV file.

        The URL is loaded in the spider's browser, blocking pop-ups are removed,
        and every result page is scraped and appended to the CSV until there is no
        further page to navigate to.

        Args:
            response (scrapy.http.Response): The Scrapy Response object for the current URL.
                Only ``response.url`` and ``response.meta['subject'|'year']`` are used.
        """
        self.driver.get(response.url)  # Load the page in the browser so its JavaScript runs

        # Pop-ups can cover the pagination control, so drop them from the DOM first.
        self._remove_popup('sliding-popup', 'Cookie block')
        self._remove_popup('surveyModal', 'Survey popup')

        file_path = self._output_path(response.meta['subject'], response.meta['year'])

        # 'w' creates/overwrites; newline='' prevents blank rows on Windows.
        with open(file_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.CSV_HEADER)

            page_number = 0  # Zero-based index of the page currently shown
            while True:
                # The ranking rows are rendered by JavaScript; wait until they exist.
                try:
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "div._qs-ranking-data-row")))
                except TimeoutException:
                    print(f"Timeout waiting for ranking data on page {page_number}. Exiting pagination.")
                    break

                # Scores may load after the university list. Wait for them once per
                # page, BEFORE snapshotting the DOM: waiting after the snapshot cannot
                # change what is parsed. A timeout is not fatal; scores stay blank.
                try:
                    WebDriverWait(self.driver, 20).until(
                        EC.presence_of_all_elements_located(
                            (By.CSS_SELECTOR, "div._smallblocksfix-width")))
                except TimeoutException:
                    print(f"Timeout waiting for indicator scores on page {page_number}. Scores may be blank.")

                # Parse the rendered DOM with Scrapy selectors.
                rendered_page = Selector(text=self.driver.page_source)

                # 'div.hide-this-in-mobile-indi' targets the desktop view.
                university_rows = rendered_page.css(
                    'div.hide-this-in-mobile-indi div._qs-ranking-data-row')
                indicator_rows = rendered_page.css(
                    'div.hide-this-in-mobile-indi div.col-lg-6:not(._right_background)')

                if not university_rows:
                    print(f"No university data found on page {page_number}. Assuming end of results.")
                    break

                if len(indicator_rows) != len(university_rows):
                    print(f"Warning: {len(university_rows)} university rows but "
                          f"{len(indicator_rows)} indicator rows on page {page_number}.")

                for i, row in enumerate(university_rows):
                    # Indicator rows are matched to university rows by position; a
                    # missing indicator row yields blank scores instead of IndexError.
                    indicator_row = indicator_rows[i] if i < len(indicator_rows) else None
                    data = self._extract_row(row, indicator_row)

                    writer.writerow([data.get(key, '') for key in self.CSV_HEADER])
                    print("Extracted:", data.get('Name', 'N/A'), "Rank:", data.get('Rank', 'N/A'))

                if not self._go_to_next_page(page_number):
                    break
                page_number += 1

    def _remove_popup(self, element_id, label):
        """
        Best-effort removal of a blocking pop-up element from the page.

        Waits up to 10 seconds for the element with ``element_id`` to be present
        and removes it via JavaScript. Any failure is reported and ignored so a
        missing pop-up never stops the scrape.

        Args:
            element_id (str): DOM id of the pop-up element.
            label (str): Human-readable name used in the progress messages.
        """
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, element_id))
            )
            self.driver.execute_script(
                "document.getElementById(arguments[0]).remove();", element_id)
            print(f"{label} removed using JavaScript.")
        except TimeoutException:
            print(f"{label} not found or timed out.")
        except Exception as e:  # Deliberately broad: pop-up removal must never abort parsing
            print(f"Error closing {label.lower()}: {e}")

    @staticmethod
    def _output_path(subject, year):
        """
        Return the CSV path for ``subject``/``year``, creating the output folder if needed.

        Args:
            subject (str): Subject slug used in the ranking URL.
            year (int): Ranking year.

        Returns:
            str: ``<cwd>/rankings_csv/QS_Rankings_<subject>-<year>.csv``.
        """
        folder_path = os.path.join(os.getcwd(), 'rankings_csv')
        if not os.path.isdir(folder_path):
            # exist_ok avoids a crash if the folder appears between check and creation.
            os.makedirs(folder_path, exist_ok=True)
            print(f"Created output directory: {folder_path}")
        return os.path.join(folder_path, f'QS_Rankings_{subject}-{year}.csv')

    @staticmethod
    def _clean(value):
        """Return ``value`` without surrounding whitespace, or ``None`` if it is ``None``."""
        return value.strip() if value is not None else None

    def _extract_row(self, row, indicator_row):
        """
        Extract one university's fields from its row and its matching indicator row.

        Args:
            row (scrapy.selector.Selector): The university row (rank, name, location).
            indicator_row (scrapy.selector.Selector | None): The row holding the
                indicator scores, or ``None`` when no matching row was found.

        Returns:
            dict: One entry per ``CSV_HEADER`` column; fields that were not found are ``None``.
        """
        info_div = row.css('div.col-lg-8')  # Container for name and location
        data = {
            'Rank': self._clean(row.css('div._univ-rank::text').get()),
            'Name': self._clean(info_div.css('a.uni-link::text').get()),
            'Location': self._clean(info_div.css('div.location::text').get()),
        }
        for column, xpath in self.SCORE_XPATHS.items():
            score = indicator_row.xpath(xpath).get() if indicator_row is not None else None
            data[column] = self._clean(score)
        return data

    def _go_to_next_page(self, page_number):
        """
        Click the "Next" pagination control and wait for the URL to change.

        Args:
            page_number (int): Zero-based index of the page currently shown
                (used only in progress messages).

        Returns:
            bool: ``True`` if the browser navigated to the next page, ``False`` if
            pagination should stop (no clickable button, stale element, or an
            intercepted click).
        """
        try:
            next_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//a[@class='page-link next']//i[@class='fal fa-chevron-right']")
                )
            )
            current_url = self.driver.current_url  # Remembered to detect the navigation
            next_button.click()
            WebDriverWait(self.driver, 15).until(EC.url_changes(current_url))
        except TimeoutException:
            print(f"No 'Next' button found on page {page_number}. Reached the end of pagination.")
            return False
        except StaleElementReferenceException:
            print(f"Stale element on page {page_number}. Retrying or breaking.")
            return False
        except ElementClickInterceptedException:
            print(f"Click intercepted on page {page_number}. Another element is covering the button.")
            return False

        print(f"Navigated to page {page_number + 1}.")
        return True

    def closed(self, reason):
        """
        Quit the Selenium WebDriver so the browser process is terminated.

        Args:
            reason: The close reason passed in by the caller (not used).
        """
        # getattr guards against __init__ having failed before the driver was created.
        if getattr(self, 'driver', None):
            self.driver.quit()
            print("Selenium WebDriver closed.")

# --- Scrapy Crawler Process Execution ---
# Runs the spider when this file is executed as a script or as a notebook cell
# (in both cases __name__ is "__main__"). The guard keeps a plain `import` of this
# module, or a tool that imports it to discover the spider, from launching a
# browser and starting a crawl as a side effect.
if __name__ == "__main__":
    process = CrawlerProcess()              # Initializes the Scrapy CrawlerProcess
    process.crawl(UniversityRankingSpider)  # Schedules the spider defined above
    process.start()                         # Starts crawling; blocks until the crawl is finished