In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
import csv
import threading
import time
import os

In [None]:
# --- Configuration ---
# URL template of one ranking page; scrape_subject fills in {year} and {subject_code}.
base_url = "https://www.shanghairanking.com/rankings/gras/{year}/{subject_code}"
output_folder = "ShanghaiRanking" # suggested output folder name

# --- Create Output Folder and Initialize WebDriver ---
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(output_folder, exist_ok=True)

# One headless Chrome session is created here, passed to every scrape_subject
# call, and closed by driver.quit() at the end of the run.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu') 
options.add_argument('--no-sandbox') 
driver = webdriver.Chrome(options=options)

# --- Subject Codes ----
# Maps the subject code used in the page URL to the subject name used in log
# messages and in the output file name.
subject_codes = { 
    "AS0101" : "Mathematics",
    "AS0513" : "Hospitality & Tourism Management",
    "AS0515" : "Library & Information Science"
    # ... add other subjects as needed; subject code "ASXXXX" (found in the URL of the specific subject) : "Subject Name"
}

# --- Dropdown Options ---
# Criteria selected from the dropdown when year >= 2025, in addition to the
# default "World-Class Faculty" column that scrape_subject prepends.
dropdown_options_2025 = [
    "World-Class Output",
    "High Quality Research",
    "Research Impact",
    "International Collaboration",
]
# --- NB: scrape_subject uses the list below for every year before 2025, ---
# --- where the criteria options are different (default column: "Q1")    ---
dropdown_options_other = [
    "CNCI", 
    "IC", 
    "TOP", 
    "AWARD"
]

# CSS selector matching every data row of the ranking table.
_TABLE_ROW_SELECTOR = "table.rk-table tbody tr"


def _criteria_for_year(year):
    """Return ``(header, default_criteria_key, criteria_to_select)`` for ``year``.

    The CSV header is the four base columns followed by one column per
    criterion, in the same order in which the criteria are selected.
    """
    base_columns = ["Rank", "Institution", "Country/Region", "Total Score"]
    if year >= 2025:
        default_criteria_key = "World-Class Faculty"
        criteria_to_select = [default_criteria_key] + dropdown_options_2025
    else:
        # Every earlier year is scraped with the older criteria names.
        default_criteria_key = "Q1"
        criteria_to_select = [default_criteria_key] + dropdown_options_other
    return base_columns + criteria_to_select, default_criteria_key, criteria_to_select


def _read_base_rows(driver, default_criteria_key):
    """Read the base columns of every row currently shown in the table.

    Returns a dict mapping each row's position in the table to its data dict.
    Rows missing one of the expected cells are left out; keying by table
    position keeps later per-criterion reads attached to the right row even
    when an earlier row was left out.
    """
    rows_by_index = {}
    rows = driver.find_elements(By.CSS_SELECTOR, _TABLE_ROW_SELECTOR)

    for index, row in enumerate(rows):
        data = {}
        try:
            data["Rank"] = row.find_element(By.XPATH, "./td[1]").text.strip()
            data["Institution"] = row.find_element(By.XPATH, "./td[2]").text.strip()

            # The country is the file name (without extension) at the end of
            # the cell's inline style -- presumably a flag image URL.
            country_element = row.find_element(By.XPATH, "./td[3]/div")
            style_attr = country_element.get_attribute("style")
            data["Country/Region"] = style_attr.split("/")[-1].split(".")[0] if style_attr else ""

            data["Total Score"] = row.find_element(By.XPATH, "./td[4]").text.strip()
            default_text = row.find_element(By.XPATH, "./td[5]").text.strip()
        except NoSuchElementException:
            continue

        # For rows without a total score the initially displayed fifth column
        # is not kept; the value is filled in once the default criterion has
        # been selected explicitly.
        data[default_criteria_key] = default_text if data["Total Score"] else ""
        rows_by_index[index] = data

    return rows_by_index


def _select_criterion(driver, dropdown, option_text):
    """Choose ``option_text`` in the criteria dropdown.

    Returns True once the option has been clicked and the table rows are
    present again, or False (after printing a warning) when the option list
    never became visible.

    Raises:
        TimeoutException: if no table row is present within 10 seconds after
            the option was clicked.
    """
    # Two move-and-click sequences are issued, exactly as in the original
    # interaction logic. NOTE(review): the reason for the second click is not
    # visible in this file -- confirm against the live page before changing it.
    actions = ActionChains(driver)
    actions.move_to_element(dropdown).click().perform()
    actions.move_to_element(dropdown).click().perform()

    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.XPATH, "//ul[@class='options']"))
        )
    except TimeoutException:
        print(f"Warning: Dropdown menu not visible for {option_text}. Skipping this criteria.")
        return False

    # The option text is handed over as a script argument rather than spliced
    # into the JavaScript source, so quotes in the text cannot break the script.
    driver.execute_script(
        """
        const optionText = arguments[0];
        const options = document.querySelectorAll('ul.options li');
        for (let i = 0; i < options.length; i++) {
            if (options[i].textContent.trim() === optionText) {
                options[i].click();
                break;
            }
        }
        """,
        option_text,
    )

    # Fixed pause to let the table refresh, then make sure rows are present.
    time.sleep(1)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, _TABLE_ROW_SELECTOR))
    )
    return True


def _read_criterion_column(driver, rows_by_index, option_text):
    """Store the fifth-column text of each table row under ``option_text``."""
    rows = driver.find_elements(By.CSS_SELECTOR, _TABLE_ROW_SELECTOR)
    for index, row in enumerate(rows):
        data = rows_by_index.get(index)
        if data is None:
            # This row was left out of the base pass (or appeared afterwards).
            continue
        try:
            data[option_text] = row.find_element(By.XPATH, "./td[5]").text.strip()
        except NoSuchElementException:
            data[option_text] = ""


def _go_to_next_page(driver, year):
    """Click the "next page" control; return False when there is no further page."""
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//li[@title='下一页']"))
        )

        if next_button.get_attribute("aria-disabled") == "true":
            print(f"Reached the last page for year {year}. Exiting Loop")
            return False

        driver.execute_script("arguments[0].scrollIntoView();", next_button)
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(2)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, _TABLE_ROW_SELECTOR))
        )
        return True
    except (NoSuchElementException, TimeoutException) as e:
        print(f"Error or end of pages reached: {e}")
        return False


def _write_csv(file_path, header, rows):
    """Write ``header`` and one line per row dict (missing keys become "")."""
    with open(file_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        for data in rows:
            writer.writerow([data.get(key, "") for key in header])


def scrape_subject(year, subject_code, subject_name, driver):
    """Scrape every page of one subject ranking and save it as a CSV file.

    The file ``<output_folder>/<subject_name>_<year>.csv`` gets the base
    columns (rank, institution, country/region, total score) plus one column
    per criterion offered in the table's dropdown for that year.

    Args:
        year: Ranking year; selects both the URL and the criteria set
            (``year >= 2025`` uses the newer criteria names).
        subject_code: Subject code inserted into ``base_url``.
        subject_name: Name used in log messages and in the output file name.
        driver: Selenium WebDriver used for all page interaction.

    Returns:
        None. If the ranking table does not appear within 10 seconds of
        loading the page, a message is printed and no file is written.

    Raises:
        NoSuchElementException: if the criteria dropdown is missing.
        TimeoutException: if the table rows are not present after a criterion
            was selected.
        In both cases the pages completed so far are written to the CSV file
        before the exception propagates.
    """
    url = base_url.format(year=year, subject_code=subject_code)
    output_filename = f"{subject_name}_{year}.csv"
    file_path = os.path.join(output_folder, output_filename)

    print(f"Scraping {subject_name} for year {year}...")

    # Navigate and wait for the table
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, _TABLE_ROW_SELECTOR))
        )
    except TimeoutException:
        print(f"Timeout: Table data not found for {subject_name} in {year}. Skipping.")
        return

    header, default_criteria_key, criteria_to_select = _criteria_for_year(year)

    all_data_collected = []
    finished = False
    try:
        page_num = 1
        while True:
            print(f"Scraping page {page_num}")
            rows_by_index = _read_base_rows(driver, default_criteria_key)

            dropdown = driver.find_element(By.CSS_SELECTOR, "div.rank-select")

            # Every criterion is selected explicitly, the default one included.
            # After a page has been processed the dropdown is left on the last
            # criterion, so the column shown when the next page is first read
            # cannot be assumed to be the default one.
            # NOTE(review): whether the site resets the selection on page
            # change was not verified; selecting it again is correct either way.
            for option_text in criteria_to_select:
                if _select_criterion(driver, dropdown, option_text):
                    _read_criterion_column(driver, rows_by_index, option_text)

            # --- Close the dropdown and collect data ---
            ActionChains(driver).move_to_element(dropdown).click().perform()
            all_data_collected.extend(rows_by_index.values())

            if not _go_to_next_page(driver, year):
                break
            page_num += 1
        finished = True
    finally:
        # The file is written only here, so a failure part-way through still
        # leaves every fully processed page on disk instead of a bare header.
        _write_csv(file_path, header, all_data_collected)
        if finished:
            print(f"Successfully extracted {len(all_data_collected)} rows.")
        else:
            print(
                f"Scrape interrupted: saved the {len(all_data_collected)} rows "
                f"collected so far to {file_path}."
            )


# --- Main Execution Loop ---
# Scrape every configured subject for each year in the range. The driver is
# closed in `finally` so that an exception raised while scraping does not
# leave the headless Chrome process running.
print("--- Starting Scraper ---")
try:
    for year in range(2025, 2026):
        for subject_code, subject_name in subject_codes.items():
            scrape_subject(year, subject_code, subject_name, driver)
finally:
    # --- Close WebDriver ---
    driver.quit()

--- Starting Scraper ---
Scraping Economics for year 2025...
Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Reached the last page for year 2025. Exiting Loop
Successfully extracted 500 rows.
Scraping Statistics for year 2025...
Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Reached the last page for year 2025. Exiting Loop
Successfully extracted 300 rows.
Scraping Law for year 2025...
Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Reached the last page for year 2025. Exiting Loop
Successfully extracted 200 rows.
Scraping Political Sciences for year 2025...
Scraping page 1
Scraping 