# In [None]:
import requests
import json
import csv
import time
import pandas as pd
from bs4 import BeautifulSoup
import sys 

# --- Configuration ---
# Page to scrape: the latest Times Higher Education World University Rankings.
URL = 'https://www.timeshighereducation.com/world-university-rankings/latest/world-ranking'

# Name of the CSV file the scraper writes its results to.
CSV_FILE = 'THE_World_University_Rankings_2026.csv'

# Request headers. The User-Agent imitates a desktop Chrome browser so the
# request looks like ordinary browser traffic.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

# Keys to follow, in order, through the page's embedded JSON to reach the
# list of ranking records.
DATA_PATH = [
    "props",
    "pageProps",
    "page",
    "rankingsTableConfig",
    "rankingsData",
    "data",
]

# --- Main Scraper Function ---
def run_scraper():
    """Fetch the rankings page at URL and export its embedded table to CSV_FILE.

    Steps:
      1. GET ``URL`` with ``HEADERS`` (15 second timeout).
      2. Locate the ``<script id="__NEXT_DATA__">`` tag and decode its JSON.
      3. Follow ``DATA_PATH`` through the JSON to the list of records.
      4. Write a header row plus one row per record to ``CSV_FILE``; a field
         missing from a record is written as ``'N/A'``.

    Returns:
        None. Progress and failures are printed to stdout. Every error
        derived from ``Exception`` is caught and reported here rather than
        propagated to the caller.
    """
    # (CSV column header, key in each record) pairs, kept side by side so the
    # header row and the data rows cannot drift out of sync.
    columns = (
        ('Rank', 'rank'),
        ('Name', 'name'),
        ('Country/Region', 'location'),
        ('Overall Score', 'scores_overall'),
        ('Teaching Score', 'scores_teaching'),
        ('Research Environment Score', 'scores_research'),
        ('Research Quality Score', 'scores_citations'),
        ('Industry Score', 'scores_industry_income'),
        ('International Outlook Score', 'scores_international_outlook'),
    )

    print(f"Requesting data from: {URL}")

    try:
        # 1. Make the request to get the HTML content
        response = requests.get(URL, headers=HEADERS, timeout=15)
        response.raise_for_status()  # Raises HTTPError on 4xx / 5xx responses

        # 2. Parse the HTML and locate the script tag carrying the page data
        soup = BeautifulSoup(response.text, 'html.parser')
        script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
        if script_tag is None:
            raise ValueError("Could not find the '__NEXT_DATA__' script tag. Site structure may have changed.")

        # `.string` is None when the tag is empty or has several children;
        # check here so json.loads never receives None.
        payload = script_tag.string
        if not payload:
            raise ValueError("The '__NEXT_DATA__' script tag is empty. Site structure may have changed.")

        # 3. Decode the JSON and walk the configured path down to the records
        universities = json.loads(payload)
        for key in DATA_PATH:
            universities = universities[key]

        # Validate before opening the output file so a structural surprise
        # does not leave a truncated CSV behind.
        if not isinstance(universities, list):
            raise ValueError(
                f"Expected a list of universities but found {type(universities).__name__}."
            )

        print(f"✅ Successfully extracted data for {len(universities)} universities.")

        # 4. Write the header row followed by one row per university
        with open(CSV_FILE, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([header for header, _ in columns])
            for university in universities:
                # .get() keeps the row aligned, writing 'N/A' for missing keys
                writer.writerow([university.get(key, 'N/A') for _, key in columns])

        print(f"✅ Success! Data saved to {CSV_FILE}")

    # The requests handlers come first: several requests exceptions (e.g.
    # InvalidURL) also subclass ValueError and would otherwise be misreported
    # as a problem with the page's data.
    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP Error (Status {e.response.status_code}): Could not access the URL. {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network Error: {e}")
    except (ValueError, KeyError, TypeError, IndexError) as e:
        # Covers invalid JSON (JSONDecodeError is a ValueError) and any step of
        # DATA_PATH that no longer matches the shape of the document.
        print(f"❌ Error processing the page's data. The internal JSON structure might have changed. Error: {e}")
    except Exception as e:
        # Top-level boundary: report anything else (e.g. file-system errors).
        print(f"❌ An unexpected error occurred: {e}. Aborting script.")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    run_scraper()