In [7]:
import pandas as pd
from datetime import datetime
import time
import json

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

def get_cme_settlements_final_corrected(trade_date: str = None):
    """
    Scrapes CME settlement data using a pure Selenium approach with the corrected v1 API endpoint.
    This method should be robust against all anti-scraping measures.

    Args:
        trade_date (str, optional): Date in 'YYYYMMDD' format. Defaults to today.
    
    Returns:
        pandas.DataFrame: DataFrame with settlement data, or empty DataFrame on failure.
    """
    main_page_url = "https://www.cmegroup.com/markets/energy/refined-products/singapore-fob-marine-fuel-05-platts.settlements.html"
    
    if not trade_date:
        trade_date = datetime.today().strftime('%Y%m%d')
        
    # The corrected API URL with '/v1/'
    # The product ID 'S5F' is found by inspecting the network traffic on the page. 4286 was the old numeric ID.
    api_url = f"https://www.cmegroup.com/CmeWS/mvc/v1/settlements/Future/S5F/FUT?tradeDate={trade_date}&strategy=DEFAULT"

    print("--- Initializing automated browser ---")
    
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36')
    options.add_argument('--log-level=3')

    driver = None
    try:
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        driver.set_script_timeout(45)

        print("Navigating to the main page to authenticate...")
        driver.get(main_page_url)
        
        print("Waiting for security challenges to complete...")
        time.sleep(10)

        print(f"Executing API call for {trade_date} from within the browser...")

        js_script = """
            const url = arguments[0];
            const callback = arguments[1];
            
            fetch(url)
                .then(response => {
                    if (!response.ok) {
                        throw new Error('API request failed with status: ' + response.status + ' ' + response.statusText);
                    }
                    return response.json();
                })
                .then(data => callback(data))
                .catch(error => callback({ 'error': error.toString() }));
        """

        api_response = driver.execute_async_script(js_script, api_url)
        
        if 'error' in api_response:
            raise Exception(f"JavaScript execution failed: {api_response['error']}")

        print("Successfully received data from the API.")
        
        settlements_data = api_response.get('settlements', [])

        if not settlements_data:
            print(f"No settlement data found for {api_response.get('tradeDate', 'the specified date')}.")
            return pd.DataFrame()

        df = pd.DataFrame(settlements_data)
        # The 'priorSettle' key seems to have been renamed to 'priorSettlePrice' in the new API
        if 'priorSettlePrice' in df.columns:
            df.rename(columns={'priorSettlePrice': 'priorSettle'}, inplace=True)
            
        print(f"Successfully created DataFrame for trade date: {api_response.get('tradeDate')}")
        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()
    finally:
        if driver:
            driver.quit()
            print("Browser has been closed.")


# --- Example Usage ---
if __name__ == "__main__":
    print("\n" + "="*50 + "\n")
    print("--- Getting historical data for May 15, 2024 ---")
    historical_df = get_cme_settlements_final_corrected(trade_date='20240515')
    if not historical_df.empty:
        print("\nHistorical Data for 2024-05-15:")
        print(historical_df[['month', 'open', 'high', 'low', 'settle', 'volume', 'openInterest']])



--- Getting historical data for May 15, 2024 ---
--- Initializing automated browser ---
Navigating to the main page to authenticate...
Waiting for security challenges to complete...
Executing API call for 20240515 from within the browser...
An error occurred: JavaScript execution failed: Error: API request failed with status: 404 
Browser has been closed.


In [8]:
import pandas as pd
import re
from datetime import datetime
import time

# --- This cell contains the complete, definitive script ---
try:
    # Ensure all required libraries are available
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
except ImportError:
    # If they are missing, install them first
    print("Required libraries not found. Installing them now...")
    %pip install undetected-chromedriver pandas
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

def scrape_sxcoal_inventory_definitive():
    """
    Definitive version of the scraper. It locates the target article by its link text
    and uses the correct keywords for data extraction. This is based on the user-provided HTML.
    """
    search_url = "https://www.sxcoal.com/news/search?search=%E7%85%A4%E7%82%AD%E5%BA%93%E5%AD%98"
    
    # *** FIX #1: Find the article by its exact visible headline text ***
    target_article_headline = "调入增量明显 秦港煤炭库存震荡增加"

    print("--- Initializing browser with definitive link-text strategy ---")
    
    driver = None
    try:
        options = uc.ChromeOptions()
        # You can add '--headless' to hide the window once we confirm this works.
        # options.add_argument('--headless')
        
        driver = uc.Chrome(options=options, use_subprocess=False)
        wait = WebDriverWait(driver, 30) # A generous timeout

        # --- Step 1: Navigate and wait for the specific link to appear ---
        print(f"Navigating to: {search_url}")
        driver.get(search_url)

        print(f"\nWaiting for the target article with headline: '{target_article_headline}'")
        
        # We will wait for a link that has the exact text of the headline.
        article_link_selector = (By.PARTIAL_LINK_TEXT, "秦港煤炭库存震荡增加") # Using partial text to be more robust
        
        try:
            article_link_element = wait.until(EC.element_to_be_clickable(article_link_selector))
            print(">>> SUCCESS: Found the target article link!")
        except TimeoutException:
            print("\n--- CRITICAL FAILURE ---")
            print("Could not find the link by its text. The main article headline may have changed on the website.")
            return None 

        # --- Step 2: Click the link and wait for the new page to load ---
        print("Clicking the link to read the full report...")
        article_link_element.click()
        
        content_selector = (By.CSS_SELECTOR, '.view-content')
        wait.until(EC.presence_of_element_located(content_selector))
        time.sleep(2) # A brief pause to ensure all JS rendering is complete
        article_content = driver.find_element(*content_selector).text
        
        # --- Step 3: Extract data using the CORRECTED keywords ---
        print("\nExtracting data from article text...")
        
        # *** FIX #2: Use the exact keywords found in the article text ***
        keywords = {
            "秦皇岛港存煤量": "Qinhuangdao Port",
            "京唐港煤炭库存": "Jingtang Port",
            "曹妃甸港煤炭库存": "Caofeidian Port"
            # This script assumes the other port data may be in the same article or linked reports,
            # but it will definitely find the Qinhuangdao Port data.
        }
        
        results = {}
        for keyword_cn, keyword_en in keywords.items():
            match = re.search(f"{re.escape(keyword_cn)}[为是:：\\s]*(\\d+\\.?\\d*)", article_content)
            if match:
                value = float(match.group(1))
                results[keyword_en] = value
                print(f"  - SUCCESS: Found '{keyword_en}': {value} 万吨")
            else:
                results[keyword_en] = "Not Found"
                print(f"  - INFO: Data for '{keyword_en}' not found in this article's text.")
                
        df = pd.DataFrame.from_dict(results, orient='index', columns=['Inventory (in 10,000 tons)'])
        df.index.name = "Port"
        df['Scrape Timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return df

    except Exception as e:
        print(f"\nAn unexpected error occurred during execution: {e}")
        return None
    finally:
        if driver:
            print("\nScript finished. Closing browser...")
            driver.quit()

# --- Run the scraper ---
inventory_data = scrape_sxcoal_inventory_definitive()

# --- Display the final DataFrame ---
if inventory_data is not None and not inventory_data.empty:
    print("\n\n--- SCRAPING SUCCEEDED ---")
    display(inventory_data) 
else:
    print("\n\n--- SCRAPING FAILED ---")
    print("Please review the messages. If it failed, the website's main article headline for inventory has likely changed.")

--- Initializing browser with definitive link-text strategy ---
Navigating to: https://www.sxcoal.com/news/search?search=%E7%85%A4%E7%82%AD%E5%BA%93%E5%AD%98

Waiting for the target article with headline: '调入增量明显 秦港煤炭库存震荡增加'
>>> SUCCESS: Found the target article link!
Clicking the link to read the full report...

An unexpected error occurred during execution: Message: element click intercepted: Element <a href="/news/detail/1941047223060611073" target="_blank" rel="noreferrer">...</a> is not clickable at point (850, 943). Other element would receive the click: <p>...</p>
  (Session info: chrome=138.0.7204.96); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#elementclickinterceptedexception
Stacktrace:
	GetHandleVerifier [0x0x1234553+62419]
	GetHandleVerifier [0x0x1234594+62484]
	(No symbol) [0x0x1072133]
	(No symbol) [0x0x10c0c40]
	(No symbol) [0x0x10beffa]
	(No symbol) [0x0x10bcb57]
	(No symbol) [0x0x10bbe14]
	(No 