In [7]:
import pandas as pd
from datetime import datetime
import time
import json

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

def get_cme_settlements_final_corrected(trade_date: str = None):
    """
    Scrapes CME settlement data using a pure Selenium approach with the corrected v1 API endpoint.
    This method should be robust against all anti-scraping measures.

    Args:
        trade_date (str, optional): Date in 'YYYYMMDD' format. Defaults to today.
    
    Returns:
        pandas.DataFrame: DataFrame with settlement data, or empty DataFrame on failure.
    """
    main_page_url = "https://www.cmegroup.com/markets/energy/refined-products/singapore-fob-marine-fuel-05-platts.settlements.html"
    
    if not trade_date:
        trade_date = datetime.today().strftime('%Y%m%d')
        
    # The corrected API URL with '/v1/'
    # The product ID 'S5F' is found by inspecting the network traffic on the page. 4286 was the old numeric ID.
    api_url = f"https://www.cmegroup.com/CmeWS/mvc/v1/settlements/Future/S5F/FUT?tradeDate={trade_date}&strategy=DEFAULT"

    print("--- Initializing automated browser ---")
    
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36')
    options.add_argument('--log-level=3')

    driver = None
    try:
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        driver.set_script_timeout(45)

        print("Navigating to the main page to authenticate...")
        driver.get(main_page_url)
        
        print("Waiting for security challenges to complete...")
        time.sleep(10)

        print(f"Executing API call for {trade_date} from within the browser...")

        js_script = """
            const url = arguments[0];
            const callback = arguments[1];
            
            fetch(url)
                .then(response => {
                    if (!response.ok) {
                        throw new Error('API request failed with status: ' + response.status + ' ' + response.statusText);
                    }
                    return response.json();
                })
                .then(data => callback(data))
                .catch(error => callback({ 'error': error.toString() }));
        """

        api_response = driver.execute_async_script(js_script, api_url)
        
        if 'error' in api_response:
            raise Exception(f"JavaScript execution failed: {api_response['error']}")

        print("Successfully received data from the API.")
        
        settlements_data = api_response.get('settlements', [])

        if not settlements_data:
            print(f"No settlement data found for {api_response.get('tradeDate', 'the specified date')}.")
            return pd.DataFrame()

        df = pd.DataFrame(settlements_data)
        # The 'priorSettle' key seems to have been renamed to 'priorSettlePrice' in the new API
        if 'priorSettlePrice' in df.columns:
            df.rename(columns={'priorSettlePrice': 'priorSettle'}, inplace=True)
            
        print(f"Successfully created DataFrame for trade date: {api_response.get('tradeDate')}")
        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()
    finally:
        if driver:
            driver.quit()
            print("Browser has been closed.")


# --- Example Usage ---
if __name__ == "__main__":
    print("\n" + "="*50 + "\n")
    print("--- Getting historical data for May 15, 2024 ---")
    historical_df = get_cme_settlements_final_corrected(trade_date='20240515')
    if not historical_df.empty:
        print("\nHistorical Data for 2024-05-15:")
        print(historical_df[['month', 'open', 'high', 'low', 'settle', 'volume', 'openInterest']])



--- Getting historical data for May 15, 2024 ---
--- Initializing automated browser ---
Navigating to the main page to authenticate...
Waiting for security challenges to complete...
Executing API call for 20240515 from within the browser...
An error occurred: JavaScript execution failed: Error: API request failed with status: 404 
Browser has been closed.


In [12]:
import pandas as pd
import re
from datetime import datetime
import time

try:
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
except ImportError:
    print("Required libraries not found. Installing them now...")
    %pip install undetected-chromedriver pandas
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

def scrape_all_inventory_data_final():
    """
    Final, robust version. It finds all articles, navigates to them, and grabs
    all visible text from the page to ensure the data is found regardless of page structure.
    """
    search_url = "https://www.sxcoal.com/news/search?search=%E7%85%A4%E7%82%AD%E5%BA%93%E5%AD%98"
    
    all_results = []

    print("--- Initializing browser for multi-article scraping ---")
    
    driver = None
    try:
        options = uc.ChromeOptions()
        # Add '--headless' to hide the browser window after you confirm it's working.
        # options.add_argument('--headless')
        
        driver = uc.Chrome(options=options, use_subprocess=False)
        wait = WebDriverWait(driver, 20) 

        # --- Step 1: Gather all URLs (This part is already working well) ---
        print(f"Navigating to: {search_url}")
        driver.get(search_url)

        print("\nWaiting for inventory articles to appear...")
        article_link_selector = (By.PARTIAL_LINK_TEXT, "煤炭库存")
        
        try:
            wait.until(EC.element_to_be_clickable(article_link_selector))
            article_elements = driver.find_elements(*article_link_selector)
            article_urls = sorted(list(set([el.get_attribute('href') for el in article_elements])))
            print(f">>> Success! Found {len(article_urls)} unique articles to scrape.")
        except TimeoutException:
            print("\n--- FAILURE ---")
            print("Could not find any links containing '煤炭库存'. The page structure may have changed.")
            return None 

        # --- Step 2: Loop through each URL and scrape data with the robust method ---
        for i, url in enumerate(article_urls):
            print(f"\n--- Scraping article {i+1} of {len(article_urls)} ---")
            print(f"URL: {url}")
            try:
                driver.get(url)
                
                # *** THIS IS THE KEY FIX ***
                # Instead of waiting for a specific element, just wait for the page to be ready.
                # A small, fixed wait is a simple and effective way to handle pages with different structures.
                time.sleep(3) # Wait 3 seconds for the page to render fully.
                
                # Grab all text from the body of the page. This is guaranteed to exist.
                page_text = driver.find_element(By.TAG_NAME, 'body').text

                # The flexible regex to find "[Port Name]煤炭库存为[Number]"
                pattern = re.compile(r"([\u4e00-\u9fa5]+港)煤炭库存为(\d+\.?\d*)")
                match = pattern.search(page_text)
                
                if match:
                    port_name = match.group(1).strip()
                    inventory = float(match.group(2))
                    
                    all_results.append({"Port": port_name, "Inventory (10k tons)": inventory})
                    print(f"  >>> SUCCESS: Found '{port_name}' with inventory {inventory}")
                else:
                    print(f"  > INFO: No data matching the pattern was found in this article's text.")
                    
            except Exception as e:
                # This provides a much more helpful error message.
                print(f"  > ERROR: An error occurred processing this article: {type(e).__name__} - {e}")
        
        if not all_results:
            print("\nScraping finished, but no usable data could be extracted.")
            return None
            
        df = pd.DataFrame(all_results)
        df['Scrape Timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df.drop_duplicates(subset=['Port', 'Inventory (10k tons)'], keep='first', inplace=True)
        return df

    except Exception as e:
        print(f"\nA critical error occurred: {type(e).__name__} - {e}")
        return None
    finally:
        if driver:
            print("\nScript finished. Closing browser...")
            driver.quit()

# --- Run the scraper ---
inventory_data = scrape_all_inventory_data_final()

# --- Display the final DataFrame ---
if inventory_data is not None and not inventory_data.empty:
    print("\n\n--- SCRAPING COMPLETE ---")
    display(inventory_data) 
else:
    print("\n\n--- SCRAPING FAILED OR NO DATA FOUND ---")
    print("Please review the messages above.")

--- Initializing browser for multi-article scraping ---
Navigating to: https://www.sxcoal.com/news/search?search=%E7%85%A4%E7%82%AD%E5%BA%93%E5%AD%98

Waiting for inventory articles to appear...
>>> Success! Found 15 unique articles to scrape.

--- Scraping article 1 of 15 ---
URL: https://www.sxcoal.com/news/detail/1939511707756490753
  >>> SUCCESS: Found '秦皇岛港' with inventory 578.0

--- Scraping article 2 of 15 ---
URL: https://www.sxcoal.com/news/detail/1939874095525621762
  >>> SUCCESS: Found '秦皇岛港' with inventory 575.0

--- Scraping article 3 of 15 ---
URL: https://www.sxcoal.com/news/detail/1939874096851021826
  > INFO: No data matching the pattern was found in this article's text.

--- Scraping article 4 of 15 ---
URL: https://www.sxcoal.com/news/detail/1940236483366043649
  >>> SUCCESS: Found '秦皇岛港' with inventory 580.0

--- Scraping article 5 of 15 ---
URL: https://www.sxcoal.com/news/detail/1940236484091658242
  >>> SUCCESS: Found '黄骅港' with inventory 175.6

--- Scraping arti

Unnamed: 0,Port,Inventory (10k tons),Scrape Timestamp
0,秦皇岛港,578.0,2025-07-08 15:04:28
1,秦皇岛港,575.0,2025-07-08 15:04:28
2,秦皇岛港,580.0,2025-07-08 15:04:28
3,黄骅港,175.6,2025-07-08 15:04:28
5,黄骅港,180.3,2025-07-08 15:04:28
6,秦皇岛港,570.0,2025-07-08 15:04:28
7,秦皇岛港,576.0,2025-07-08 15:04:28
9,黄骅港,182.5,2025-07-08 15:04:28
