In [7]:
import pandas as pd
from datetime import datetime
import time
import json

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

def get_cme_settlements_final_corrected(trade_date: str = None):
    """
    Fetch CME settlement rows by calling the settlements JSON endpoint from
    inside a headless Chrome session.

    The public settlements page is opened first; after a fixed pause the
    endpoint is requested with `fetch` from within that page, so the request
    is issued by the browser rather than by Python.

    Args:
        trade_date (str, optional): Trade date as 'YYYYMMDD'. Today's date is
            used when the argument is omitted or empty.

    Returns:
        pandas.DataFrame: One row per entry of the response's 'settlements'
            list, with a 'priorSettlePrice' column exposed as 'priorSettle'.
            An empty DataFrame is returned when the list is empty or when any
            step inside the browser session fails.
    """
    trade_date = trade_date or datetime.today().strftime('%Y%m%d')

    landing_url = "https://www.cmegroup.com/markets/energy/refined-products/singapore-fob-marine-fuel-05-platts.settlements.html"

    # 'v1' endpoint; the product code 'S5F' was read from the page's network
    # traffic (4286 was the older numeric ID).
    settlements_endpoint = (
        "https://www.cmegroup.com/CmeWS/mvc/v1/settlements/Future/S5F/FUT"
        f"?tradeDate={trade_date}&strategy=DEFAULT"
    )

    print("--- Initializing automated browser ---")

    # Chrome flags, applied in this order.
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        '--log-level=3',
    )
    browser_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        browser_options.add_argument(flag)

    browser = None
    try:
        browser = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=browser_options,
        )
        # Upper bound (seconds) for the async fetch script below.
        browser.set_script_timeout(45)

        print("Navigating to the main page to authenticate...")
        browser.get(landing_url)

        # Fixed pause so any challenge scripts on the landing page can finish.
        print("Waiting for security challenges to complete...")
        time.sleep(10)

        print(f"Executing API call for {trade_date} from within the browser...")

        # Selenium appends its completion callback as the last script argument;
        # it receives either the parsed JSON or an {'error': ...} object.
        fetch_script = """
            const url = arguments[0];
            const callback = arguments[1];
            
            fetch(url)
                .then(response => {
                    if (!response.ok) {
                        throw new Error('API request failed with status: ' + response.status + ' ' + response.statusText);
                    }
                    return response.json();
                })
                .then(data => callback(data))
                .catch(error => callback({ 'error': error.toString() }));
        """

        payload = browser.execute_async_script(fetch_script, settlements_endpoint)

        if 'error' in payload:
            raise Exception(f"JavaScript execution failed: {payload['error']}")

        print("Successfully received data from the API.")

        rows = payload.get('settlements', [])
        if rows:
            # rename() ignores a missing key, so this only acts when the newer
            # 'priorSettlePrice' column name is present.
            frame = pd.DataFrame(rows).rename(columns={'priorSettlePrice': 'priorSettle'})
            print(f"Successfully created DataFrame for trade date: {payload.get('tradeDate')}")
            return frame

        print(f"No settlement data found for {payload.get('tradeDate', 'the specified date')}.")
        return pd.DataFrame()

    except Exception as exc:
        print(f"An error occurred: {exc}")
        return pd.DataFrame()
    finally:
        # Always release the browser, including on the early-return paths.
        if browser:
            browser.quit()
            print("Browser has been closed.")


# --- Example Usage ---
if __name__ == "__main__":
    # Demo run: pull one historical trade date and show its headline columns.
    banner = "\n" + "="*50 + "\n"
    summary_columns = ['month', 'open', 'high', 'low', 'settle', 'volume', 'openInterest']

    print(banner)
    print("--- Getting historical data for May 15, 2024 ---")
    historical_df = get_cme_settlements_final_corrected(trade_date='20240515')

    # An empty frame means the fetch failed or returned no rows; stay silent then.
    has_rows = not historical_df.empty
    if has_rows:
        print("\nHistorical Data for 2024-05-15:")
        print(historical_df[summary_columns])



--- Getting historical data for May 15, 2024 ---
--- Initializing automated browser ---
Navigating to the main page to authenticate...
Waiting for security challenges to complete...
Executing API call for 20240515 from within the browser...
An error occurred: JavaScript execution failed: Error: API request failed with status: 404 
Browser has been closed.


In [14]:
import pandas as pd
import re
from datetime import datetime
import time

try:
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
except ImportError:
    print("Required libraries not found. Installing them now...")
    %pip install undetected-chromedriver pandas
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

def _resolve_report_date(date_str, reference_date):
    """Expand a year-less 'M月D日' string into a datetime not later than reference_date."""
    month, day = (int(part) for part in re.fullmatch(r"(\d+)月(\d+)日", date_str).groups())

    # A month/day that falls after the reference day cannot belong to the
    # reference year without being in the future, so it is assigned to the
    # previous year (e.g. '12月31日' read in early January).
    year = reference_date.year
    if (month, day) > (reference_date.month, reference_date.day):
        year -= 1

    # '2月29日' only exists in leap years: step back to the nearest year in
    # which the date is valid. Eight candidates also cover a non-leap century year.
    for candidate_year in range(year, year - 8, -1):
        try:
            return datetime(candidate_year, month, day)
        except ValueError:
            continue
    raise ValueError(f"'{date_str}' is not a valid calendar date")


def scrape_and_reshape_inventory_data(reference_date=None):
    """
    Scrape coal-inventory articles from the sxcoal.com search page and pivot
    them into a wide table with one column per port.

    Each article linked with the text '煤炭库存' is opened and its body text is
    searched for the first occurrence of 'X月X日，<port>港煤炭库存为<number>'.
    The matched date has no year, so the year is inferred from
    `reference_date` such that the result is never later than that day.

    Args:
        reference_date (optional): Object exposing `year`, `month` and `day`
            (e.g. a `datetime`) used to infer the year of scraped dates.
            Defaults to `datetime.now()`.

    Returns:
        pandas.DataFrame or None: Rows are dates (newest first, formatted
            'MM-DD-YY' in a 'Date' column) and the remaining columns are
            ports holding the inventory figure. If the same date/port pair
            is scraped more than once, `pivot_table`'s default aggregation
            (mean) combines the values. `None` is returned when no article
            links are found, nothing could be extracted, or a critical
            error occurs.
    """
    search_url = "https://www.sxcoal.com/news/search?search=%E7%85%A4%E7%82%AD%E5%BA%93%E5%AD%98"

    # Compiled once, outside the article loop: (date)，(port ending in 港)煤炭库存为(number)
    inventory_pattern = re.compile(r"(\d+月\d+日)，([\u4e00-\u9fa5]+港)煤炭库存为(\d+\.?\d*)")

    if reference_date is None:
        reference_date = datetime.now()

    records = []

    print("--- Initializing browser for final scrape and reshape task ---")

    driver = None
    try:
        options = uc.ChromeOptions()
        options.add_argument('--headless')

        driver = uc.Chrome(options=options, use_subprocess=False)
        wait = WebDriverWait(driver, 25)

        # --- Step 1: Gather all URLs ---
        print(f"Navigating to: {search_url}")
        driver.get(search_url)

        print("\nFinding all relevant inventory articles...")
        article_link_selector = (By.PARTIAL_LINK_TEXT, "煤炭库存")

        try:
            wait.until(EC.element_to_be_clickable(article_link_selector))
            article_elements = driver.find_elements(*article_link_selector)
            # get_attribute() yields None for anchors without an href; drop
            # those so sorting never has to compare None with str.
            hrefs = (el.get_attribute('href') for el in article_elements)
            article_urls = sorted({href for href in hrefs if href})
            print(f">>> Success! Found {len(article_urls)} unique articles to process.")
        except TimeoutException:
            print("\n--- FAILURE ---")
            print("Could not find any article links containing '煤炭库存'.")
            return None

        # --- Step 2: Loop through each URL and scrape data ---
        for i, url in enumerate(article_urls):
            print(f"\n--- Processing article {i+1} of {len(article_urls)} ---")
            try:
                driver.get(url)
                time.sleep(3)  # fixed pause to let the article body render
                page_text = driver.find_element(By.TAG_NAME, 'body').text

                # Only the first match per article is used.
                match = inventory_pattern.search(page_text)
                if match is None:
                    print(f"  > INFO: No data matching the pattern 'X月X日，[Port]煤炭库存为[Number]' was found.")
                    continue

                date_str = match.group(1)
                port_name = match.group(2).strip()
                inventory = float(match.group(3))
                # Resolved per article so one impossible date is logged and
                # skipped below instead of aborting the whole run.
                full_date = _resolve_report_date(date_str, reference_date)

                records.append({
                    "Date": date_str,
                    "FullDate": full_date,
                    "Port": port_name,
                    "Inventory": inventory,
                })
                print(f"  >>> SUCCESS: Found [{date_str}] '{port_name}' with inventory {inventory}")

            except Exception as e:
                # Best effort: report the failure and move on to the next article.
                print(f"  > ERROR: An error occurred processing this article: {type(e).__name__} - {e}")

        if not records:
            print("\nScraping finished, but no data could be extracted.")
            return None

        # --- Step 3: Reshape the data using a Pivot Table ---
        print("\n--- Data extraction complete. Reshaping table... ---")

        final_table = (
            pd.DataFrame(records)
            .pivot_table(index='FullDate', columns='Port', values='Inventory')
            .sort_index(ascending=False)  # newest date first
            .reset_index()
            .rename(columns={'FullDate': 'Date'})
        )

        # Present dates as month-day-two-digit-year strings.
        final_table['Date'] = final_table['Date'].dt.strftime('%m-%d-%y')

        return final_table

    except Exception as e:
        print(f"\nA critical error occurred: {type(e).__name__} - {e}")
        return None
    finally:
        if driver:
            print("\nScript finished. Closing browser...")
            driver.quit()

# --- Run the scraper ---
final_inventory_data = scrape_and_reshape_inventory_data()

# --- Display the final DataFrame ---
# A missing (None) result and an empty table are both reported as failure.
scrape_failed = final_inventory_data is None or final_inventory_data.empty
if scrape_failed:
    print("\n\n--- TASK FAILED OR NO DATA FOUND ---")
    print("Please review the messages above.")
else:
    print("\n\n--- SCRAPING AND RESHAPING COMPLETE ---")
    display(final_inventory_data)

--- Initializing browser for final scrape and reshape task ---
Navigating to: https://www.sxcoal.com/news/search?search=%E7%85%A4%E7%82%AD%E5%BA%93%E5%AD%98

Finding all relevant inventory articles...
>>> Success! Found 15 unique articles to process.

--- Processing article 1 of 15 ---
  >>> SUCCESS: Found [6月30日] '秦皇岛港' with inventory 578.0

--- Processing article 2 of 15 ---
  >>> SUCCESS: Found [7月1日] '秦皇岛港' with inventory 575.0

--- Processing article 3 of 15 ---
  > INFO: No data matching the pattern 'X月X日，[Port]煤炭库存为[Number]' was found.

--- Processing article 4 of 15 ---
  >>> SUCCESS: Found [7月2日] '秦皇岛港' with inventory 580.0

--- Processing article 5 of 15 ---
  >>> SUCCESS: Found [7月2日] '黄骅港' with inventory 175.6

--- Processing article 6 of 15 ---
  > INFO: No data matching the pattern 'X月X日，[Port]煤炭库存为[Number]' was found.

--- Processing article 7 of 15 ---
  >>> SUCCESS: Found [7月3日] '秦皇岛港' with inventory 580.0

--- Processing article 8 of 15 ---
  >>> SUCCESS: Found [7月3日]

Port,Date,秦皇岛港,黄骅港
0,07-08-25,575.0,182.5
1,07-07-25,576.0,
2,07-04-25,570.0,
3,07-03-25,580.0,180.3
4,07-02-25,580.0,175.6
5,07-01-25,575.0,
6,06-30-25,578.0,
