In [4]:
# ==============================================================================
# STEP 1: KAGGLE AUTH & PYTHON DEPENDENCIES
# ==============================================================================
print("--- Installing Python Dependencies ---")
!pip install -q selenium pandas kaggle # <--- REVERT BACK TO THIS
# ... rest of your imports

import os
import pandas as pd
import logging
import json
import re
from datetime import datetime
from kaggle_secrets import UserSecretsClient
from importlib import reload

# Force logging to be active so we see all messages
reload(logging)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

print("\n--- Setting up Kaggle API Authentication ---")
api = None
try:
    user_secrets = UserSecretsClient()
    secret_value = user_secrets.get_secret("KAGGLE_JSON")
    kaggle_dir = os.path.expanduser('~/.kaggle')
    os.makedirs(kaggle_dir, exist_ok=True)
    kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
    with open(kaggle_json_path, 'w') as f: f.write(secret_value)
    os.chmod(kaggle_json_path, 600)
    
    from kaggle.api.kaggle_api_extended import KaggleApi
    api = KaggleApi()
    api.authenticate()
    print("Kaggle API Authentication Successful.")
except Exception as e:
    logging.critical(f"FATAL: A critical error occurred during Kaggle setup. Error: {e}")
    raise

# ==============================================================================
# STEP 2: SYSTEM INSTALLATIONS (CHROME)
# ==============================================================================
print("\n--- Installing Google Chrome & ChromeDriver ---")
# Using quiet flags to keep the log clean
!sudo apt-get update > /dev/null
!sudo apt-get install -y wget gnupg > /dev/null
!wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
!sudo sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list'
!sudo apt-get update > /dev/null
!sudo apt-get install -y google-chrome-stable > /dev/null
!apt-get install -y chromium-chromedriver > /dev/null
!cp /usr/lib/chromium-browser/chromedriver /usr/bin &>/dev/null
print("--- Chrome & ChromeDriver Setup Complete ---")


# ==============================================================================
# STEP 3: SCRAPER FUNCTIONS (WITH ROBUSTNESS FIXES)
# ==============================================================================
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def get_all_leagues_and_games(driver):
    """
    Scrapes the main basketball page with robust waits and debugging.
    """
    url = "https://www.pinnacle.com/en/basketball/matchups/"
    logging.info(f"Navigating to matchups page: {url}")
    driver.get(url)

    # Handle cookie banner if it appears
    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))).click()
        logging.info("Clicked the Accept button for cookies."); time.sleep(2)
    except TimeoutException:
        logging.warning("Cookie banner not found or already handled.")

    leagues_data = {}
    current_league_name = None

    try:
        # ROBUSTNESS FIX #1: Wait for the main content container to be present.
        # This is the single most important element to wait for.
        # Selector targets the block that holds all the leagues and games.
        content_container_selector = (By.CSS_SELECTOR, ".contentBlock.square")
        logging.info("Waiting for the main content container to load...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(content_container_selector)
        )
        logging.info("Main content container found. Proceeding to scrape rows.")
        
        # Give the JS a brief moment to render everything after the container is found
        time.sleep(2)

        all_rows = driver.find_elements(By.CSS_SELECTOR, ".contentBlock.square > div[class*='row-']")
        if not all_rows:
            logging.error("Content container was found, but it contains no game or league rows.")
            # ROBUSTNESS FIX #2: Save page source for debugging if rows are missing
            with open("debug_page_no_rows.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            logging.info("Saved debug_page_no_rows.html to output for analysis.")
            return {}

        logging.info(f"Found {len(all_rows)} total rows to process on the matchups page.")

        for row in all_rows:
            row_class = row.get_attribute('class')
            
            if 'row-CTcjEjV6yK' in row_class:
                try:
                    league_name = row.find_element(By.CSS_SELECTOR, "a span").text.strip()
                    if league_name:
                        current_league_name = league_name
                        leagues_data[current_league_name] = []
                        logging.info(f"Discovered new league section: {current_league_name}")
                except NoSuchElementException:
                    continue 

            elif 'row-k9ktBvvTsJ' in row_class and current_league_name:
                try:
                    game = {}
                    link_tag = row.find_element(By.CSS_SELECTOR, "a[href*='/basketball/']")
                    teams = link_tag.find_elements(By.CSS_SELECTOR, "span.ellipsis.gameInfoLabel-EDDYv5xEfd")
                    game['team1'], game['team2'] = teams[0].text, teams[1].text
                    game['game_link'] = link_tag.get_attribute('href')
                    
                    odds_groups = row.find_elements(By.CSS_SELECTOR, "div.buttons-j19Jlcwsi9")
                    def get_text(elements, index): return elements[index].text if index < len(elements) else 'N/A'
                    
                    h_spans = odds_groups[0].find_elements(By.CSS_SELECTOR, "button span")
                    ml_spans = odds_groups[1].find_elements(By.CSS_SELECTOR, "span.price-r5BU0ynJha")
                    t_spans = odds_groups[2].find_elements(By.CSS_SELECTOR, "button span")
                    
                    game.update({'team1_moneyline': get_text(ml_spans, 0), 'team2_moneyline': get_text(ml_spans, 1),'team1_spread': get_text(h_spans, 0), 'team1_spread_odds': get_text(h_spans, 1),'team2_spread': get_text(h_spans, 2), 'team2_spread_odds': get_text(h_spans, 3),'over_total': get_text(t_spans, 0), 'over_total_odds': get_text(t_spans, 1),'under_total': get_text(t_spans, 2), 'under_total_odds': get_text(t_spans, 3)})
                    
                    leagues_data[current_league_name].append(game)
                except (NoSuchElementException, IndexError):
                    continue

    except TimeoutException:
        # ROBUSTNESS FIX #2 (Primary use case): Save page source if main container never loads
        logging.error("FATAL: Timed out waiting for the main content container. The page may be blocked or changed.")
        with open("debug_page.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        logging.info("Saved debug_page.html to output. This file will show what the scraper saw (e.g., a CAPTCHA).")
    
    return leagues_data

def scrape_detailed_game_odds(driver, game_url):
    logging.info(f"Scraping detailed odds from: {game_url}")
    driver.get(game_url)
    all_markets_data = []
    try:
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.marketGroups-HjCkfKkLNt"))); time.sleep(2)
        market_groups = driver.find_elements(By.CSS_SELECTOR, "div.marketGroup-wMlWprW2iC")
        for group in market_groups:
            market_title = group.find_element(By.CSS_SELECTOR, "span.titleText-BgvECQYfHf").text
            if not group.find_elements(By.CSS_SELECTOR, "ul[data-test-id]"):
                for btn in group.find_elements(By.CSS_SELECTOR, "button"):
                    parts = btn.text.split('\n')
                    if len(parts) == 2: all_markets_data.append({'Market': market_title, 'Selection': parts[0], 'Odds': parts[1]})
                continue
            headers = [h.text for h in group.find_elements(By.CSS_SELECTOR, "ul[data-test-id] > li")]
            button_rows = group.find_elements(By.CSS_SELECTOR, ".buttonRow-zWMLOGu5YB")
            for row in button_rows:
                buttons = row.find_elements(By.TAG_NAME, 'button')
                if len(buttons) == len(headers):
                    for i, btn in enumerate(buttons):
                        parts = btn.text.split('\n')
                        if len(parts) == 2:
                            selection_name = f"{headers[i]} {parts[0]}"
                            all_markets_data.append({'Market': market_title, 'Selection': selection_name, 'Odds': parts[1]})
    except TimeoutException:
        logging.error(f"Could not load market data for URL: {game_url}")
    return pd.DataFrame(all_markets_data)

def to_slug(name):
    return re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')

# ==============================================================================
# STEP 4: MAIN DATA PIPELINE EXECUTION
# ==============================================================================
print("\n--- Starting Data Pipeline Execution ---")
if __name__ == "__main__" and api:
    DATASET_SLUG = "zachht/wnba-odds-history" 
    WORKING_DIR = "/kaggle/working"
    
    driver = None
    leagues_updated = []
    try:
        # ROBUSTNESS FIX #3: More realistic browser options
        # --- REPLACE WITH THIS BLOCK ---
        # --- REPLACE WITH THIS NEW, MORE STABLE BLOCK ---
        from selenium import webdriver

        logging.info("Initializing a smarter, stealthier Selenium driver...")
        options = webdriver.ChromeOptions()

        # Use the new headless mode which is harder to detect
        options.add_argument("--headless=new") 
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--window-size=1920,1080")
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36")

        # Key options to make Selenium look less like a bot
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(options=options)
        # We also need to execute a command to fool the bot detector
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
                Object.defineProperty(navigator, 'webdriver', {
                  get: () => undefined
                })
            '''
        })

        logging.info("Smarter Selenium driver initialized.")
        
        
        all_leagues_games = get_all_leagues_and_games(driver)

        if not all_leagues_games:
            logging.warning("Scraping finished, but no leagues were found on the site. Check debug files if they were created.")
        else:
            for league_name, new_main_lines_data in all_leagues_games.items():
                if not new_main_lines_data:
                    logging.info(f"No games found for league: {league_name}. Skipping.")
                    continue

                logging.info(f"\n--- Processing League: {league_name} ({len(new_main_lines_data)} games found) ---")
                leagues_updated.append(league_name)
                league_slug = to_slug(league_name)

                MAIN_CSV_PATH = os.path.join(WORKING_DIR, f"{league_slug}_main_lines.csv")
                DETAILED_CSV_PATH = os.path.join(WORKING_DIR, f"{league_slug}_detailed_odds.csv")

                try:
                    logging.info(f"Downloading existing files for {league_name}...")
                    api.dataset_download_file(DATASET_SLUG, file_name=os.path.basename(MAIN_CSV_PATH), path=WORKING_DIR)
                    api.dataset_download_file(DATASET_SLUG, file_name=os.path.basename(DETAILED_CSV_PATH), path=WORKING_DIR)
                    old_main_df = pd.read_csv(MAIN_CSV_PATH)
                    old_detailed_df = pd.read_csv(DETAILED_CSV_PATH)
                    logging.info("Successfully loaded existing data.")
                except Exception:
                    logging.warning(f"Could not load existing data for {league_name}. Starting with fresh history files.")
                    old_main_df, old_detailed_df = pd.DataFrame(), pd.DataFrame()

                scrape_timestamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
                new_main_df = pd.DataFrame(new_main_lines_data)
                new_main_df['timestamp'] = scrape_timestamp
                combined_main_df = pd.concat([old_main_df, new_main_df], ignore_index=True)
                
                all_detailed_dfs = []
                for game in new_main_lines_data:
                    detailed_df = scrape_detailed_game_odds(driver, game['game_link'])
                    if not detailed_df.empty:
                        detailed_df['matchup'] = f"{game['team1']} vs {game['team2']}"
                        all_detailed_dfs.append(detailed_df)
                
                if all_detailed_dfs:
                    new_detailed_df = pd.concat(all_detailed_dfs, ignore_index=True)
                    new_detailed_df['timestamp'] = scrape_timestamp
                    combined_detailed_df = pd.concat([old_detailed_df, new_detailed_df], ignore_index=True)
                    
                    logging.info(f"Saving combined data to local CSVs for {league_name}...")
                    combined_main_df.to_csv(MAIN_CSV_PATH, index=False)
                    combined_detailed_df.to_csv(DETAILED_CSV_PATH, index=False)
            
            if leagues_updated:
                logging.info("\n--- Finalizing and Uploading to Kaggle ---")
                metadata_path = os.path.join(WORKING_DIR, 'dataset-metadata.json')
                metadata = {"title": "Pinnacle Basketball Odds History", "id": DATASET_SLUG, "licenses": [{"name": "CC0-1.0"}]}
                with open(metadata_path, 'w') as f: json.dump(metadata, f)
                
                version_note = f"Automated odds update. Leagues updated: {', '.join(leagues_updated)}."
                logging.info(f"Pushing new dataset version. {version_note}")
                api.dataset_create_version(folder=WORKING_DIR, version_notes=version_note, quiet=False, dir_mode='zip')
            else:
                logging.warning("No games were found for any leagues. No new version will be pushed.")

    except Exception as e:
        logging.error(f"An error occurred during the main pipeline: {e}", exc_info=True)
    finally:
        if driver: driver.quit(); logging.info("Selenium driver closed.")

print("\n--- Data Pipeline Execution Finished ---")

--- Installing Python Dependencies ---

--- Setting up Kaggle API Authentication ---
Kaggle API Authentication Successful.

--- Installing Google Chrome & ChromeDriver ---
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
OK
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


2025-08-25 05:53:34,450 - INFO - Initializing a smarter, stealthier Selenium driver...


--- Chrome & ChromeDriver Setup Complete ---

--- Starting Data Pipeline Execution ---


2025-08-25 05:53:34,949 - INFO - Smarter Selenium driver initialized.
2025-08-25 05:53:34,950 - INFO - Navigating to matchups page: https://www.pinnacle.com/en/basketball/matchups/
2025-08-25 05:53:45,435 - INFO - Waiting for the main content container to load...
2025-08-25 05:53:45,452 - INFO - Main content container found. Proceeding to scrape rows.
2025-08-25 05:53:47,471 - INFO - Found 26 total rows to process on the matchups page.
2025-08-25 05:53:47,520 - INFO - Discovered new league section: WNBA
2025-08-25 05:53:47,897 - INFO - Discovered new league section: BRAZIL - PAULISTA FPB U20
2025-08-25 05:53:48,181 - INFO - Discovered new league section: FIBA - AMERICUP
2025-08-25 05:53:48,446 - INFO - Discovered new league section: WORLD - CLUB FRIENDLIES
2025-08-25 05:53:48,717 - INFO - Discovered new league section: WNBA
2025-08-25 05:53:48,998 - INFO - Discovered new league section: FIBA - AMERICUP
2025-08-25 05:53:49,291 - INFO - Discovered new league section: FIBA - EUROBASKET
20

Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history
Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history


2025-08-25 05:53:52,918 - INFO - Successfully loaded existing data.
2025-08-25 05:53:52,927 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/wnba/las-vegas-aces-vs-chicago-sky/1613894209/
2025-08-25 05:53:58,117 - INFO - Saving combined data to local CSVs for WNBA...
2025-08-25 05:53:58,126 - INFO - 
--- Processing League: BRAZIL - PAULISTA FPB U20 (1 games found) ---
2025-08-25 05:53:58,127 - INFO - Downloading existing files for BRAZIL - PAULISTA FPB U20...


Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history
Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history


2025-08-25 05:53:59,411 - INFO - Successfully loaded existing data.
2025-08-25 05:53:59,415 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/brazil-paulista-fpb-u20/sao-jose-basketball-vs-paulistano/1613870162/
2025-08-25 05:54:04,298 - INFO - Saving combined data to local CSVs for BRAZIL - PAULISTA FPB U20...
2025-08-25 05:54:04,302 - INFO - 
--- Processing League: FIBA - AMERICUP (1 games found) ---
2025-08-25 05:54:04,303 - INFO - Downloading existing files for FIBA - AMERICUP...


Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history
Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history


2025-08-25 05:54:05,031 - INFO - Successfully loaded existing data.
2025-08-25 05:54:05,035 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/fiba-americup/canada-vs-puerto-rico/1613591100/
2025-08-25 05:54:09,250 - INFO - Saving combined data to local CSVs for FIBA - AMERICUP...
2025-08-25 05:54:09,253 - INFO - 
--- Processing League: WORLD - CLUB FRIENDLIES (1 games found) ---
2025-08-25 05:54:09,254 - INFO - Downloading existing files for WORLD - CLUB FRIENDLIES...


Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history
Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history


2025-08-25 05:54:09,974 - INFO - Successfully loaded existing data.
2025-08-25 05:54:09,976 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/world-club-friendlies/changwon-lg-sakers-vs-up-fighting-maroons/1614058987/
2025-08-25 05:54:13,923 - INFO - Saving combined data to local CSVs for WORLD - CLUB FRIENDLIES...
2025-08-25 05:54:13,927 - INFO - 
--- Processing League: FIBA - EUROBASKET (6 games found) ---
2025-08-25 05:54:13,927 - INFO - Downloading existing files for FIBA - EUROBASKET...


Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history
Dataset URL: https://www.kaggle.com/datasets/zachht/wnba-odds-history


2025-08-25 05:54:14,736 - INFO - Successfully loaded existing data.
2025-08-25 05:54:14,739 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/fiba-eurobasket/georgia-vs-spain/1613561311/
2025-08-25 05:54:18,911 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/fiba-eurobasket/israel-vs-iceland/1613551699/
2025-08-25 05:54:37,246 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/fiba-eurobasket/belgium-vs-france/1613561310/
2025-08-25 05:54:57,928 - ERROR - Could not load market data for URL: https://www.pinnacle.com/en/basketball/fiba-eurobasket/belgium-vs-france/1613561310/
2025-08-25 05:54:57,930 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/fiba-eurobasket/bosnia-herzegovina-vs-cyprus/1613559812/
2025-08-25 05:55:02,692 - INFO - Scraping detailed odds from: https://www.pinnacle.com/en/basketball/fiba-eurobasket/greece-vs-italy/1613551698/
2025-08-25 05:55:07,260 - INFO - S

Starting upload for file fiba_americup_detailed_odds.csv


100%|██████████| 960/960 [00:00<00:00, 2.76kB/s]


Upload successful: fiba_americup_detailed_odds.csv (960B)
Starting upload for file .virtual_documents.zip


100%|██████████| 22.0/22.0 [00:00<00:00, 64.7B/s]


Upload successful: .virtual_documents.zip (22B)
Starting upload for file brazil_paulista_fpb_u20_main_lines.csv


100%|██████████| 616/616 [00:00<00:00, 1.84kB/s]


Upload successful: brazil_paulista_fpb_u20_main_lines.csv (616B)
Starting upload for file fiba_eurobasket_detailed_odds.csv


100%|██████████| 4.41k/4.41k [00:00<00:00, 13.2kB/s]


Upload successful: fiba_eurobasket_detailed_odds.csv (4KB)
Starting upload for file world_club_friendlies_detailed_odds.csv


100%|██████████| 9.21k/9.21k [00:00<00:00, 28.9kB/s]


Upload successful: world_club_friendlies_detailed_odds.csv (9KB)
Starting upload for file wnba_detailed_odds.csv


100%|██████████| 1.85k/1.85k [00:00<00:00, 5.48kB/s]


Upload successful: wnba_detailed_odds.csv (2KB)
Starting upload for file wnba_main_lines.csv


100%|██████████| 556/556 [00:00<00:00, 1.61kB/s]


Upload successful: wnba_main_lines.csv (556B)
Starting upload for file world_club_friendlies_main_lines.csv


100%|██████████| 640/640 [00:00<00:00, 1.87kB/s]


Upload successful: world_club_friendlies_main_lines.csv (640B)
Starting upload for file fiba_americup_main_lines.csv


100%|██████████| 539/539 [00:00<00:00, 1.61kB/s]


Upload successful: fiba_americup_main_lines.csv (539B)
Starting upload for file debug_page.html


100%|██████████| 173k/173k [00:00<00:00, 296kB/s]


Upload successful: debug_page.html (173KB)
Starting upload for file brazil_paulista_fpb_u20_detailed_odds.csv


100%|██████████| 8.29k/8.29k [00:00<00:00, 24.9kB/s]


Upload successful: brazil_paulista_fpb_u20_detailed_odds.csv (8KB)
Starting upload for file fiba_eurobasket_main_lines.csv


100%|██████████| 2.29k/2.29k [00:00<00:00, 6.97kB/s]


Upload successful: fiba_eurobasket_main_lines.csv (2KB)


2025-08-25 05:55:20,825 - INFO - Selenium driver closed.



--- Data Pipeline Execution Finished ---


In [8]:
import time

print("This message prints immediately.")

# Pause execution for 3 seconds
time.sleep(3)

print("This message prints after a 3-second delay.")

This message prints immediately.
This message prints after a 3-second delay.


In [None]:
# ==============================================================================
# STEP 1: KAGGLE AUTH & PYTHON DEPENDENCIES
# ==============================================================================
print("--- Installing Python Dependencies ---")
!pip install -q selenium pandas kaggle undetected-chromedriver

import os
import pandas as pd
import logging
import json
import re
from datetime import datetime
from kaggle_secrets import UserSecretsClient
from importlib import reload

# Force logging to be active so we see all messages
reload(logging)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

print("\n--- Setting up Kaggle API Authentication ---")
api = None
try:
    user_secrets = UserSecretsClient()
    secret_value = user_secrets.get_secret("KAGGLE_JSON")
    kaggle_dir = os.path.expanduser('~/.kaggle')
    os.makedirs(kaggle_dir, exist_ok=True)
    kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
    with open(kaggle_json_path, 'w') as f: f.write(secret_value)
    os.chmod(kaggle_json_path, 600)
    
    from kaggle.api.kaggle_api_extended import KaggleApi
    api = KaggleApi()
    api.authenticate()
    print("Kaggle API Authentication Successful.")
except Exception as e:
    logging.critical(f"FATAL: A critical error occurred during Kaggle setup. Error: {e}")
    raise

# ==============================================================================
# STEP 2: SYSTEM INSTALLATIONS (CHROME) - THIS IS INTENTIONALLY LEFT BLANK
# We will use the Chrome version pre-installed in the Kaggle environment
# to avoid conflicts with undetected-chromedriver.
# ==============================================================================
print("\n--- Skipping manual Chrome installation to use the environment's default ---")


# ==============================================================================
# STEP 3: SCRAPER FUNCTIONS (WITH ROBUSTNESS FIXES)
# ==============================================================================
import time
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def get_all_leagues_and_games(driver):
    """
    Scrapes the main basketball page with robust waits and debugging.
    """
    url = "https://www.pinnacle.com/en/basketball/matchups/"
    logging.info(f"Navigating to matchups page: {url}")
    driver.get(url)

    # Handle cookie banner if it appears
    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))).click()
        logging.info("Clicked the Accept button for cookies."); time.sleep(2)
    except TimeoutException:
        logging.warning("Cookie banner not found or already handled.")

    leagues_data = {}
    current_league_name = None

    try:
        # Wait for the main content container to be present.
        content_container_selector = (By.CSS_SELECTOR, ".contentBlock.square")
        logging.info("Waiting for the main content container to load...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(content_container_selector)
        )
        logging.info("Main content container found. Proceeding to scrape rows.")
        
        time.sleep(2) # Give JS a moment to render after container is found

        all_rows = driver.find_elements(By.CSS_SELECTOR, ".contentBlock.square > div[class*='row-']")
        if not all_rows:
            logging.error("Content container was found, but it contains no game or league rows.")
            with open("debug_page_no_rows.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            logging.info("Saved debug_page_no_rows.html to output for analysis.")
            return {}

        logging.info(f"Found {len(all_rows)} total rows to process on the matchups page.")

        for row in all_rows:
            row_class = row.get_attribute('class')
            
            if 'row-CTcjEjV6yK' in row_class:
                try:
                    league_name = row.find_element(By.CSS_SELECTOR, "a span").text.strip()
                    if league_name:
                        current_league_name = league_name
                        leagues_data[current_league_name] = []
                        logging.info(f"Discovered new league section: {current_league_name}")
                except NoSuchElementException:
                    continue 

            elif 'row-k9ktBvvTsJ' in row_class and current_league_name:
                try:
                    game = {}
                    link_tag = row.find_element(By.CSS_SELECTOR, "a[href*='/basketball/']")
                    teams = link_tag.find_elements(By.CSS_SELECTOR, "span.ellipsis.gameInfoLabel-EDDYv5xEfd")
                    game['team1'], game['team2'] = teams[0].text, teams[1].text
                    game['game_link'] = link_tag.get_attribute('href')
                    
                    odds_groups = row.find_elements(By.CSS_SELECTOR, "div.buttons-j19Jlcwsi9")
                    def get_text(elements, index): return elements[index].text if index < len(elements) else 'N/A'
                    
                    h_spans = odds_groups[0].find_elements(By.CSS_SELECTOR, "button span")
                    ml_spans = odds_groups[1].find_elements(By.CSS_SELECTOR, "span.price-r5BU0ynJha")
                    t_spans = odds_groups[2].find_elements(By.CSS_SELECTOR, "button span")
                    
                    game.update({'team1_moneyline': get_text(ml_spans, 0), 'team2_moneyline': get_text(ml_spans, 1),'team1_spread': get_text(h_spans, 0), 'team1_spread_odds': get_text(h_spans, 1),'team2_spread': get_text(h_spans, 2), 'team2_spread_odds': get_text(h_spans, 3),'over_total': get_text(t_spans, 0), 'over_total_odds': get_text(t_spans, 1),'under_total': get_text(t_spans, 2), 'under_total_odds': get_text(t_spans, 3)})
                    
                    leagues_data[current_league_name].append(game)
                except (NoSuchElementException, IndexError):
                    continue

    except TimeoutException:
        logging.error("FATAL: Timed out waiting for the main content container. The page may be blocked or changed.")
        with open("debug_page.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        logging.info("Saved debug_page.html to output. This file will show what the scraper saw (e.g., a CAPTCHA).")
    
    return leagues_data

def scrape_detailed_game_odds(driver, game_url):
    logging.info(f"Scraping detailed odds from: {game_url}")
    driver.get(game_url)
    all_markets_data = []
    try:
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.marketGroups-HjCkfKkLNt"))); time.sleep(2)
        market_groups = driver.find_elements(By.CSS_SELECTOR, "div.marketGroup-wMlWprW2iC")
        for group in market_groups:
            market_title = group.find_element(By.CSS_SELECTOR, "span.titleText-BgvECQYfHf").text
            if not group.find_elements(By.CSS_SELECTOR, "ul[data-test-id]"):
                for btn in group.find_elements(By.CSS_SELECTOR, "button"):
                    parts = btn.text.split('\n')
                    if len(parts) == 2: all_markets_data.append({'Market': market_title, 'Selection': parts[0], 'Odds': parts[1]})
                continue
            headers = [h.text for h in group.find_elements(By.CSS_SELECTOR, "ul[data-test-id] > li")]
            button_rows = group.find_elements(By.CSS_SELECTOR, ".buttonRow-zWMLOGu5YB")
            for row in button_rows:
                buttons = row.find_elements(By.TAG_NAME, 'button')
                if len(buttons) == len(headers):
                    for i, btn in enumerate(buttons):
                        parts = btn.text.split('\n')
                        if len(parts) == 2:
                            selection_name = f"{headers[i]} {parts[0]}"
                            all_markets_data.append({'Market': market_title, 'Selection': selection_name, 'Odds': parts[1]})
    except TimeoutException:
        logging.error(f"Could not load market data for URL: {game_url}")
    return pd.DataFrame(all_markets_data)

def to_slug(name):
    return re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')

# ==============================================================================
# STEP 4: MAIN DATA PIPELINE EXECUTION
# ==============================================================================
print("\n--- Starting Data Pipeline Execution ---")
if __name__ == "__main__" and api:
    DATASET_SLUG = "zachht/wnba-odds-history" 
    WORKING_DIR = "/kaggle/working"
    
    driver = None
    leagues_updated = []
    try:
        # CORRECT INITIALIZATION FOR UNDETECTED CHROMEDRIVER IN KAGGLE
        logging.info("Initializing Undetected ChromeDriver...")
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument('--disable-dev-shm-usage')
        
        driver = uc.Chrome(options=options)
        logging.info("Undetected ChromeDriver initialized successfully.")
        
        all_leagues_games = get_all_leagues_and_games(driver)

        if not all_leagues_games:
            logging.warning("Scraping finished, but no leagues were found on the site. Check debug files if they were created.")
        else:
            for league_name, new_main_lines_data in all_leagues_games.items():
                if not new_main_lines_data:
                    logging.info(f"No games found for league: {league_name}. Skipping.")
                    continue

                logging.info(f"\n--- Processing League: {league_name} ({len(new_main_lines_data)} games found) ---")
                leagues_updated.append(league_name)
                league_slug = to_slug(league_name)

                MAIN_CSV_PATH = os.path.join(WORKING_DIR, f"{league_slug}_main_lines.csv")
                DETAILED_CSV_PATH = os.path.join(WORKING_DIR, f"{league_slug}_detailed_odds.csv")

                try:
                    logging.info(f"Downloading existing files for {league_name}...")
                    api.dataset_download_file(DATASET_SLUG, file_name=os.path.basename(MAIN_CSV_PATH), path=WORKING_DIR)
                    api.dataset_download_file(DATASET_SLUG, file_name=os.path.basename(DETAILED_CSV_PATH), path=WORKING_DIR)
                    old_main_df = pd.read_csv(MAIN_CSV_PATH)
                    old_detailed_df = pd.read_csv(DETAILED_CSV_PATH)
                    logging.info("Successfully loaded existing data.")
                except Exception:
                    logging.warning(f"Could not load existing data for {league_name}. Starting with fresh history files.")
                    old_main_df, old_detailed_df = pd.DataFrame(), pd.DataFrame()

                scrape_timestamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
                new_main_df = pd.DataFrame(new_main_lines_data)
                new_main_df['timestamp'] = scrape_timestamp
                combined_main_df = pd.concat([old_main_df, new_main_df], ignore_index=True)
                
                all_detailed_dfs = []
                for game in new_main_lines_data:
                    detailed_df = scrape_detailed_game_odds(driver, game['game_link'])
                    if not detailed_df.empty:
                        detailed_df['matchup'] = f"{game['team1']} vs {game['team2']}"
                        all_detailed_dfs.append(detailed_df)
                
                if all_detailed_dfs:
                    new_detailed_df = pd.concat(all_detailed_dfs, ignore_index=True)
                    new_detailed_df['timestamp'] = scrape_timestamp
                    combined_detailed_df = pd.concat([old_detailed_df, new_detailed_df], ignore_index=True)
                    
                    logging.info(f"Saving combined data to local CSVs for {league_name}...")
                    combined_main_df.to_csv(MAIN_CSV_PATH, index=False)
                    combined_detailed_df.to_csv(DETAILED_CSV_PATH, index=False)
            
            if leagues_updated:
                logging.info("\n--- Finalizing and Uploading to Kaggle ---")
                metadata_path = os.path.join(WORKING_DIR, 'dataset-metadata.json')
                metadata = {"title": "Pinnacle Basketball Odds History", "id": DATASET_SLUG, "licenses": [{"name": "CC0-1.0"}]}
                with open(metadata_path, 'w') as f: json.dump(metadata, f)
                
                version_note = f"Automated odds update. Leagues updated: {', '.join(leagues_updated)}."
                logging.info(f"Pushing new dataset version. {version_note}")
                api.dataset_create_version(folder=WORKING_DIR, version_notes=version_note, quiet=False, dir_mode='zip')
            else:
                logging.warning("No games were found for any leagues. No new version will be pushed.")

    except Exception as e:
        logging.error(f"An error occurred during the main pipeline: {e}", exc_info=True)
    finally:
        if driver: driver.quit(); logging.info("Selenium driver closed.")

print("\n--- Data Pipeline Execution Finished ---")

--- Installing Python Dependencies ---

--- Setting up Kaggle API Authentication ---


2025-08-25 05:57:05,263 - INFO - Initializing Undetected ChromeDriver...


Kaggle API Authentication Successful.

--- Skipping manual Chrome installation to use the environment's default ---

--- Starting Data Pipeline Execution ---


2025-08-25 05:57:06,520 - INFO - patching driver executable /root/.local/share/undetected_chromedriver/undetected_chromedriver
