# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
import os
import random
from urllib.parse import urljoin
import csv
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


def get_random_headers():
    """Return one randomly chosen set of browser headers.

    Header sets are read from ``browser_headers.json``, looked up next to this
    script (or in the current working directory when ``__file__`` is not
    defined, e.g. inside a notebook or interactive session). The file is
    expected to contain a non-empty JSON list of header dictionaries.

    Returns:
        dict: A randomly selected entry from the JSON list, or a built-in
        default header set when the file is missing, is not valid JSON, or
        does not hold a non-empty list.
    """
    fallback_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # __file__ does not exist in notebooks / the REPL; fall back to the CWD
    # instead of crashing with NameError.
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        script_dir = os.getcwd()
    headers_file = os.path.join(script_dir, 'browser_headers.json')

    try:
        with open(headers_file, 'r', encoding='utf-8') as f:
            headers_list = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing or corrupt file: use the built-in defaults.
        return fallback_headers

    # random.choice() raises on an empty list and misbehaves on non-lists.
    if not isinstance(headers_list, list) or not headers_list:
        return fallback_headers

    return random.choice(headers_list)


def _fetch_rendered_html(url, user_agent, wait_seconds=10):
    """Load *url* in headless Chrome and return the rendered page source.

    Waits up to *wait_seconds* for at least one match row to be present in
    the DOM before reading the page source; if none shows up in time the page
    is returned as-is. The browser session is always shut down, even when
    loading fails.
    """
    # Function-scope imports: these Selenium helpers are not part of the
    # file-level import block.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    chrome_options = Options()
    for flag in (
        "--headless",               # Run without opening a browser window
        "--no-sandbox",             # For stability in some environments
        "--disable-dev-shm-usage",  # Avoid resource issues
        "--disable-gpu",            # Additional stability
        "--disable-extensions",
        "--disable-logging",        # Reduce log noise
        "--log-level=3",            # Only fatal errors
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_argument(f"user-agent={user_agent}")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)
        # An implicit wait only affects element lookups, so it would not
        # delay reading page_source. Wait explicitly for a rendered row.
        try:
            WebDriverWait(driver, wait_seconds).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.match-row"))
            )
        except TimeoutException:
            print("⚠️ Timed out waiting for match rows; parsing page as-is")
        return driver.page_source
    finally:
        # Always release the browser, including on errors above.
        driver.quit()


def _build_soup(page_source):
    """Parse *page_source*, trying html.parser, then lxml, then html5lib.

    Returns the BeautifulSoup object from the first parser that succeeds, or
    None when every parser fails.
    """
    attempts = (
        ('html.parser', "⚠️ html.parser failed: "),
        ('lxml', "⚠️ lxml parser failed: "),
        ('html5lib', "❌ All parsers failed. html5lib error: "),
    )
    for parser_name, failure_prefix in attempts:
        try:
            return BeautifulSoup(page_source, parser_name)
        except Exception as parse_error:
            print(f"{failure_prefix}{parse_error}")
    return None


def _parse_match_row(match):
    """Extract one match dictionary from a match-row element.

    Returns None when the row has no teams container or is missing either
    team name. 'time' and 'game-id' are None when the row does not contain
    the corresponding elements.
    """
    # Reset per row so a row lacking these elements never inherits the
    # values of the previous row.
    time_text = None
    game_id = None

    left_team_cell = match.find(class_='m-table-cell left-team-cell')
    left_team_table = (
        left_team_cell.find(class_='left-team-table') if left_team_cell else None
    )
    if left_team_table:
        game_id_elem = left_team_table.find(class_='game-id')
        if game_id_elem:
            game_id_text = game_id_elem.get_text(strip=True)
            # Prefer a standalone 5-digit number; otherwise keep the raw text.
            game_id_match = re.search(r'\b\d{5}\b', game_id_text)
            game_id = game_id_match.group() if game_id_match else game_id_text

        time_elem = left_team_table.find(class_='clock-time')
        if time_elem:
            time_text = time_elem.get_text(strip=True)

    teams_container = match.find(class_='teams')
    if not teams_container:
        return None

    home_team_elem = teams_container.find(class_='home-team')
    away_team_elem = teams_container.find(class_='away-team')
    if not home_team_elem or not away_team_elem:
        return None

    home_team = home_team_elem.get_text(strip=True)
    away_team = away_team_elem.get_text(strip=True)

    # Use the container's title attribute when present, else build one.
    title = teams_container.get('title', f"{home_team} vs {away_team}")

    return {
        'time': time_text,
        'title': title,
        'game-id': game_id,
        'home-team': home_team,
        'away-team': away_team,
    }


def scrape_sb_today():
    """
    Scrapes SportyBet today's football matches and extracts match data.

    The page is rendered in headless Chrome, parsed with BeautifulSoup, and
    every match row is converted to a dictionary with the keys 'time',
    'title', 'game-id', 'home-team' and 'away-team'. Rows without team
    information are skipped; rows that raise while being parsed are reported
    and skipped.

    Returns:
        list[dict]: One dictionary per extracted match. An empty list is
        returned when the page cannot be fetched or parsed.
    """
    url = "https://www.sportybet.com/ng/sport/football/today"

    # Only the User-Agent can be passed to Chrome via a command-line flag.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

    try:
        page_source = _fetch_rendered_html(url, user_agent)

        # Strip chromedriver/.wdm path fragments from the markup before
        # parsing. NOTE(review): presumably these leaked into the page source
        # and broke parsing at some point — confirm this is still needed.
        page_source = re.sub(r'/[^<>]*?\.wdm/[^<>]*?chromedriver[^<>]*?', '', page_source)
        page_source = re.sub(r'\[[^<>\[\]]*?chromedriver[^<>\[\]]*?\]', '', page_source)

        soup = _build_soup(page_source)
        if soup is None:
            return []

        # Find all matches with the correct class structure
        matches = soup.find_all('div', class_='m-table-row m-content-row match-row')
        print(f"Found {len(matches)} events for today")

        extracted_data = []
        for match in matches:
            try:
                match_data = _parse_match_row(match)
            except Exception as e:
                # Best effort: one malformed row must not abort the scrape.
                print(f"⚠️ Error processing match: {e}")
                continue
            if match_data is not None:
                extracted_data.append(match_data)

        print("\n📊 Summary:")
        print(f"   - Total events found: {len(matches)}")

        return extracted_data

    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching data: {e}")
        return []
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        return []


def save_to_csv(data, filename=None):
    """
    Write the extracted rows to a CSV file.

    Args:
        data: Rows to save, in any form accepted by ``pandas.DataFrame``.
            A falsy value (None, empty list) means there is nothing to save.
        filename: Destination path. When None, a name of the form
            ``sb_save_<dd-mm-yy-HH-MM-SS>.csv`` is generated from the
            current local time.

    Returns:
        tuple: ``(True, filename)`` on success, ``(False, None)`` when there
        is no data or writing the file fails.
    """
    # Guard clause: nothing to write.
    if not data:
        print("❌ No data to save")
        return False, None

    if filename is None:
        # Build the default name from the current timestamp.
        stamp = datetime.now().strftime('%d-%m-%y-%H-%M-%S')
        filename = f"sb_save_{stamp}.csv"

    try:
        frame = pd.DataFrame(data)
        frame.to_csv(filename, index=False)
        print(f"💾 Data saved to {filename}")
    except Exception as err:
        print(f"❌ Error saving to CSV: {err}")
        return False, None
    return True, filename


def main():
    """
    Entry point: scrape today's matches and store them in a CSV file.

    Prints a status line for each stage and returns None in every case.
    """
    print("🚀 Starting sb today scraper...")

    matches_data = scrape_sb_today()

    # Stop early when the scrape produced nothing.
    if not matches_data:
        print("❌ No data was scraped")
        return

    success, filename = save_to_csv(matches_data)

    if not success:
        print("❌ Failed to save data to CSV")
        return

    print(f"✅ Successfully scraped and saved {len(matches_data)} matches to {filename}")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

# Output captured from a run of the cell above:
# 🚀 Starting sb today scraper...
# Found 47 events for today
#
# 📊 Summary:
#    - Total events found: 47
# 💾 Data saved to sb_today_02-09-25-19-37-29.csv
# ✅ Successfully scraped and saved 47 matches to sb_today_02-09-25-19-37-29.csv
