In [2]:
import pandas as pd
import time
from pathlib import Path
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

import warnings
import pandas as pd
import sys
from pathlib import Path
warnings.filterwarnings("ignore")


In [3]:
# Notebook-level Chrome configuration.
# NOTE(review): SoFIFAScraper.initialize_driver builds its own Options object
# with the same settings, so this global looks unused in the visible cells —
# confirm before removing.
chrome_options = Options()

# Command-line switches, appended in the same order as before:
# spoofed desktop user agent first (bot-detection workaround), then the
# performance / automation-masking flags.
chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-extensions")

# Experimental options are stored separately from the argument list, so
# setting them after the switches yields the same final configuration.
chrome_options.add_experimental_option(
    "excludeSwitches", ["enable-logging", "enable-automation"]
)
chrome_options.add_experimental_option("detach", True)


In [None]:
# League name -> numeric league id, grouped by tier.
# NOTE(review): the LEAGUES query string defined in a later cell does not list
# exactly these ids (16 and 39 are absent there, while 83 and 2012 appear
# there but not here) — confirm which set is the intended one.
IDs = dict(
    # ---- Tier 1: top European leagues ----
    Premier_League=13,   # England
    La_Liga=53,          # Spain
    Serie_A=31,          # Italy
    Bundesliga=19,       # Germany
    Ligue_1=16,          # France
    # ---- Tier 2: second divisions & competitive leagues ----
    Championship=14,     # England (2nd tier)
    La_Liga_2=54,        # Spain (2nd tier)
    Eredivisie=10,       # Netherlands
    Pro_League=4,        # Belgium
    # ---- Tier 3: other leagues ----
    MLS=39,              # United States
)

In [11]:
# League filter fragment of the query string: one `lg[]=<id>` pair per league,
# with the brackets percent-encoded as %5B%5D and pairs joined by '&'.
# NOTE(review): these ids differ from the values in the IDs dict above
# (83 and 2012 are not listed there; 16 and 39 are missing here) — confirm.
LEAGUES = "&".join(
    f"lg%5B%5D={league_id}"
    for league_id in (13, 31, 53, 19, 14, 54, 10, 4, 83, 2012)
)

In [None]:
# Listing URL for sofifa.com: the LEAGUES filter fragment is spliced into the
# query string, followed by one `showCol[]=<code>` pair (brackets encoded as
# %5B%5D) per column to display. `showCol%5B%5D=pi` appears both before and
# after the league filter. SoFIFAScraper.get_player_stats appends
# `&r=<season_code>&set=true&offset=<n>` to this value for each request.
BASE_URL = f"""
https://sofifa.com/?&showCol%5B%5D=pi&{LEAGUES}&showCol%5B%5D=oa&showCol%5B%5D=pt&showCol%5B%5D=tt&showCol%5B%5D=pi&showCol%5B%5D=by&showCol%5B%5D=hi&showCol%5B%5D=wi&showCol%5B%5D=pf&showCol%5B%5D=bo&showCol%5B%5D=bp&showCol%5B%5D=gu&showCol%5B%5D=jt&showCol%5B%5D=le&showCol%5B%5D=ta&showCol%5B%5D=cr&showCol%5B%5D=he&showCol%5B%5D=sh&showCol%5B%5D=vo&showCol%5B%5D=fi&showCol%5B%5D=ts&showCol%5B%5D=cu&showCol%5B%5D=dr&showCol%5B%5D=lo&showCol%5B%5D=bl&showCol%5B%5D=fr&showCol%5B%5D=ac&showCol%5B%5D=ag&showCol%5B%5D=re&showCol%5B%5D=sp&showCol%5B%5D=ba&showCol%5B%5D=to&showCol%5B%5D=tp&showCol%5B%5D=so&showCol%5B%5D=st&showCol%5B%5D=sr&showCol%5B%5D=ln&showCol%5B%5D=ju&showCol%5B%5D=ar&showCol%5B%5D=vi&showCol%5B%5D=po&showCol%5B%5D=pe&showCol%5B%5D=cm&showCol%5B%5D=in&showCol%5B%5D=te&showCol%5B%5D=sa&showCol%5B%5D=sl&showCol%5B%5D=ma&showCol%5B%5D=td&showCol%5B%5D=tg&showCol%5B%5D=gc&showCol%5B%5D=gh&showCol%5B%5D=gd&showCol%5B%5D=gr&showCol%5B%5D=gp&showCol%5B%5D=bs&showCol%5B%5D=wk&showCol%5B%5D=aw&showCol%5B%5D=ir&showCol%5B%5D=bt&showCol%5B%5D=dw&showCol%5B%5D=pac&showCol%5B%5D=sk&showCol%5B%5D=sho&showCol%5B%5D=pas&showCol%5B%5D=dri&showCol%5B%5D=def&showCol%5B%5D=phy&showCol%5B%5D=t1&showCol%5B%5D=ps1&showCol%5B%5D=ps2&showCol%5B%5D=tc&showCol%5B%5D=hc&showCol%5B%5D=t2&showCol%5B%5D=cp&showCol%5B%5D=at&showCol%5B%5D=wg&showCol%5B%5D=vl&showCol%5B%5D=rc&showCol%5B%5D=cj"""

# The triple-quoted literal starts with a newline; strip it so the value is a
# clean single-line URL.
BASE_URL = BASE_URL.strip()

In [31]:
# Roster snapshot label -> numeric roster id, newest first (roughly one
# snapshot per month). The id is what the scraper sends as the `r=` query
# parameter and uses in the output file name.
roster_dates = dict([
    # ("Sep 11, 2023", 230054),
    ("Aug 11, 2023", 230050),
    ("Jul 17, 2023", 230045),
    ("Jun 19, 2023", 230040),
    ("May 16, 2023", 230034),
    ("Apr 17, 2023", 230028),
    ("Mar 24, 2023", 230021),
    ("Feb 22, 2023", 230015),
    ("Jan 18, 2023", 230010),
    ("Dec 17, 2022", 230008),
    ("Nov 16, 2022", 230006),
    ("Oct 7, 2022", 230004),
    ("Sep 1, 2022", 230001),
])


In [32]:
class SoFIFAScraper:
    """Scrape paginated player tables with Selenium and save one CSV per roster date.

    For every ``{roster_date: season_code}`` entry the scraper requests
    ``base_url`` page by page (advancing the ``offset`` query parameter),
    parses the first ``<table>`` on each page with BeautifulSoup and collects
    the cell text into a pandas DataFrame.
    """

    def __init__(self, seasons, base_url, save_dir,
                 page_size=60, page_delay=3, wait_timeout=20):
        """
        Args:
            seasons: Dictionary ``{date_string: season_code}``.
            base_url: Listing URL; ``&r=...&set=true&offset=...`` is appended
                to it for each request.
            save_dir: Output directory (``str`` or ``Path``).
            page_size: Amount the ``offset`` parameter advances per request.
            page_delay: Seconds to sleep after each page load.
            wait_timeout: Seconds to wait for a ``<table>`` to appear before
                treating the listing as exhausted.
        """
        self.seasons = seasons  # Dictionary: {date_string: season_code}
        self.base_url = base_url
        # Coerce so that a plain string path works with .mkdir() and the "/" operator.
        self.save_dir = Path(save_dir)
        self.page_size = page_size
        self.page_delay = page_delay
        self.wait_timeout = wait_timeout

    def initialize_driver(self):
        """Create and return a configured Chrome WebDriver."""
        chrome_options = Options()

        chrome_options.add_experimental_option("detach", True)
        chrome_options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])

        # Add user agent to avoid bot detection
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

        # Performance optimizations
        for flag in (
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-blink-features=AutomationControlled',
            '--disable-extensions',
        ):
            chrome_options.add_argument(flag)

        # "eager": driver.get() returns without waiting for every sub-resource;
        # get_player_stats waits explicitly for the table instead.
        chrome_options.page_load_strategy = "eager"

        driver = webdriver.Chrome(options=chrome_options)
        print("Chrome WebDriver initialized")

        return driver

    def get_player_stats(self, driver, season_code, roster_date):
        """
        Scrape player stats for a given FIFA roster date.

        Args:
            driver: Selenium WebDriver instance
            season_code: SoFIFA season code (e.g., 230054)
            roster_date: Roster date string (e.g., "Sep 11, 2023")

        Returns:
            DataFrame with all players for the roster date, plus
            ``roster_date`` and ``season_code`` columns. If nothing could be
            scraped (e.g. the very first request fails) the frame is empty
            rather than an exception being raised.
        """
        data_list = []
        headers = None  # filled from the first page that has a table
        offset = 0

        while True:
            try:
                url = f"{self.base_url}&r={season_code}&set=true&offset={offset}"
                driver.get(url)
                time.sleep(self.page_delay)

                WebDriverWait(driver, self.wait_timeout).until(
                    EC.presence_of_element_located((By.TAG_NAME, "table"))
                )

            except TimeoutException:
                # No table within the timeout is treated as "past the last
                # page". A genuine timeout mid-scrape ends the loop the same
                # way, so the result may be truncated in that case.
                print(f"  ✓ Scraping complete at offset {offset}")
                break
            except Exception as e:
                # Best effort: keep whatever was collected so far.
                print(f"  ✗ Error at offset {offset}: {e}")
                break

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')

            if table is None:
                print("  ✗ ERROR: No table found!")
                break

            # Get headers only once
            if headers is None:
                headers = [th.get_text(strip=True) for th in table.select("thead th")]

            # Extract rows
            page_rows = []
            for row in table.select("tbody tr"):
                cols = row.find_all(['th', 'td'])
                if cols:
                    page_rows.append([col.get_text(strip=True) for col in cols])

            # A table with no data rows means there is nothing more to read;
            # without this guard such a page would be requested forever.
            if not page_rows:
                print(f"  ✓ Scraping complete at offset {offset} (empty page)")
                break

            data_list.extend(page_rows)
            offset += self.page_size

            if offset % (self.page_size * 10) == 0:
                print(f"  → {len(data_list)} players scraped...")

        # Create DataFrame (columns=None lets pandas cope with "no headers seen")
        df = pd.DataFrame(data_list, columns=headers or None)

        # Add metadata
        df['roster_date'] = roster_date
        df['season_code'] = season_code

        print(f"  ✓ Total players collected: {len(df)}")
        return df

    def close_driver(self, driver):
        """Close the WebDriver."""
        driver.quit()
        print("Chrome WebDriver closed")

    def scrape_all_separate(self):
        """Scrape all roster dates and save each to a separate CSV file.

        Files are written to ``save_dir`` as ``players_<season_code>.csv``.
        A roster date that yields no rows is reported and skipped instead of
        producing a header-only file. The driver is always closed, even if
        scraping raises.
        """
        # Ensure directory exists
        self.save_dir.mkdir(parents=True, exist_ok=True)

        driver = self.initialize_driver()

        total_rows = 0
        saved_files = []
        total_dates = len(self.seasons)

        print(f"\n{'='*60}")
        print(f"Starting SoFIFA Player Data Collection")
        print(f"Total roster dates to scrape: {total_dates}")
        print(f"Save location: {self.save_dir}")
        print(f"{'='*60}\n")

        try:
            for completed, (roster_date, season_code) in enumerate(self.seasons.items(), start=1):
                print(f"\n[{completed}/{total_dates}] Scraping: {roster_date} (code: {season_code})")
                print(f"{'-'*60}")

                # Clear cookies between seasons
                driver.delete_all_cookies()

                df = self.get_player_stats(driver, season_code, roster_date)

                if df.empty:
                    # Do not create (or overwrite) a CSV with no player rows.
                    print(f"  ✗ No rows collected for {roster_date}; skipping CSV write")
                    continue

                # Create filename from season code
                output_path = self.save_dir / f"players_{season_code}.csv"

                # Write to separate CSV file
                df.to_csv(output_path, index=False)

                total_rows += len(df)
                saved_files.append(output_path.name)
                print(f"  ✓ Saved {len(df)} rows to: {output_path.name}")

        finally:
            self.close_driver(driver)

        print(f"\n{'='*60}")
        print(f"✓ SCRAPING COMPLETE")
        print(f"  • Total roster dates: {total_dates}")
        print(f"  • Total players collected: {total_rows}")
        print(f"  • Total files created: {len(saved_files)}")
        print(f"  • Saved to: {self.save_dir}")
        print(f"{'='*60}\n")


In [33]:
# Locate the project root: two levels above this file when run as a script,
# otherwise (e.g. inside a notebook, where __file__ is undefined) two levels
# above the current working directory.
if '__file__' in globals():
    project_root = Path(__file__).resolve().parents[2]
else:
    project_root = Path.cwd().parents[1]

# CSV output folder for the scraped roster snapshots.
save_directory = project_root.joinpath("data", "player_data_train")

# Build the scraper from the roster table and listing URL defined in earlier cells.
scraper = SoFIFAScraper(seasons=roster_dates, base_url=BASE_URL, save_dir=save_directory)

# Echo the resolved configuration, one item per line.
print(
    f"Project root: {project_root}",
    f"Save directory: {save_directory}",
    f"Number of seasons to scrape: {len(roster_dates)}",
    sep="\n",
)


Project root: /Users/lionlucky7/01.Projects/In-progress/soccer_prediction/2026_world_cup
Save directory: /Users/lionlucky7/01.Projects/In-progress/soccer_prediction/2026_world_cup/data/player_data_train
Number of seasons to scrape: 12


In [34]:
# Run the scraper: opens a Chrome WebDriver, iterates over every entry in
# roster_dates and writes each one to its own `players_<season_code>.csv`
# inside save_directory. The driver is closed when the run ends.
scraper.scrape_all_separate()


Chrome WebDriver initialized

Starting SoFIFA Player Data Collection
Total roster dates to scrape: 12
Save location: /Users/lionlucky7/01.Projects/In-progress/soccer_prediction/2026_world_cup/data/player_data_train


[1/12] Scraping: Aug 11, 2023 (code: 230050)
------------------------------------------------------------
  → 600 players scraped...
  → 1200 players scraped...
  → 1800 players scraped...
  → 2400 players scraped...
  → 3000 players scraped...
  → 3600 players scraped...
  → 4200 players scraped...
  → 4800 players scraped...
  → 5400 players scraped...
  ✓ Scraping complete at offset 5580
  ✓ Total players collected: 5539
  ✓ Saved 5539 rows to: players_230050.csv

[2/12] Scraping: Jul 17, 2023 (code: 230045)
------------------------------------------------------------
  → 600 players scraped...
  → 1200 players scraped...
  → 1800 players scraped...
  → 2400 players scraped...
  → 3000 players scraped...
  → 3600 players scraped...
  → 4200 players scraped...
  → 4800 p