In [20]:
import pandas as pd

In [46]:
import pandas as pd
import soccerdata as sd
from pathlib import Path

class FBREFAPI:
    """Collect FBref player season statistics through ``soccerdata`` and save them as CSV.

    Parameters
    ----------
    years : iterable
        Season identifiers, passed one at a time to ``sd.FBref(seasons=[year])``.
    list_stats : sequence of str
        ``stat_type`` values passed to ``read_player_season_stats``.
    league : str
        League identifier passed to ``sd.FBref(leagues=[league])``.
    save_dir : str or pathlib.Path
        Directory that receives the CSV files. It is created on demand.
    """

    # Columns used to line up the same player-row across stat tables.
    # Only the ones present in both frames being merged are used.
    # NOTE(review): presumably these are the index levels soccerdata returns
    # for player season stats -- confirm against the soccerdata docs.
    _KEY_COLUMNS = ('league', 'season', 'team', 'player')

    def __init__(self, years, list_stats, league, save_dir):
        self.years = years
        self.list_stats = list_stats
        self.league = league
        self.save_dir = Path(save_dir)  # Convert to Path object

    def individual_stats(self, year):
        """Collect individual player statistics for one season.

        Every stat table in ``self.list_stats`` is read, its two-level column
        labels are flattened to ``"<level0>_<level1>"`` (or just ``<level0>``
        when the second level is empty), goalkeepers (``pos == "GK"``) are
        removed, and the columns not seen so far are left-joined onto the
        running result.

        Parameters
        ----------
        year
            Season identifier forwarded to ``sd.FBref``.

        Returns
        -------
        pandas.DataFrame
            One row per ``(player, season)`` with the columns of all stat tables.
        """
        fbref = sd.FBref(leagues=[self.league], seasons=[year])
        total_others = pd.DataFrame()

        for i, stat in enumerate(self.list_stats):
            pl = fbref.read_player_season_stats(stat_type=stat)
            # Only flatten real multi-level headers; iterating a plain string
            # label would join its characters with underscores.
            if isinstance(pl.columns, pd.MultiIndex):
                pl.columns = ['_'.join(col).strip() if col[1] else col[0]
                              for col in pl.columns.values]
            pl = pl.reset_index()
            others = pl[pl['pos'] != "GK"]

            if i == 0:
                total_others = others
            else:
                # Join on every identifying column both frames share. Joining on
                # 'player' alone is many-to-many whenever a name occurs on more
                # than one row (several teams in a season, or namesakes), which
                # multiplies rows and pairs stats by row order instead of by team.
                keys = [col for col in self._KEY_COLUMNS
                        if col in total_others.columns and col in others.columns]
                new_columns = [col for col in others.columns
                               if col not in total_others.columns]
                others = others[keys + new_columns]
                total_others = total_others.merge(others, how='left', on=keys)
                total_others = total_others[total_others['team'].notna()]

            print("Data Collected:", stat)

        # Keep the first row per (player, season), as the original code did.
        # NOTE: a name that appears on several rows in one season therefore
        # keeps only its first row.
        total_others = total_others.drop_duplicates(subset=['player', 'season']).reset_index(drop=True)
        print("All Individual Data Collected for", year)
        return total_others

    def save_all(self):
        """Write one ``fbref_<year>.csv`` per season into ``self.save_dir``."""
        # Write under the configured directory, not a machine-specific path.
        self.save_dir.mkdir(parents=True, exist_ok=True)

        for year in self.years:
            df = self.individual_stats(year)
            df.to_csv(self.save_dir / f"fbref_{year}.csv", index=False)
            print(f"✓ {year} data ({len(df)} rows) saved")
            del df

    def combine_total(self, overwrite=False):
        """Combine all yearly data into one total file.

        Seasons are streamed one at a time into ``<save_dir>/fbref_second.csv``.
        If that file already exists and is not empty, new rows are appended
        under its existing header, so running this twice duplicates rows
        unless ``overwrite=True``.

        Parameters
        ----------
        overwrite : bool, default False
            When true, delete an existing output file first and start afresh.
        """
        self.save_dir.mkdir(parents=True, exist_ok=True)
        output_path = self.save_dir / "fbref_second.csv"

        if overwrite and output_path.exists():
            output_path.unlink()

        # Column order of the file being written. None means the header has not
        # been written yet (a 0-byte file counts as having no header).
        header_cols = None
        if output_path.exists() and output_path.stat().st_size > 0:
            header_cols = list(pd.read_csv(output_path, nrows=0).columns)

        for year in self.years:
            df = self.individual_stats(year)
            write_header = header_cols is None

            if write_header:
                header_cols = list(df.columns)
            elif list(df.columns) != header_cols and df.columns.is_unique:
                # Rows are appended without a header, so a different column
                # order or set would silently land values under the wrong name.
                missing = [col for col in header_cols if col not in df.columns]
                extra = [col for col in df.columns if col not in header_cols]
                if missing or extra:
                    print(f"! {year} columns differ from header "
                          f"(missing: {missing}, dropped: {extra})")
                df = df.reindex(columns=header_cols)

            df.to_csv(output_path, mode='w' if write_header else 'a',
                      header=write_header, index=False)
            print(f"✓ {year} data ({len(df)} rows) appended")
            del df



In [22]:
import warnings
# Silence every warning category for the rest of the kernel session.
warnings.simplefilter('ignore')

In [48]:
# Usage: download every configured season and stream it into one combined CSV.

list_stats = [
    "standard",
    "shooting",
    "passing",
    "passing_types",
    "goal_shot_creation",
    "defense",
    "possession",
    "playing_time",
    "misc",
]
# Season codes 1718 .. 2526: two-digit start year followed by two-digit end year.
years = [start * 100 + start + 1 for start in range(17, 26)]
league = 'Big 5 European Leagues Combined'

SAVE_DIR = "./data/raw/"
fbref_api = FBREFAPI(
    years=years,
    list_stats=list_stats,
    league=league,
    save_dir=SAVE_DIR,
)
fbref_api.combine_total()

Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 1718
✓ 1718 data (2359 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 1819
✓ 1819 data (2348 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 1920
✓ 1920 data (2401 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 2021
✓ 2021 data (2493 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 2122
✓ 2122 data (2572 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 2223
✓ 2223 data (2515 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 2324
✓ 2324 data (2502 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 2425
✓ 2425 data (2494 rows) appended


Data Collected: standard
Data Collected: shooting
Data Collected: passing
Data Collected: passing_types
Data Collected: goal_shot_creation
Data Collected: defense
Data Collected: possession
Data Collected: playing_time
Data Collected: misc
All Individual Data Collected for 2526
✓ 2526 data (2225 rows) appended


In [49]:
# Load the combined CSV that FBREFAPI.combine_total() writes under SAVE_DIR.
df = pd.read_csv("./data/raw/fbref_second.csv")

In [51]:
# Sanity check: list the distinct season codes present in the combined file.
df["season"].unique()

array([1718, 1819, 1920, 2021, 2122, 2223, 2324, 2425, 2526])