In [None]:
mls_teams_fbref = {
    "Colorado-Rapids": "415b4465",
    "Columbus-Crew": "529ba333",
    "dc-united": "44117292",
    "fc-dallas": "15cf8f40",
    "houston-dynamo": "0d885416",
    "Sporting-Kansas": "4acb0537",
    "la-galaxy": "d8b46897",
    "los-angeles-fc": "e0f8cfb5",
    "inter-miami": "cb8b86a2",
    "minnesota-united": "99ea75a6",
    "CF-Montreal": "fc22273c",
    "Nashville-SC": "35f1b818",
    "New-England-Revolution": "3c079def",
    "New-York-City-FC": "64e81410",
    "new-york-red-bulls": "69a0fb10",
    "orlando-city": "46ef01d0",
    "Philadelphia-Union": "46024eeb",
    "Portland-Timbers": "076914e",
    "salt-lake": "b78d3892",
    "san-jose": "f31f1091",
    "seattle": "c2d4c03c",
    "st-louis-city": "cdde63bc",
    "toronto": "59802b6c",
    "vancouver": "fdffd0d3"
}


In [1]:
import os
import pandas as pd
import time
import random
import requests

# Base URL for FBref
base_url = "https://fbref.com/en/squads"

mls_teams_fbref = {
    "los-angeles-fc": "81d817a3",
    "Portland-Timbers": "d076914e"
}
# List of User-Agent strings for rotation
user_agents = [
    #"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36 Edg/117.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36",
    #"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0"
]

# Base directory for saving data
base_folder = "MLS_2024_Data"
os.makedirs(base_folder, exist_ok=True)

# Function to fetch and save tables
def fetch_table_with_backoff(team_name, team_id, endpoint, description, team_folder, is_fixture=False):
    # Construct URL
    if is_fixture:
        url = f"{base_url}/{team_id}/2024/matchlogs/c22/schedule/{team_name}-Scores-and-Fixtures-Major-League-Soccer"
    else:
        url = f"{base_url}/{team_id}/2024/matchlogs/c22/{endpoint}/{team_name}-Match-Logs-Major-League-Soccer"
    
    backoff = 5  # Start with a 5-second delay
    max_backoff = 30  # Maximum backoff delay
    for attempt in range(5):  # Retry up to 5 times
        try:
            # Rotate User-Agent for each request
            headers = {"User-Agent": random.choice(user_agents)}
            print(f"Fetching: {url} (Attempt {attempt + 1}) with User-Agent: {headers['User-Agent']}")
            
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Raise an error for bad HTTP status codes
            
            # Parse table using pandas
            tables = pd.read_html(response.text)
            table = tables[0]
            if table.columns.nlevels > 1:  # Drop multi-level headers if present
                table.columns = table.columns.droplevel(0)
            
            # Save table as CSV
            csv_path = os.path.join(team_folder, f"{description.replace(' ', '_').lower()}.csv")
            table.to_csv(csv_path, index=False)
            print(f"Saved: {description} for {team_name} to {csv_path}")
            return
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:  # Handle Too Many Requests
                print(f"429 Too Many Requests: Retrying in {backoff} seconds...")
                time.sleep(backoff)
                backoff = min(backoff * 2, max_backoff)  # Exponential backoff, capped at 30 seconds
            else:
                print(f"HTTP Error: {e}")
                break
        except Exception as e:
            print(f"Error fetching {description} for {team_name}: {e}")
            break

# Iterate through teams and save their data
for team_name, team_id in mls_teams_fbref.items():
    # Create a folder for the team
    team_folder = os.path.join(base_folder, team_name)
    os.makedirs(team_folder, exist_ok=True)
    
    # Fetch data with randomized delays and backoff
    fetch_table_with_backoff(team_name, team_id, "schedule", "Scores and Fixtures", team_folder, is_fixture=True)
    time.sleep(random.randint(30, 60))  # Random delay between 10 and 20 seconds
    fetch_table_with_backoff(team_name, team_id, "shooting", "Shooting Data", team_folder)
    time.sleep(random.randint(30, 60))
    fetch_table_with_backoff(team_name, team_id, "keeper", "Keeper Data", team_folder)
    time.sleep(random.randint(30, 60))
    fetch_table_with_backoff(team_name, team_id, "passing", "Passing Data", team_folder)
    time.sleep(random.randint(30, 60))
    fetch_table_with_backoff(team_name, team_id, "passing_types", "Passing Types Data", team_folder)
    time.sleep(random.randint(30, 60))
    fetch_table_with_backoff(team_name, team_id, "gca", "Goals and Shots Data", team_folder)
    time.sleep(random.randint(30, 60))
    fetch_table_with_backoff(team_name, team_id, "defense", "Defense Data", team_folder)
    time.sleep(random.randint(30, 60))
    fetch_table_with_backoff(team_name, team_id, "possession", "Possession Data", team_folder)
    time.sleep(random.randint(30, 60))
    fetch_table_with_backoff(team_name, team_id, "misc", "Miscellaneous Data", team_folder)
    time.sleep(random.randint(30, 60))



Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/schedule/los-angeles-fc-Scores-and-Fixtures-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36
Saved: Scores and Fixtures for los-angeles-fc to MLS_2024_Data\los-angeles-fc\scores_and_fixtures.csv


  tables = pd.read_html(response.text)


Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/shooting/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36
Saved: Shooting Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\shooting_data.csv


  tables = pd.read_html(response.text)


Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/keeper/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Keeper Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\keeper_data.csv
Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/passing/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Passing Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\passing_data.csv
Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/passing_types/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Passing Types Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\passing_types_data.csv
Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/gca/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Goals and Shots Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\goals_and_shots_data.csv
Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/defense/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Defense Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\defense_data.csv
Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/possession/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Possession Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\possession_data.csv
Fetching: https://fbref.com/en/squads/81d817a3/2024/matchlogs/c22/misc/los-angeles-fc-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Miscellaneous Data for los-angeles-fc to MLS_2024_Data\los-angeles-fc\miscellaneous_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/schedule/Portland-Timbers-Scores-and-Fixtures-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Scores and Fixtures for Portland-Timbers to MLS_2024_Data\Portland-Timbers\scores_and_fixtures.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/shooting/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Shooting Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\shooting_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/keeper/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Keeper Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\keeper_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/passing/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Passing Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\passing_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/passing_types/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Passing Types Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\passing_types_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/gca/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Goals and Shots Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\goals_and_shots_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/defense/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Defense Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\defense_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/possession/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Possession Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\possession_data.csv
Fetching: https://fbref.com/en/squads/d076914e/2024/matchlogs/c22/misc/Portland-Timbers-Match-Logs-Major-League-Soccer (Attempt 1) with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0 Safari/537.36


  tables = pd.read_html(response.text)


Saved: Miscellaneous Data for Portland-Timbers to MLS_2024_Data\Portland-Timbers\miscellaneous_data.csv


In [None]:
"""
# fbref table link
url_df = 'https://fbref.com/en/squads/1ebc1a5b/Atlanta-United-Stats'
matchlog = pd.read_html(url_df)
#player states
matchlog0 = matchlog[0]
matchlog0.columns = matchlog0.columns.droplevel(0)
display(matchlog0.head())

#scores
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/schedule/Atlanta-United-Scores-and-Fixtures-Major-League-Soccer'
scores = pd.read_html(match_fixtures_url)
scores0 = scores[0]
display(scores0.head())

#shooting
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/shooting/Atlanta-United-Match-Logs-Major-League-Soccer'
shooting = pd.read_html(match_fixtures_url)
shooting0 = shooting[0]
shooting0.columns = shooting0.columns.droplevel(0)
display(shooting0.head())

#keeper
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/keeper/Atlanta-United-Match-Logs-Major-League-Soccer'
keeper = pd.read_html(match_fixtures_url)
keeper0 = keeper[0]
keeper0.columns = keeper0.columns.droplevel(0)
display(keeper0.head())

#passing_types
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/passing/Atlanta-United-Match-Logs-Major-League-Soccer'
passing = pd.read_html(match_fixtures_url)
passing0 = passing[0]
passing0.columns = passing0.columns.droplevel(0)
display(passing0.head())

#passing_types
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/passing_types/Atlanta-United-Match-Logs-Major-League-Soccer'
pass_type = pd.read_html(match_fixtures_url)
pass_type0 = pass_type[0]
pass_type0.columns = pass_type0.columns.droplevel(0)
display(pass_type0.head())

#goals/shots
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/gca/Atlanta-United-Match-Logs-Major-League-Soccer'
gca = pd.read_html(match_fixtures_url)
gca0 = gca[0]
gca0.columns = gca0.columns.droplevel(0)
display(gca0.head())

#defense
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/defense/Atlanta-United-Match-Logs-Major-League-Soccer'
defense = pd.read_html(match_fixtures_url)
defense0 = defense[0]
defense0.columns = defense0.columns.droplevel(0)
display(defense0.head())

#possession
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/possession/Atlanta-United-Match-Logs-Major-League-Soccer'
poss = pd.read_html(match_fixtures_url)
poss0 = defense[0]
poss0.columns = poss0.columns.droplevel(0)
display(poss0.head())

#misc
match_fixtures_url = 'https://fbref.com/en/squads/1ebc1a5b/2024/matchlogs/c22/misc/Atlanta-United-Match-Logs-Major-League-Soccer'
misc = pd.read_html(match_fixtures_url)
misc0 = defense[0]
misc0.columns = misc0.columns.droplevel(0)
display(misc0.head())
"""