In [7]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np

In [2]:
team_urls_2023_2024 = [
    'https://understat.com/team/Atletico_Madrid/2023',
    'https://understat.com/team/Real_Madrid/2023',
    'https://understat.com/team/Barcelona/2023',
    'https://understat.com/team/Sevilla/2023',
    'https://understat.com/team/Villarreal/2023',
    'https://understat.com/team/Real_Betis/2023',
    'https://understat.com/team/Real_Sociedad/2023',
    'https://understat.com/team/Valencia/2023',
    'https://understat.com/team/Osasuna/2023',
    'https://understat.com/team/Alaves/2023',
    'https://understat.com/team/Espanyol/2023',
    'https://understat.com/team/Getafe/2023',
    'https://understat.com/team/Celta_Vigo/2023',
    'https://understat.com/team/Rayo_Vallecano/2023',
    'https://understat.com/team/Real_Valladolid/2023',
    'https://understat.com/team/Levante/2023',
    'https://understat.com/team/Elche/2023',
    'https://understat.com/team/Mallorca/2023',
    
    'https://understat.com/team/Atletico_Madrid/2024',
    'https://understat.com/team/Real_Madrid/2024',
    'https://understat.com/team/Barcelona/2024',
    'https://understat.com/team/Sevilla/2024',
    'https://understat.com/team/Villarreal/2024',
    'https://understat.com/team/Real_Betis/2024',
    'https://understat.com/team/Real_Sociedad/2024',
    'https://understat.com/team/Valencia/2024',
    'https://understat.com/team/Osasuna/2024',
    'https://understat.com/team/Alaves/2024',
    'https://understat.com/team/Espanyol/2024',
    'https://understat.com/team/Getafe/2024',
    'https://understat.com/team/Celta_Vigo/2024',
    'https://understat.com/team/Rayo_Vallecano/2024',
    'https://understat.com/team/Real_Valladolid/2024',
    'https://understat.com/team/Levante/2024',
    'https://understat.com/team/Elche/2024',
    'https://understat.com/team/Mallorca/2024'
]

In [4]:
matches_data = []

In [12]:
for team_url in team_urls_2023_2024:
    # Fetch the team's page
    response = requests.get(team_url)
    print(f"Fetching data for: {team_url}")

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch data for {team_url}")
        continue
        
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the script tags that contain the match data
    scripts = soup.find_all('script')
    if len(scripts) > 1:
        try:
            # Extract the JSON data from the script
            strings = scripts[1].string
            ind_start = strings.index("('") + 2
            ind_end = strings.index("')")
            json_data = strings[ind_start:ind_end]
            json_data = json_data.encode('utf8').decode('unicode_escape')

            # Convert to JSON format
            data = json.loads(json_data)

            # Extract and append the relevant match data
            for match in data:
                home_team = match['h']['title']
                away_team = match['a']['title']
                home_score = match['goals'].get('h', np.nan) if 'goals' in match else np.nan
                away_score = match['goals'].get('a', np.nan) if 'goals' in match else np.nan
                home_xg = match['xG'].get('h', np.nan) if 'xG' in match else np.nan
                away_xg = match['xG'].get('a', np.nan) if 'xG' in match else np.nan
                forecast_w = match['forecast'].get('w', np.nan) if 'forecast' in match else np.nan
                forecast_d = match['forecast'].get('d', np.nan) if 'forecast' in match else np.nan
                forecast_l = match['forecast'].get('l', np.nan) if 'forecast' in match else np.nan
                result = match.get('result', np.nan)

                # Append the match data with additional metrics to the list
                matches_data.append([
                    home_team, away_team, home_score, away_score, 
                    home_xg, away_xg, forecast_w, forecast_d, forecast_l, result
                ])
        except Exception as e:
            print(f"Error extracting match data for {team_url}: {e}")
    else:
        print(f"No match data found in the expected script for {team_url}")

df_matches = pd.DataFrame(matches_data, columns=[
    'Home Team', 'Away Team', 'Home Score', 'Away Score', 
    'Home xG', 'Away xG', 'Forecast (W)', 'Forecast (D)', 'Forecast (L)', 'Result'
])

matches = df_matches.drop_duplicates()

matches.fillna(np.nan, inplace=True)

matches = matches.dropna()

matches['Home Value'] = 1  # Home team gets 1

matches['Opp Team Value'] = 0
matches['Result Value'] = matches['Result'].apply(lambda x: 1 if x == 'w' else 0)

away_team_codes = {team: code for code, team in enumerate(matches['Away Team'].unique())}

matches['OppCode'] = matches['Away Team'].map(away_team_codes)

matches.to_csv('matchesDATA.csv', index=False)
print(matches)

Fetching data for: https://understat.com/team/Atletico_Madrid/2023
Fetching data for: https://understat.com/team/Real_Madrid/2023
Fetching data for: https://understat.com/team/Barcelona/2023
Fetching data for: https://understat.com/team/Sevilla/2023
Fetching data for: https://understat.com/team/Villarreal/2023
Fetching data for: https://understat.com/team/Real_Betis/2023
Fetching data for: https://understat.com/team/Real_Sociedad/2023
Fetching data for: https://understat.com/team/Valencia/2023
Fetching data for: https://understat.com/team/Osasuna/2023
Fetching data for: https://understat.com/team/Alaves/2023
Fetching data for: https://understat.com/team/Espanyol/2023
Fetching data for: https://understat.com/team/Getafe/2023
Fetching data for: https://understat.com/team/Celta_Vigo/2023
Fetching data for: https://understat.com/team/Rayo_Vallecano/2023
Fetching data for: https://understat.com/team/Real_Valladolid/2023
Fetching data for: https://understat.com/team/Levante/2023
Fetching dat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches.fillna(np.nan, inplace=True)
