In [1]:
import ast
import warnings

import numpy as np
import pandas as pd
from matplotlib.patches import Arc
from matplotlib.patches import Polygon as mpl_Polygon
from matplotlib.patches import Wedge
from shapely.geometry import Point, Polygon as shp_Polygon
from statsbombpy import sb

# Blanket filter: every warning category is silenced from this point on.
# NOTE(review): this also hides warnings raised by pandas/NumPy in later cells -
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

---

### Creating datasets for my_xG (Python) or football_dashboard (Power BI)

Here, the file **equal_dataframe.csv** is created with balanced data for women and men from the competitions listed below.  
This dataset is used in the **my_xG** project.

Alternatively, a file **powerbi_dataset.csv** may be created. It contains **all available event data from StatsBomb** (via `statsbombpy`) and is used in the **football_dashboard** project (Power BI).

To select which dataset to generate, set the variable `powerbi` to `False` (for Python) or `True` (for Power BI).

---

In [None]:
# Output selector read by the cells below:
#   False -> equal_dataframe.csv is written (my_xG project)
#   True  -> powerbi_dataset.csv is written (football_dashboard project)

powerbi = False

In [3]:
# Picks the [competition_name, season_name] pairs whose matches are loaded later

if not powerbi:

    # Fixed selection of women's and men's competitions used for my_xG
    competition_list = [
        ['FIFA World Cup', '2022'],
        ['FIFA World Cup', '2018'],
        ["Women's World Cup", '2023'],
        ["Women's World Cup", '2019'],
        ["UEFA Women's Euro", '2022'],
        ['La Liga', '2015/2016'],
        ['NWSL', '2018'],
        ["FA Women's Super League", '2020/2021'],
        ["FA Women's Super League", '2019/2020'],
        ["FA Women's Super League", '2018/2019'],
    ]

else:

    # Power BI export: every competition/season pair returned by sb.competitions()
    competitions_df = sb.competitions()

    name_and_season = competitions_df[['competition_name', 'season_name']]
    competition_list = name_and_season.values.tolist()

---

### Calculates new parameters

* players_in_shot_keeper_cone
* opponent_players_in_shot_keeper_cone
* goalkeeper_in_shot_keeper_cone
* pressing
* players_in_the_box


---

In [4]:
# Calculates keeper cone attributes - Shot Block (SB)

def is_point_in_keeper_cone(point, keeper_cone_points):
    """Tell whether a location touches the keeper cone.

    point: coordinates handed to shapely's ``Point``.
    keeper_cone_points: vertex sequence handed to shapely's ``Polygon``.

    Returns the result of ``Polygon.intersects``, so a point lying on the
    cone's edge counts as inside.
    """
    cone = shp_Polygon(keeper_cone_points)
    return cone.intersects(Point(point))

def calculate_keeper_cone(row):
    """Return the three vertices of the keeper cone for one shot.

    row: mapping-like object exposing the shooter position under 'x' and 'y'.

    Returns a list of (x, y) tuples: the shooter position followed by the
    two goal end points (120, 44) and (120, 36).
    """
    # Goal coordinates
    goal_line_x = 120
    goal_upper_y, goal_lower_y = 44, 36

    shooter = (row['x'], row['y'])

    return [shooter, (goal_line_x, goal_upper_y), (goal_line_x, goal_lower_y)]

def calculates_keeper_cone_attributes(df):
    """Add the keeper-cone columns to every shot row.

    For each row a triangle is built from the vertices returned by
    ``calculate_keeper_cone(row)`` and every player in ``shot_freeze_frame``
    is tested against it with shapely's ``intersects`` (edge contact counts).

    Columns written to ``df``:
      * players_in_shot_keeper_cone - freeze-frame players touching the cone
      * opponent_players_in_shot_keeper_cone - the non-teammates among them
      * goalkeeper_in_shot_keeper_cone - True when the first non-teammate
        whose position_name is 'Goalkeeper' touches the cone

    ``shot_freeze_frame`` may hold a list of player dicts or the string
    representation of such a list. ``df`` is modified in place and returned.
    """

    def count_players_in_keeper_cone(row):

        # The cone depends only on the shooter, so it is built once per shot
        # instead of once per tested player
        keeper_cone = shp_Polygon(calculate_keeper_cone(row))

        def touches_cone(player):
            return keeper_cone.intersects(Point(player['location']))

        # Parses freeze_frame to list if string
        # (presumably the case after the CSV has been reloaded - verify against caller)
        freeze_frame = row['shot_freeze_frame']
        if isinstance(freeze_frame, str):
            freeze_frame = ast.literal_eval(freeze_frame)

        players_in_keeper_cone = 0
        opponent_players_in_keeper_cone = 0

        # Counts players in cone
        for player in freeze_frame:

            if touches_cone(player):

                players_in_keeper_cone += 1

                if not player['teammate']:

                    opponent_players_in_keeper_cone += 1

        # First opposing player flagged as goalkeeper, if any
        goalkeeper = next(
            (
                player for player in freeze_frame
                if not player['teammate'] and player.get('position_name') == 'Goalkeeper'
            ),
            None
        )

        goalkeeper_in_shot_keeper_cone = bool(goalkeeper) and bool(touches_cone(goalkeeper))

        return pd.Series({
            'players_in_shot_keeper_cone': players_in_keeper_cone,
            'opponent_players_in_shot_keeper_cone': opponent_players_in_keeper_cone,
            'goalkeeper_in_shot_keeper_cone': goalkeeper_in_shot_keeper_cone
        })

    df[['players_in_shot_keeper_cone', 'opponent_players_in_shot_keeper_cone', 'goalkeeper_in_shot_keeper_cone']] = df.apply(count_players_in_keeper_cone, axis=1)

    return df

In [5]:
# Calculates pressing - Defensive Congestion Index (DCI)

def calculate_pressing(df):
    """Add a ``pressing`` column: opponents close to the shooter.

    Two circular sectors are centred on the shooter position ('x', 'y'):
      * a 110-degree sector of radius 3.5 opening towards the goal
        centre (120, 40)
      * the remaining 250-degree sector of radius 1.75

    ``pressing`` is the number of non-teammates in ``shot_freeze_frame``
    touching the first sector plus the number touching the second one
    (shapely ``intersects``, so edge contact counts).

    ``shot_freeze_frame`` may hold a list of player dicts or the string
    representation of such a list. ``df`` is modified in place and returned.
    """

    goal_center_x, goal_center_y = 120, 40

    def generate_pressing_areas(center, radius, start_angle, end_angle, step=1):

        # Adjustment for an arc that wraps past the start angle
        if end_angle < start_angle:

            end_angle += 360

        # linspace pins the arc to exactly [start_angle, end_angle]. The previous
        # np.arange(start, end + step, step) could emit one extra vertex beyond
        # end_angle because of floating-point rounding, which widened the sector
        # and made the two sectors overlap.
        number_of_points = int(round((end_angle - start_angle) / step)) + 1
        arc_angles = np.radians(np.linspace(start_angle, end_angle, number_of_points))

        points = [
            (
                center[0] + radius * np.cos(a),
                center[1] + radius * np.sin(a)
            )
            for a in arc_angles
        ]

        return shp_Polygon([center] + points + [center])

    def calculate_pressing_areas(row):

        shooter_x = row['x']
        shooter_y = row['y']
        center = (shooter_x, shooter_y)

        # Calculates the angle relative to the center of the goal
        angle = np.arctan2(goal_center_y - shooter_y, goal_center_x - shooter_x) * 180 / np.pi

        # Creates a pressing area in front of the striker
        bigger_area = generate_pressing_areas(center, 3.5, angle - 55, angle + 55)

        # Creates a pressing area behind the striker
        smaller_area = generate_pressing_areas(center, 1.75, angle + 55, angle - 55)

        # Parses freeze_frame to list if string
        freeze_frame = row['shot_freeze_frame']
        if isinstance(freeze_frame, str):
            freeze_frame = ast.literal_eval(freeze_frame)

        # Opponent locations are converted to shapely points once and reused
        # for both sectors
        opponent_points = [
            Point(player['location'])
            for player in freeze_frame
            if not player['teammate']
        ]

        # Counts opponent players in areas
        number_of_players_in_bigger_area = sum(
            1 for opponent in opponent_points if bigger_area.intersects(opponent)
        )

        number_of_players_in_smaller_area = sum(
            1 for opponent in opponent_points if smaller_area.intersects(opponent)
        )

        return number_of_players_in_bigger_area + number_of_players_in_smaller_area

    df['pressing'] = df.apply(calculate_pressing_areas, axis=1)

    return df

In [6]:
# Calculates opponent players in the box

def is_player_in_the_box(point):
    """Tell whether a location lies inside the box.

    point: indexable location whose first item is x and second item is y.

    Returns True when x >= 102 and 18 <= y <= 62 (bounds inclusive; no upper
    limit is applied to x).
    """
    box_min_x = 102
    box_min_y, box_max_y = 18, 62

    return point[0] >= box_min_x and box_min_y <= point[1] <= box_max_y

def calculate_players_in_the_box(df):
    """Add ``players_in_the_box``: opponents inside the box for each shot.

    For every row the players in ``shot_freeze_frame`` that are not
    teammates and whose location satisfies ``is_player_in_the_box`` are
    counted. ``shot_freeze_frame`` may hold a list of player dicts or the
    string representation of such a list. ``df`` is modified in place and
    returned.
    """

    def count_players_in_the_box(row):

        raw_frame = row['shot_freeze_frame']

        # Parses freeze_frame to list if string
        players = ast.literal_eval(raw_frame) if isinstance(raw_frame, str) else raw_frame

        # Counts opponent players in the box
        return sum(
            1 for player in players
            if not player['teammate'] and is_player_in_the_box(player['location'])
        )

    df['players_in_the_box'] = df.apply(count_players_in_the_box, axis=1)

    return df

In [9]:
# Creates a match list: one events DataFrame per match, tagged with match
# and competition metadata

all_matches = []

# The competition table is fetched once and reused for every lookup
# (previously sb.competitions() was called again on each loop iteration)
available_competitions = sb.competitions()

for comp in competition_list:

    comp_name, comp_season = comp[0], comp[1]

    competition = available_competitions[
        (available_competitions['competition_name'] == comp_name)
        & (available_competitions['season_name'] == comp_season)
    ]

    # Fails with a readable message instead of an IndexError from .iloc[0]
    if competition.empty:
        raise ValueError(
            f"Competition {comp_name!r} / season {comp_season!r} "
            "was not found in sb.competitions()"
        )

    competition_id = competition.competition_id.iloc[0]
    season_id = competition.season_id.iloc[0]
    gender = competition.competition_gender.iloc[0]
    competition_name = competition.competition_name.iloc[0]
    season_name = competition.season_name.iloc[0]

    matches = sb.matches(competition_id=competition_id, season_id=season_id)

    # Reads id and team names in one pass instead of re-filtering `matches`
    # for every match
    match_rows = zip(matches['match_id'], matches['home_team'], matches['away_team'])

    for match_id, home_team, away_team in match_rows:

        temp_match = sb.events(match_id=match_id)

        temp_match['home_team_name'] = home_team
        temp_match['away_team_name'] = away_team
        temp_match['gender'] = gender
        temp_match['match_id'] = match_id
        temp_match['competition_name'] = competition_name
        temp_match['season_name'] = season_name

        all_matches.append(temp_match)

---

### Creates csv files

---

In [11]:
# Creates equal_dataframe.csv or powerbi_dataset.csv

# ignore_index gives every event row a unique label; without it the labels of
# the per-match frames are kept as-is and (presumably) repeat across matches,
# which makes later index-aligned column assignments fragile
df = pd.concat(all_matches, ignore_index=True)

# Keeps only the selected columns
if powerbi:

    df = df[['gender', 'location', 'match_id', 'minute', 'play_pattern',
             'player', 'player_id', 'shot_body_part', 'competition_name', 'position',
             'shot_outcome', 'shot_statsbomb_xg', 'team', 'shot_end_location',
             'shot_type', 'team_id', 'type', 'home_team_name', 'away_team_name', 'season_name']
             ]

    # Filters by Shot; .copy() detaches the result from the full events frame
    # so that new columns are written to an independent DataFrame
    df = df[df['type'] == 'Shot'].copy()

    # Takes player's nationality and nickname from lineup
    match_ids = df['match_id'].unique()
    lineup_data = []

    for match_id in match_ids:

        lineups = sb.lineups(match_id)

        # One lineup frame per team; the team name (dict key) is not needed
        for team_df in lineups.values():

            for _, player in team_df.iterrows():

                lineup_data.append({
                    'match_id': match_id,
                    'player_id': player['player_id'],
                    'player_country': player['country'],
                    'player_nickname': player['player_nickname']
                })

    lineup_df = pd.DataFrame(lineup_data)
    df = df.merge(lineup_df, on=['match_id', 'player_id'], how='left')

else:

    df = df[['gender', 'location', 'match_id', 'minute', 'play_pattern',
             'player', 'shot_aerial_won', 'shot_body_part', 'shot_freeze_frame',
             'shot_outcome', 'shot_statsbomb_xg', 'shot_technique',
             'shot_type', 'team_id', 'type', 'under_pressure']
             ]

    # Filters by Shot and removes penalty kicks; .copy() detaches the result
    # from the full events frame so that new columns are written to an
    # independent DataFrame
    df = df[(df['type'] == 'Shot') & (df['shot_type'] != 'Penalty')].copy()

# Splits location into the two columns x and y
df['x'] = df['location'].map(lambda location: location[0])
df['y'] = df['location'].map(lambda location: location[1])

# Goal coordinates
goal_center_x, goal_center_y = 120, 40
goal_top_y, goal_bottom_y = 44, 36

# Offsets from the shot to the goal line / goal centre
dx_to_goal = goal_center_x - df['x']
dy_to_center = df['y'] - goal_center_y

# Distance from the shot to the goal centre
df['distance'] = np.sqrt(dx_to_goal**2 + dy_to_center**2).round(2)

# Angle under which the goal mouth (goal_bottom_y..goal_top_y) is seen from
# the shot, converted from radians to degrees
angle_to_top = np.arctan2(goal_top_y - df['y'], dx_to_goal)
angle_to_bottom = np.arctan2(goal_bottom_y - df['y'], dx_to_goal)

df['angle_degrees'] = np.degrees(angle_to_top - angle_to_bottom).round(2)

# Removes unnecessary columns
df = df.drop(columns=['type', 'location'])

if not powerbi:

    def keep_player_essentials(freeze_frame):
        """Reduce every freeze-frame entry to location, teammate flag and position name."""
        return [
            {
                'location': item['location'],
                'teammate': item['teammate'],
                'position_name': item['position']['name']
            }
            for item in freeze_frame
        ]

    # Data about players
    # NOTE(review): assumes every remaining row holds an iterable freeze frame -
    # a missing value would fail here; confirm against the data
    df['shot_freeze_frame'] = df['shot_freeze_frame'].apply(keep_player_essentials)

    # Derived columns are added in this order: keeper cone, pressing, players in the box
    feature_steps = (
        calculates_keeper_cone_attributes,
        calculate_pressing,
        calculate_players_in_the_box,
    )

    for add_features in feature_steps:

        df = add_features(df)

    df.to_csv('equal_dataframe.csv', index=False)

else:

    df.to_csv('powerbi_dataset.csv', index=False)