In [1]:
import pandas as pd

In [2]:
# Load the shot export. low_memory=False turns off pandas' chunked parsing,
# which avoids mixed-dtype inference within a column.
df = pd.read_csv("sb_short_30-05-1.csv", low_memory=False)
df.head()

Unnamed: 0,match_id,duration,id,index,location,minute,off_camera,out,period,play_pattern,...,shot_saved_off_target,shot_saved_to_post,shot_redirect,prev_shot_redirect,shot_follows_dribble,prev_pass_backheel,prev_goalkeeper_penalty_saved_to_post,shot_kick_off,x,y
0,3895302,0.052872,c577e730-b9f5-44f2-9257-9e7730c23d7b,436,"[100.4, 35.1]",6,,,1,From Free Kick,...,,,,,,,,,100.4,35.1
1,3895302,0.217872,bbc2c68d-c096-483d-abf4-32c0175a0f55,480,"[114.6, 33.5]",7,,,1,Regular Play,...,,,,,,,,,114.6,33.5
2,3895302,0.445768,12b5206b-9ed0-4b1e-9ec3-f2028187e09f,597,"[106.2, 55.8]",11,,,1,From Free Kick,...,,,,,,,,,106.2,55.8
3,3895302,0.085298,b2c3d59d-3bef-4f8a-ad86-26b69940c64e,684,"[113.9, 47.4]",13,,,1,From Corner,...,,,,,,,,,113.9,47.4
4,3895302,0.402989,bb53b537-1685-4019-9e8f-98f3805828eb,848,"[89.2, 42.5]",16,,,1,Regular Play,...,,,,,,,,,89.2,42.5


In [3]:
# (rows, columns) of the frame right after loading.
df.shape

(84981, 134)

In [4]:
import ast

def convert_to_list(s):
    """
    Convert a string representation of a list to an actual list.

    Values that are already lists are returned unchanged, so re-running the
    cell that applies this function does not replace previously converted
    data with empty lists. A parsed tuple is converted to a list. If the
    value cannot be parsed, or parses to anything else, an error message is
    printed and an empty list is returned.

    Parameters
    ----------
    s : str or list
        Text such as "[100.4, 35.1]", or an already-converted list.

    Returns
    -------
    list
        The parsed list, or [] when the conversion fails.
    """
    # Idempotence guard: literal_eval rejects list objects, which previously
    # turned every already-parsed value into [].
    if isinstance(s, list):
        return s

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, tuple):
        parsed = list(parsed)

    # Anything that is not a list at this point breaks the documented contract.
    if not isinstance(parsed, list):
        print(f"Could not convert {s} to list.")
        return []
    return parsed
    
# Discard rows missing either list-encoded field, then parse each field
# from its string form into a Python list.
df.dropna(subset=["prev_location", "shot_freeze_frame"], inplace=True)

for list_column in ("prev_location", "shot_freeze_frame"):
    df[list_column] = df[list_column].apply(convert_to_list)

In [5]:
def calculate_distances2(row):
    """
    Build per-player position and offset features for one row.

    Reads ``row['x']``, ``row['y']`` and ``row['shot_freeze_frame']``. The
    freeze frame is expected to be a list of dicts, each providing
    ``'location'`` (an x, y pair), ``'position'`` (a dict with ``'name'``)
    and ``'teammate'`` (truthy for the shooter's side). Entries are ranked by
    Euclidean distance to (x, y), nearest first.

    Parameters
    ----------
    row : mapping
        Anything indexable by the three keys above (e.g. a DataFrame row).

    Returns
    -------
    pd.Series
        Always holds ``goalkeeper_x``, ``goalkeeper_y``, ``goalkeeper_dx``
        and ``goalkeeper_dy`` (None when no non-teammate goalkeeper is
        present, or when the freeze frame is not a list). For the k-th
        nearest remaining teammate / non-teammate it also holds
        ``teammate_k_*`` / ``opposition_k_*`` with suffixes ``x``, ``y``,
        ``dx``, ``dy``, where ``dx``/``dy`` are the player's position minus
        the row's (x, y).
    """
    shot_x = row['x']
    shot_y = row['y']
    freeze_frame = row['shot_freeze_frame']

    # The goalkeeper keys are always present so every row shares them.
    columns = {
        'goalkeeper_x': None,
        'goalkeeper_y': None,
        'goalkeeper_dx': None,
        'goalkeeper_dy': None
    }

    if not isinstance(freeze_frame, list):
        print(f"Unexpected data format for shot_freeze_frame: {freeze_frame}")
        return pd.Series(columns)

    # Validate entries BEFORE sorting: the sort key indexes into each entry,
    # so a non-dict entry would raise inside the sort and a later check
    # would never get the chance to skip it.
    players = []
    for entry in freeze_frame:
        if isinstance(entry, dict):
            players.append(entry)
        else:
            print(f"Unexpected entry format: {entry}")

    # Squared distance gives the same ordering as the Euclidean distance.
    players.sort(
        key=lambda entry: (entry['location'][0] - shot_x) ** 2
        + (entry['location'][1] - shot_y) ** 2
    )

    # Insertion order matters: teammate columns are emitted before opposition.
    outfield = {'teammate': [], 'opposition': []}

    for entry in players:
        x, y = entry['location']
        features = {'x': x, 'y': y, 'dx': x - shot_x, 'dy': y - shot_y}
        is_teammate = entry['teammate']

        if entry['position']['name'] == 'Goalkeeper' and not is_teammate:
            for suffix, value in features.items():
                columns[f'goalkeeper_{suffix}'] = value
        elif is_teammate:
            outfield['teammate'].append(features)
        else:
            outfield['opposition'].append(features)

    # Flatten each nearest-first list into numbered columns (1 = nearest).
    for prefix, members in outfield.items():
        for rank, features in enumerate(members, start=1):
            for suffix, value in features.items():
                columns[f'{prefix}_{rank}_{suffix}'] = value

    return pd.Series(columns)

# Compute the freeze-frame features row by row; the per-row Series are
# aligned into one frame whose columns are the union of all keys.
distance_columns = df.apply(calculate_distances2, axis="columns")

# Attach the derived features next to the original columns.
df = pd.concat((df, distance_columns), axis="columns")


In [7]:
# Offset columns for slots 1-10 on each side, listed with slot 10 first to
# keep the existing column order.
slot_order = [10, *range(1, 10)]
offset_columns = [
    f"{side}_{slot}_{axis}"
    for side in ("opposition", "teammate")
    for slot in slot_order
    for axis in ("dx", "dy")
]
dfd = df.copy()[["x", "y", "goalkeeper_dx", "goalkeeper_dy", *offset_columns]]
dfd.head()

Unnamed: 0,x,y,goalkeeper_dx,goalkeeper_dy,opposition_10_dx,opposition_10_dy,opposition_1_dx,opposition_1_dy,opposition_2_dx,opposition_2_dy,...,teammate_5_dx,teammate_5_dy,teammate_6_dx,teammate_6_dy,teammate_7_dx,teammate_7_dy,teammate_8_dx,teammate_8_dy,teammate_9_dx,teammate_9_dy
0,100.4,35.1,18.0,3.4,,,1.5,0.0,5.0,-3.6,...,-10.6,-5.2,12.8,3.4,-5.9,-15.2,,,,
1,114.6,33.5,3.7,2.9,,,-0.3,1.1,-1.6,3.6,...,-15.5,26.6,,,,,,,,
2,106.2,55.8,12.0,-12.9,,,2.7,3.9,2.9,-5.0,...,-12.6,-28.3,,,,,,,,
3,113.9,47.4,5.0,-5.5,-4.4,-15.2,0.2,-1.0,-0.7,-0.8,...,1.8,-12.3,-18.0,2.0,2.1,-42.1,,,,
4,89.2,42.5,27.7,-2.4,,,2.7,1.3,3.1,-1.9,...,-2.5,16.6,13.6,16.1,8.0,-28.0,,,,


In [8]:
# Missing-value count for every selected offset column.
missing_counts = dfd.isna().sum()
print(missing_counts)

x                       0
y                       0
goalkeeper_dx          98
goalkeeper_dy          98
opposition_10_dx    70384
opposition_10_dy    70384
opposition_1_dx       147
opposition_1_dy       147
opposition_2_dx       299
opposition_2_dy       299
opposition_3_dx      1046
opposition_3_dy      1046
opposition_4_dx      3112
opposition_4_dy      3112
opposition_5_dx      6876
opposition_5_dy      6876
opposition_6_dx     13105
opposition_6_dy     13105
opposition_7_dx     23446
opposition_7_dy     23446
opposition_8_dx     38438
opposition_8_dy     38438
opposition_9_dx     55257
opposition_9_dy     55257
teammate_10_dx      83636
teammate_10_dy      83636
teammate_1_dx        1295
teammate_1_dy        1295
teammate_2_dx        5078
teammate_2_dy        5078
teammate_3_dx       13460
teammate_3_dy       13460
teammate_4_dx       27235
teammate_4_dy       27235
teammate_5_dx       43459
teammate_5_dy       43459
teammate_6_dx       59184
teammate_6_dy       59184
teammate_7_d

In [9]:
# Preview df now that the freeze-frame feature columns have been appended.
df.head()

Unnamed: 0,match_id,duration,id,index,location,minute,off_camera,out,period,play_pattern,...,teammate_7_x,teammate_7_y,teammate_8_dx,teammate_8_dy,teammate_8_x,teammate_8_y,teammate_9_dx,teammate_9_dy,teammate_9_x,teammate_9_y
0,3895302,0.052872,c577e730-b9f5-44f2-9257-9e7730c23d7b,436,"[100.4, 35.1]",6,,,1,From Free Kick,...,94.5,19.9,,,,,,,,
1,3895302,0.217872,bbc2c68d-c096-483d-abf4-32c0175a0f55,480,"[114.6, 33.5]",7,,,1,Regular Play,...,,,,,,,,,,
2,3895302,0.445768,12b5206b-9ed0-4b1e-9ec3-f2028187e09f,597,"[106.2, 55.8]",11,,,1,From Free Kick,...,,,,,,,,,,
3,3895302,0.085298,b2c3d59d-3bef-4f8a-ad86-26b69940c64e,684,"[113.9, 47.4]",13,,,1,From Corner,...,116.0,5.3,,,,,,,,
4,3895302,0.402989,bb53b537-1685-4019-9e8f-98f3805828eb,848,"[89.2, 42.5]",16,,,1,Regular Play,...,97.2,14.5,,,,,,,,


In [10]:
# Keep only rows that have offsets for the goalkeeper, teammate slots 1-2
# and opposition slots 1-4.
required_players = [
    "goalkeeper",
    "teammate_1", "teammate_2",
    "opposition_1", "opposition_2", "opposition_3", "opposition_4",
]
required_offsets = [
    f"{player}_{axis}" for player in required_players for axis in ("dx", "dy")
]
df.dropna(subset=required_offsets, inplace=True)

In [11]:
# Report how many values are missing in each column of df.
for column, n_missing in df.isna().sum().items():
    print(f"{column}: {n_missing}")

match_id: 0
duration: 0
id: 0
index: 0
location: 0
minute: 0
off_camera: 77516
out: 76253
period: 0
play_pattern: 0
player: 0
player_id: 0
position: 0
possession: 0
possession_team: 0
possession_team_id: 0
related_events: 0
second: 0
shot_aerial_won: 70729
shot_body_part: 0
shot_end_location: 0
shot_first_time: 53769
shot_freeze_frame: 0
shot_key_pass_id: 22448
shot_one_on_one: 74584
shot_outcome: 0
shot_statsbomb_xg: 0
shot_technique: 0
shot_type: 0
team: 0
team_id: 0
timestamp: 0
type: 0
under_pressure: 59106
prev_50_50: 77615
prev_bad_behaviour_card: 77617
prev_ball_recovery_recovery_failure: 77613
prev_carry_end_location: 41715
prev_clearance_body_part: 77616
prev_clearance_right_foot: 77616
prev_counterpress: 77581
prev_dribble_nutmeg: 77616
prev_dribble_outcome: 77565
prev_duel_outcome: 77599
prev_duel_type: 77595
prev_duration: 231
prev_foul_won_advantage: 77555
prev_foul_won_defensive: 76936
prev_foul_won_penalty: 77617
prev_goalkeeper_body_part: 77617
prev_goalkeeper_outcome: 

In [12]:
# Select the columns carried into the slim frame, then take an explicit copy.
# (Previously a full copy of df was made first and immediately overwritten by
# a selection from df, so that copy was wasted work.)
df_slim = df[[
    # shot location
    'x', 'y',
    # offsets relative to the shot location
    'goalkeeper_dx', 'goalkeeper_dy',
    'opposition_1_dx', 'opposition_1_dy', 'opposition_2_dx', 'opposition_2_dy',
    'opposition_3_dx', 'opposition_3_dy', 'opposition_4_dx', 'opposition_4_dy',
    'teammate_1_dx', 'teammate_1_dy', 'teammate_2_dx', 'teammate_2_dy',
    # absolute positions
    'goalkeeper_x', 'goalkeeper_y',
    'opposition_1_x', 'opposition_1_y', 'opposition_2_x', 'opposition_2_y',
    'opposition_3_x', 'opposition_3_y', 'opposition_4_x', 'opposition_4_y',
    'teammate_1_x', 'teammate_1_y', 'teammate_2_x', 'teammate_2_y',
    # shot descriptors and the preceding event
    'play_pattern', 'shot_aerial_won', 'shot_body_part', 'shot_first_time', 'shot_one_on_one',
    'shot_outcome', 'shot_technique', 'shot_statsbomb_xg', 'shot_type', 'under_pressure',
    'prev_type', 'prev_location',
]].copy()

df_slim.head()

Unnamed: 0,x,y,goalkeeper_dx,goalkeeper_dy,opposition_1_dx,opposition_1_dy,opposition_2_dx,opposition_2_dy,opposition_3_dx,opposition_3_dy,...,shot_body_part,shot_first_time,shot_one_on_one,shot_outcome,shot_technique,shot_statsbomb_xg,shot_type,under_pressure,prev_type,prev_location
0,100.4,35.1,18.0,3.4,1.5,0.0,5.0,-3.6,6.8,3.0,...,Right Foot,True,,Blocked,Normal,0.056644,Open Play,,Ball Recovery,"[101.7, 35.6]"
1,114.6,33.5,3.7,2.9,-0.3,1.1,-1.6,3.6,0.0,5.6,...,Left Foot,True,,Saved,Normal,0.143381,Open Play,True,Pass,"[111.0, 69.2]"
2,106.2,55.8,12.0,-12.9,2.7,3.9,2.9,-5.0,2.7,-9.6,...,Left Foot,True,,Blocked,Normal,0.038188,Open Play,,Ball Recovery,"[105.8, 56.5]"
3,113.9,47.4,5.0,-5.5,0.2,-1.0,-0.7,-0.8,-0.9,-2.3,...,Head,,,Blocked,Normal,0.052781,Open Play,,Pass,"[120.0, 0.1]"
4,89.2,42.5,27.7,-2.4,2.7,1.3,3.1,-1.9,1.9,9.0,...,Left Foot,,,Blocked,Normal,0.021272,Open Play,True,Carry,"[87.6, 46.0]"


In [13]:
# (rows, columns) of the slimmed frame.
df_slim.shape

(77617, 42)

In [14]:
# Missing-value count per column of the slim frame.
for column, n_missing in df_slim.isna().sum().items():
    print(f"{column}: {n_missing}")

x: 0
y: 0
goalkeeper_dx: 0
goalkeeper_dy: 0
opposition_1_dx: 0
opposition_1_dy: 0
opposition_2_dx: 0
opposition_2_dy: 0
opposition_3_dx: 0
opposition_3_dy: 0
opposition_4_dx: 0
opposition_4_dy: 0
teammate_1_dx: 0
teammate_1_dy: 0
teammate_2_dx: 0
teammate_2_dy: 0
goalkeeper_x: 0
goalkeeper_y: 0
opposition_1_x: 0
opposition_1_y: 0
opposition_2_x: 0
opposition_2_y: 0
opposition_3_x: 0
opposition_3_y: 0
opposition_4_x: 0
opposition_4_y: 0
teammate_1_x: 0
teammate_1_y: 0
teammate_2_x: 0
teammate_2_y: 0
play_pattern: 0
shot_aerial_won: 70729
shot_body_part: 0
shot_first_time: 53769
shot_one_on_one: 74584
shot_outcome: 0
shot_technique: 0
shot_statsbomb_xg: 0
shot_type: 0
under_pressure: 59106
prev_type: 0
prev_location: 0


In [15]:
# Renumber rows from 0; drop=True discards the old index instead of keeping it as a column.
df_slim = df_slim.reset_index(drop=True)

In [16]:
# Replace missing flag values with False.
# NOTE(review): presumably NaN means the flag was not set - confirm against the data spec.
for flag_column in ("shot_one_on_one", "shot_aerial_won", "shot_first_time", "under_pressure"):
    df_slim[flag_column] = df_slim[flag_column].fillna(False)

In [17]:
# Re-check missing values per column after filling the flag columns.
for column, n_missing in df_slim.isna().sum().items():
    print(f"{column}: {n_missing}")

x: 0
y: 0
goalkeeper_dx: 0
goalkeeper_dy: 0
opposition_1_dx: 0
opposition_1_dy: 0
opposition_2_dx: 0
opposition_2_dy: 0
opposition_3_dx: 0
opposition_3_dy: 0
opposition_4_dx: 0
opposition_4_dy: 0
teammate_1_dx: 0
teammate_1_dy: 0
teammate_2_dx: 0
teammate_2_dy: 0
goalkeeper_x: 0
goalkeeper_y: 0
opposition_1_x: 0
opposition_1_y: 0
opposition_2_x: 0
opposition_2_y: 0
opposition_3_x: 0
opposition_3_y: 0
opposition_4_x: 0
opposition_4_y: 0
teammate_1_x: 0
teammate_1_y: 0
teammate_2_x: 0
teammate_2_y: 0
play_pattern: 0
shot_aerial_won: 0
shot_body_part: 0
shot_first_time: 0
shot_one_on_one: 0
shot_outcome: 0
shot_technique: 0
shot_statsbomb_xg: 0
shot_type: 0
under_pressure: 0
prev_type: 0
prev_location: 0


In [18]:
# Boolean label: True exactly where the shot outcome is 'Goal'.
df_slim["goal"] = df_slim["shot_outcome"].eq("Goal")
df_slim.head(30)

Unnamed: 0,x,y,goalkeeper_dx,goalkeeper_dy,opposition_1_dx,opposition_1_dy,opposition_2_dx,opposition_2_dy,opposition_3_dx,opposition_3_dy,...,shot_first_time,shot_one_on_one,shot_outcome,shot_technique,shot_statsbomb_xg,shot_type,under_pressure,prev_type,prev_location,goal
0,100.4,35.1,18.0,3.4,1.5,0.0,5.0,-3.6,6.8,3.0,...,True,False,Blocked,Normal,0.056644,Open Play,False,Ball Recovery,"[101.7, 35.6]",False
1,114.6,33.5,3.7,2.9,-0.3,1.1,-1.6,3.6,0.0,5.6,...,True,False,Saved,Normal,0.143381,Open Play,True,Pass,"[111.0, 69.2]",False
2,106.2,55.8,12.0,-12.9,2.7,3.9,2.9,-5.0,2.7,-9.6,...,True,False,Blocked,Normal,0.038188,Open Play,False,Ball Recovery,"[105.8, 56.5]",False
3,113.9,47.4,5.0,-5.5,0.2,-1.0,-0.7,-0.8,-0.9,-2.3,...,False,False,Blocked,Normal,0.052781,Open Play,False,Pass,"[120.0, 0.1]",False
4,89.2,42.5,27.7,-2.4,2.7,1.3,3.1,-1.9,1.9,9.0,...,False,False,Blocked,Normal,0.021272,Open Play,True,Carry,"[87.6, 46.0]",False
5,110.2,32.6,9.0,5.9,0.7,-0.1,0.5,1.3,0.8,-1.2,...,False,False,Wayward,Normal,0.029664,Open Play,True,Pass,"[120.0, 80.0]",False
6,105.4,45.1,12.0,-3.4,0.7,-0.4,-3.0,0.0,-0.6,-6.0,...,False,False,Blocked,Normal,0.082293,Open Play,True,Carry,"[105.6, 48.9]",False
7,101.5,47.5,16.3,-6.4,1.9,-0.4,2.0,-9.0,7.9,-5.6,...,True,False,Blocked,Normal,0.05011,Open Play,False,Pass,"[120.0, 80.0]",False
8,116.3,46.0,2.3,-3.3,-1.1,-0.2,-0.7,-1.2,0.2,-1.8,...,False,False,Saved,Normal,0.109917,Open Play,True,Pass,"[120.0, 0.1]",False
9,116.3,43.3,2.5,-0.7,0.6,1.8,-2.5,-1.4,0.1,-3.5,...,True,False,Saved,Volley,0.323069,Open Play,False,Ball Recovery,"[116.5, 42.9]",False


In [19]:
# Split each prev_location pair into two columns, then cast both to float.
df_slim[["prev_x", "prev_y"]] = df_slim["prev_location"].to_list()
for coordinate in ("prev_x", "prev_y"):
    df_slim[coordinate] = df_slim[coordinate].astype(float)

In [20]:
# prev_location is now redundant with prev_x / prev_y.
df_slim.drop(columns=["prev_location"], inplace=True)

In [21]:
# Preview the slim frame after replacing prev_location with prev_x / prev_y.
df_slim.head()

Unnamed: 0,x,y,goalkeeper_dx,goalkeeper_dy,opposition_1_dx,opposition_1_dy,opposition_2_dx,opposition_2_dy,opposition_3_dx,opposition_3_dy,...,shot_one_on_one,shot_outcome,shot_technique,shot_statsbomb_xg,shot_type,under_pressure,prev_type,goal,prev_x,prev_y
0,100.4,35.1,18.0,3.4,1.5,0.0,5.0,-3.6,6.8,3.0,...,False,Blocked,Normal,0.056644,Open Play,False,Ball Recovery,False,101.7,35.6
1,114.6,33.5,3.7,2.9,-0.3,1.1,-1.6,3.6,0.0,5.6,...,False,Saved,Normal,0.143381,Open Play,True,Pass,False,111.0,69.2
2,106.2,55.8,12.0,-12.9,2.7,3.9,2.9,-5.0,2.7,-9.6,...,False,Blocked,Normal,0.038188,Open Play,False,Ball Recovery,False,105.8,56.5
3,113.9,47.4,5.0,-5.5,0.2,-1.0,-0.7,-0.8,-0.9,-2.3,...,False,Blocked,Normal,0.052781,Open Play,False,Pass,False,120.0,0.1
4,89.2,42.5,27.7,-2.4,2.7,1.3,3.1,-1.9,1.9,9.0,...,False,Blocked,Normal,0.021272,Open Play,True,Carry,False,87.6,46.0


In [22]:
# Write the slim frame to disk without the row index.
df_slim.to_csv("sb_30-05_slim_slim-1.csv", index=False)