<a href="https://colab.research.google.com/github/wswager/expected_goals/blob/main/data_organization/expected_goals_data_organization_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Capstone Project Submission**

* Student Name: Wes Swager
* Student Pace: Full Time
* Instructor Name: Claude Fried
* Scheduled Project Review Date/Time
    * Friday, June 11, 2021, 2:30pm CST

# Data Organization Notebook

# Packages

In [None]:
# Drive and IO to access saved data
from google.colab import drive, files
drive.mount('/content/drive')

import io

# Pandas for Dataframes
import pandas as pd

# NumPy for mathematical functions
import numpy as np

import warnings
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data

Data sourced from [StatsBomb Open Data](https://github.com/statsbomb/open-data)

Data extracted in [expected_goals_data_extraction_notebook](https://github.com/wswager/expected_goals/blob/main/data_extraction/expected_goals_data_extraction_notebook.ipynb)

In [None]:
# Load previously saved data from Google Drive.
#
# NOTE(review): the cells below read `events_shots_df` and `events_df`, but
# this cell only defines `organized_data`; those frames are not created
# anywhere in this notebook as shown -- confirm where they should be loaded.
# NOTE(review): this path has no 'flatiron' folder, unlike the paths this
# notebook writes to -- confirm which location is intended.
organized_data_path = '/content/drive/MyDrive/expected_goals/data_organization/organized_data.csv'
organized_data = pd.read_csv(organized_data_path)

# Extract Shot-Specific Data

In [None]:
# Keep only the event columns used for shot-level features.
# 'shot', 'player' and 'possession_team' hold nested dictionaries and
# 'location' holds a coordinate list; they are unpacked in later cells.
shot_columns = [
    'index',
    'timestamp',
    'shot',
    'location',
    'player',
    'possession_team',
]

shots_df = events_shots_df[shot_columns]

In [None]:
# Preview the first five shots
shots_df.head(n=5)

Unnamed: 0,index,timestamp,shot,location,player,possession_team
257,258,2021-06-11 00:04:38.609,"{'statsbomb_xg': 0.26615402, 'end_location': [...","[109.0, 46.0]","{'id': 4641, 'name': 'Francesca Kirby'}","{'id': 971, 'name': 'Chelsea FCW'}"
541,542,2021-06-11 00:11:45.046,"{'one_on_one': True, 'statsbomb_xg': 0.0935205...","[113.0, 35.0]","{'id': 15550, 'name': 'Bethany England'}","{'id': 971, 'name': 'Chelsea FCW'}"
613,614,2021-06-11 00:18:03.461,"{'statsbomb_xg': 0.036171142, 'end_location': ...","[94.0, 43.0]","{'id': 4638, 'name': 'Drew Spence'}","{'id': 971, 'name': 'Chelsea FCW'}"
876,877,2021-06-11 00:23:11.935,"{'statsbomb_xg': 0.016625367000000002, 'end_lo...","[86.0, 34.0]","{'id': 10193, 'name': 'Chloe Arthur'}","{'id': 969, 'name': 'Birmingham City WFC'}"
891,892,2021-06-11 00:23:45.810,"{'statsbomb_xg': 0.030716168000000002, 'end_lo...","[94.0, 33.0]","{'id': 15550, 'name': 'Bethany England'}","{'id': 971, 'name': 'Chelsea FCW'}"


In [None]:
# Save the shot subset to Google Drive
shots_df_path = '/content/drive/MyDrive/flatiron/expected_goals/data_organization/shots_df.csv'
shots_df.to_csv(shots_df_path)

# Extract Features from Nested Dictionaries

## Shot-Specific Features

In [None]:
# Build `extracted_data`, one row per shot, by unpacking the nested
# dictionaries stored in shots_df.


def _nested_values(records, *keys):
    """Follow `keys` into every record and collect the values found.

    Parameters
    ----------
    records : iterable of dict
        Nested dictionaries, e.g. the values of shots_df['shot'].
    *keys : str
        Keys applied in order; ('outcome', 'name') reads
        record['outcome']['name'].

    Returns
    -------
    list
        One value per record, in the order of `records`.

    Raises
    ------
    KeyError
        If any key is missing from a record.
    """
    values = []
    for record in records:
        value = record
        for key in keys:
            value = value[key]
        values.append(value)
    return values


shot_records = shots_df['shot']

# Shot location: each entry is split into x and y columns.
# Plain lists are used throughout so values are assigned by position
# rather than aligned on shots_df's original event index.
extracted_data = pd.DataFrame(shots_df['location'].tolist(),
                              columns=['location_x',
                                       'location_y'])

# Shot timestamp
extracted_data['time'] = list(shots_df['timestamp'].values)

# StatsBomb's xG metric
extracted_data['statsbomb_xg'] = _nested_values(shot_records, 'statsbomb_xg')

# Outcome of shot
extracted_data['outcome'] = _nested_values(shot_records, 'outcome', 'name')

# Player who shot
extracted_data['player_shot'] = _nested_values(shots_df['player'], 'name')

# Team of the player who shot
extracted_data['team'] = _nested_values(shots_df['possession_team'], 'name')

# Body part used to shoot
extracted_data['bodypart'] = _nested_values(shot_records, 'body_part', 'name')

# Technique used for shot
extracted_data['technique'] = _nested_values(shot_records, 'technique', 'name')

# Whether the shot was taken with the player's first touch.
# The key is only present on some shots, so a missing key means False;
# dict.get handles exactly that case instead of a bare `except` that
# would hide unrelated errors.
extracted_data['first_time'] = [shot.get('first_time', False)
                                for shot in shot_records]

# State of play
extracted_data['state_of_play'] = _nested_values(shot_records, 'type', 'name')

In [None]:
# Preview the extracted shot features
extracted_data.head(n=5)

Unnamed: 0,location_x,location_y,time,statsbomb_xg,outcome,player_shot,team,bodypart,technique,first_time,state_of_play
0,109.0,46.0,2021-06-11 00:04:38.609,0.266154,Blocked,Francesca Kirby,Chelsea FCW,Left Foot,Normal,False,Open Play
1,113.0,35.0,2021-06-11 00:11:45.046,0.093521,Off T,Bethany England,Chelsea FCW,Head,Normal,False,Open Play
2,94.0,43.0,2021-06-11 00:18:03.461,0.036171,Saved,Drew Spence,Chelsea FCW,Left Foot,Normal,False,Open Play
3,86.0,34.0,2021-06-11 00:23:11.935,0.016625,Off T,Chloe Arthur,Birmingham City WFC,Left Foot,Normal,False,Open Play
4,94.0,33.0,2021-06-11 00:23:45.810,0.030716,Off T,Bethany England,Chelsea FCW,Right Foot,Normal,False,Open Play


## Assist-Specific Features

In [None]:
# Extract features describing the pass which led to each shot.
# A shot's 'key_pass_id', when present, is matched against events_df['id'].
#
# NOTE(review): `events_df` is not defined in any cell of this notebook as
# shown -- confirm it is loaded before this cell runs.


def _name_or_nan(record, key):
    """Return record[key]['name'], or NaN when the key is missing."""
    try:
        return record[key]['name']
    except KeyError:
        return np.nan


def _pass_type(pass_info):
    """Classify a pass dict as 'Cross', 'Cut Back' or 'Through Ball'.

    The keys are checked in that order; NaN is returned when none is present.
    """
    if 'cross' in pass_info:
        return 'Cross'
    if 'cut_back' in pass_info:
        return 'Cut Back'
    if 'through_ball' in pass_info:
        return 'Through Ball'
    return np.nan


# Build the id lookups once, so each shot needs a dictionary lookup instead
# of a boolean scan over the whole of events_df. The first event is kept if
# an id is repeated, matching a positional "first match" selection.
_unique_events = events_df.drop_duplicates(subset='id')
pass_by_id = dict(zip(_unique_events['id'], _unique_events['pass']))
play_pattern_by_id = dict(zip(_unique_events['id'],
                              _unique_events['play_pattern']))

assist_list = []                # pass height
assist2_list = []               # pass technique
assist3_list = []               # cross / cut back / through ball
assist_state_of_play_list = []  # play pattern of the key pass

for shot in shots_df['shot']:
    # Shots without a key pass, or whose key pass id has no matching event,
    # produce NaN in every assist column.
    key_pass_id = shot.get('key_pass_id')
    pass_info = pass_by_id.get(key_pass_id)
    play_pattern = play_pattern_by_id.get(key_pass_id)

    # The matched event may hold NaN instead of a pass dictionary
    if isinstance(pass_info, dict):
        assist_list.append(_name_or_nan(pass_info, 'height'))
        assist2_list.append(_name_or_nan(pass_info, 'technique'))
        assist3_list.append(_pass_type(pass_info))
    else:
        assist_list.append(np.nan)
        assist2_list.append(np.nan)
        assist3_list.append(np.nan)

    if isinstance(play_pattern, dict):
        assist_state_of_play_list.append(play_pattern.get('name', np.nan))
    else:
        assist_state_of_play_list.append(np.nan)

# Add pass features to dataframe

# Type of pass which led to the shot
extracted_data['assist'] = assist_list

# Second alternative source for type of pass which led to the shot
extracted_data['assist2'] = assist2_list

# Third alternative source for type of pass which led to the shot
extracted_data['assist3'] = assist3_list

# State of play for pass which led to the shot
extracted_data['assist_state_of_play'] = assist_state_of_play_list

In [None]:
# Preview the feature table including the assist columns
extracted_data.head(n=5)

In [None]:
# Save the extracted feature table to Google Drive.
# `organized_data` is another name for the same DataFrame object as
# `extracted_data`; no copy is made.
organized_data = extracted_data
organized_data_out_path = '/content/drive/MyDrive/flatiron/expected_goals/data_organization/organized_data.csv'
organized_data.to_csv(organized_data_out_path)

Continued in [expected_goals_feature_engineering_notebook](https://github.com/wswager/expected_goals/blob/main/feature_engineering/expected_goals_data_feature_engineering_notebook.ipynb)