<a href="https://colab.research.google.com/github/wswager/expected_goals/blob/main/data_preprocessing/expected_goals_data_preprocessing_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**Capstone Project Submission**

* Student Name: Wes Swager
* Student Pace: Full Time
* Instructor Name: Claude Fried
* Scheduled Project Review Date/Time
    * Friday, June 11, 2:30pm CST

# Data Preprocessing Notebook

<a id = 'packages'></a>
# Packages

In [None]:
# Google Drive and IO to access saved data
from google.colab import drive, files
drive.mount('/content/drive')

import io

# Pandas for DataFrames
import pandas as pd

# NumPy for mathematical functions
import numpy as np

# Scikit-learn for preprocessing (encoding and scaling)
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

# Silence all warnings for the remainder of the notebook
import warnings
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<a id = 'data'></a>
# Data

1. Data sourced from [StatsBomb Open Data](https://github.com/statsbomb/open-data)
2. Data extracted in [expected_goals_data_extraction_notebook](https://github.com/wswager/expected_goals/blob/main/data_extraction/expected_goals_data_extraction_notebook.ipynb)
3. Data organized in [expected_goals_data_organization_notebook](https://github.com/wswager/expected_goals/blob/main/data_organization/expected_goals_data_organization_notebook.ipynb)
4. Features engineered in [expected_goals_feature_engineering_notebook](https://github.com/wswager/expected_goals/blob/main/feature_engineering/expected_goals_feature_engineering_notebook.ipynb)
5. Data cleaned in [expected_goals_data_cleaning_notebook](https://github.com/wswager/expected_goals/blob/main/data_cleaning/expected_goals_data_cleaning_notebook.ipynb)

In [None]:
# Load the cleaned dataframe saved under data_cleaning
# (see expected_goals_data_cleaning_notebook)

cleaned_data_path = '/content/drive/MyDrive/flatiron/expected_goals/data_cleaning/cleaned_data.csv'

cleaned_data = pd.read_csv(cleaned_data_path)

In [None]:
# Drop the leftover index column from the saved CSV.
# NOTE(review): assumes the column is named 'Unnamed: 0' (the name pandas
# gives an unlabeled index column written by to_csv) - confirm against the
# cleaning notebook.
# Dropping by name instead of by position keeps this cell idempotent:
# re-running it never strips a real data column.
cleaned_data = cleaned_data.drop(columns = ['Unnamed: 0'],
                                 errors = 'ignore')

In [None]:
# Preview the first five rows of the loaded data
cleaned_data.head(5)

Unnamed: 0,statsbomb_xg,goal,player_shot,team,location_x,location_y,shot_distance,shot_angle,bodypart,technique,first_time,state_of_play,assist,assist_state_of_play
0,0.266154,False,Francesca Kirby,Chelsea FCW,109.0,46.0,12.529964,118.61,Left Foot,Normal Shot,False,Open Play Shot,Ground Pass,Open Play Assist
1,0.093521,False,Bethany England,Chelsea FCW,113.0,35.0,8.602325,54.46,Head,Normal Shot,False,Open Play Shot,High Pass,Set Piece - Free Kick
2,0.036171,False,Drew Spence,Chelsea FCW,94.0,43.0,26.172505,96.58,Left Foot,Normal Shot,False,Open Play Shot,Ground Pass,Open Play Assist
3,0.016625,False,Chloe Arthur,Birmingham City WFC,86.0,34.0,34.525353,79.99,Left Foot,Normal Shot,False,Open Play Shot,Ground Pass,Set Piece - Goal Kick
4,0.030716,False,Bethany England,Chelsea FCW,94.0,33.0,26.925824,74.93,Right Foot,Normal Shot,False,Open Play Shot,Ground Pass,Set Piece - Goal Kick


# Encode Categorical Variables

In [None]:
# Columns removed before encoding:
#   - 'player_shot' and 'team' are unique variables
#   - 'statsbomb_xg' is dropped as modeling will generate new xG
#   - 'location_x' and 'location_y' were utilized for feature
#     engineering, but are no longer useful independently

columns_to_drop = ['statsbomb_xg',
                   'player_shot',
                   'team',
                   'location_x',
                   'location_y']

cleaned_data = cleaned_data.drop(columns = columns_to_drop)

In [None]:
# Preview the first five rows after dropping the unused columns
cleaned_data.head(5)

Unnamed: 0,goal,shot_distance,shot_angle,bodypart,technique,first_time,state_of_play,assist,assist_state_of_play
0,False,12.529964,118.61,Left Foot,Normal Shot,False,Open Play Shot,Ground Pass,Open Play Assist
1,False,8.602325,54.46,Head,Normal Shot,False,Open Play Shot,High Pass,Set Piece - Free Kick
2,False,26.172505,96.58,Left Foot,Normal Shot,False,Open Play Shot,Ground Pass,Open Play Assist
3,False,34.525353,79.99,Left Foot,Normal Shot,False,Open Play Shot,Ground Pass,Set Piece - Goal Kick
4,False,26.925824,74.93,Right Foot,Normal Shot,False,Open Play Shot,Ground Pass,Set Piece - Goal Kick


## Label Encode Boolean Variables

In [None]:
# Boolean variables = 'goal' and 'first_time'
#
# Cast the booleans directly to integers so that False is always 0 and
# True is always 1. LabelEncoder numbers classes by the sorted values it
# observes, so a column containing only True would be encoded as 0, and
# it is documented as an encoder for target values rather than features.
# Assumes both columns are boolean with no missing values, as shown in
# the preview above.

cleaned_data['goal'] = cleaned_data['goal'].astype('int64')
cleaned_data['first_time'] = cleaned_data['first_time'].astype('int64')

In [None]:
# Preview the first five rows after encoding 'goal' and 'first_time'
cleaned_data.head(5)

Unnamed: 0,goal,shot_distance,shot_angle,bodypart,technique,first_time,state_of_play,assist,assist_state_of_play
0,0,12.529964,118.61,Left Foot,Normal Shot,0,Open Play Shot,Ground Pass,Open Play Assist
1,0,8.602325,54.46,Head,Normal Shot,0,Open Play Shot,High Pass,Set Piece - Free Kick
2,0,26.172505,96.58,Left Foot,Normal Shot,0,Open Play Shot,Ground Pass,Open Play Assist
3,0,34.525353,79.99,Left Foot,Normal Shot,0,Open Play Shot,Ground Pass,Set Piece - Goal Kick
4,0,26.925824,74.93,Right Foot,Normal Shot,0,Open Play Shot,Ground Pass,Set Piece - Goal Kick


## One Hot Encode Categorical Variables

In [None]:
# The categorical variables ('bodypart', 'technique', 'state_of_play',
# 'assist', 'assist_state_of_play') are the columns left once the
# boolean and numerical variables are removed

non_categorical_variables = ['goal',
                             'shot_distance',
                             'shot_angle',
                             'first_time']

categorical_data = cleaned_data.drop(columns = non_categorical_variables)

In [None]:
# One Hot Encode categorical_data
#
# The dummy dtype is pinned to uint8 so the encoded columns are 0/1 on
# every pandas version (pandas 2.0 changed the get_dummies default dtype
# from uint8 to bool, which would write True/False to the saved CSV).

encoded_bodypart = pd.get_dummies(categorical_data['bodypart'],
                                  dtype = 'uint8')

encoded_technique = pd.get_dummies(categorical_data['technique'],
                                   dtype = 'uint8')

encoded_state_of_play = pd.get_dummies(categorical_data['state_of_play'],
                                       dtype = 'uint8')

encoded_assist = pd.get_dummies(categorical_data['assist'],
                                dtype = 'uint8')

# 'No Assist' is the only dummy column removed.
# NOTE(review): presumably because it duplicates the 'Unassisted' column
# produced from 'assist' - confirm.
# NOTE(review): the displayed output contains both 'Open Play - Assist'
# and 'Open Play Assist'; these look like two spellings of one category
# and may need merging in the cleaning notebook.
encoded_assist_state_of_play = pd.get_dummies(categorical_data['assist_state_of_play'],
                                              dtype = 'uint8')
encoded_assist_state_of_play = encoded_assist_state_of_play.drop(columns = 'No Assist')

# join() raises if two encoded frames share a column name, so a category
# label repeated across variables cannot be silently duplicated
encoded_categories = (encoded_bodypart
                      .join(encoded_technique)
                      .join(encoded_state_of_play)
                      .join(encoded_assist)
                      .join(encoded_assist_state_of_play))

In [None]:
# Preview the first five rows of the one hot encoded categories
encoded_categories.head(5)

Unnamed: 0,Head,Left Foot,Other Bodypart,Right Foot,Half Volley,Normal Shot,Volley,Open Play Shot,Set Piece - Direct Free Kick,Set Piece - Penalty,Cross,Cut Back,Ground Pass,High Pass,Low Pass,Through Ball,Unassisted,Open Play - Assist,Open Play - Counter Attack,Open Play - Keeper,Open Play Assist,Set Piece - Corner,Set Piece - Free Kick,Set Piece - Goal Kick,Set Piece - Kick Off,Set Piece - Throw In
0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


# Scale Numerical Variables

In [None]:
# Numerical data = 'shot_distance' and 'shot_angle'
# Rescale each numerical variable to the (0, 1) feature range
# NOTE(review): the scaler is fit on every row loaded in this notebook;
# if a train/test split is made downstream, confirm that is intended.

numerical_variables = ['shot_distance', 'shot_angle']

scaler = MinMaxScaler(feature_range = (0, 1))
scaler.fit(cleaned_data[numerical_variables])

cleaned_data[numerical_variables] = scaler.transform(cleaned_data[numerical_variables])

# Create New Dataframe from Preprocessed Data

In [None]:
# Combine the one hot encoded categories with the scaled numerical
# variables and the encoded boolean variables.
#
# assign() returns a new dataframe, so encoded_categories is left
# untouched. A plain `encoded_data = encoded_categories` only creates a
# second name for the same object, and the columns added below would
# then appear in encoded_categories as well.
encoded_data = encoded_categories.assign(
    shot_distance = cleaned_data['shot_distance'],
    shot_angle = cleaned_data['shot_angle'],
    goal = cleaned_data['goal'],
    first_time = cleaned_data['first_time'],
)

In [None]:
# Preview the first five rows of the combined preprocessed data
encoded_data.head(5)

Unnamed: 0,Head,Left Foot,Other Bodypart,Right Foot,Half Volley,Normal Shot,Volley,Open Play Shot,Set Piece - Direct Free Kick,Set Piece - Penalty,Cross,Cut Back,Ground Pass,High Pass,Low Pass,Through Ball,Unassisted,Open Play - Assist,Open Play - Counter Attack,Open Play - Keeper,Open Play Assist,Set Piece - Corner,Set Piece - Free Kick,Set Piece - Goal Kick,Set Piece - Kick Off,Set Piece - Throw In,shot_distance,shot_angle,goal,first_time
0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0.175922,0.658944,0,0
1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0.115995,0.302556,0,0
2,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0.384077,0.536556,0,0
3,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.511523,0.444389,0,0
4,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.395571,0.416278,0,0


In [None]:
# Save the preprocessed dataframe to Drive.
# The dataframe index is written as the first CSV column (to_csv
# default), so a reader of this file gets an extra leading column.
preprocessed_data_path = '/content/drive/MyDrive/flatiron/expected_goals/data_preprocessing/preprocessed_data.csv'

preprocessed_data = encoded_data
preprocessed_data.to_csv(preprocessed_data_path)

Continued in [expected_goals_data_modeling_notebook](https://github.com/wswager/expected_goals/blob/main/data_modeling/expected_goals_data_modeling_notebook.ipynb)