# Principal Component Analysis Assignment

In [35]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).

In [2]:
# Source: CSV export of the PitchFX Google Sheet linked in the heading.
PITCHFX_CSV_URL = 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv'

# Read the sheet straight into a DataFrame and display it as the cell output.
data = pd.read_csv(PITCHFX_CSV_URL)
data

Unnamed: 0,gameString,gameDate,visitor,home,inning,side,balls,strikes,outs,batterId,batterName,batterHand,batterPosition,pitcherId,pitcherName,pitcherHand,timesFaced,catcherId,catcher,umpireId,umpire,probCalledStrike,pitchResult,pitchType,releaseVelocity,spinRate,spinDir,locationHoriz,locationVert,movementHoriz,movementVert,battedBallType,battedBallAngle,battedBallDistance,paResult
0,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,0,0,452655,Denard Span,L,CF,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,0.975,SL,FF,94.2,2044.22,205.477,-0.374,2.933,-6.93,8.28,,,,
1,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,1,0,452655,Denard Span,L,CF,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,0.745,SL,FT,97.1,1966.32,220.143,0.342,3.223,-7.48,7.35,,,,
2,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,2,0,452655,Denard Span,L,CF,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,0.968,IP,FF,96.5,2127.17,198.816,0.389,2.266,-5.22,9.79,FB,27.78,323.03,IP_OUT
3,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,0,1,605412,Joe Panik,L,2B,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,1.000,SL,FT,95.6,1947.11,198.734,-0.004,2.380,-7.24,8.40,,,,
4,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,1,1,605412,Joe Panik,L,2B,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,1.000,F,FF,95.6,1903.08,205.503,0.272,2.421,-6.79,9.37,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79926,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,1,1,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.989,F,FF,92.5,1802.52,170.921,0.124,2.782,1.44,8.95,,,,
79927,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,1,2,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.000,B,FF,93.5,1591.22,173.371,-1.577,3.571,0.92,7.85,,,,
79928,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,2,2,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.939,F,FC,88.3,743.92,151.086,0.357,2.942,1.91,3.42,,,,
79929,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,2,2,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.826,F,FF,93.6,2239.31,173.146,0.565,2.027,1.35,11.15,,,,


### Keep only the pitch type and the numeric columns (excluding ID fields).

* Drop any remaining records that contain null values.
* Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.

In [3]:
# Drop every column that contains at least one null value, rebinding `data`
# rather than mutating the frame in place (pandas discourages `inplace=True`;
# it offers no speed benefit and makes cell state harder to reason about).
# NOTE(review): the assignment text asks to drop *records* with nulls after
# selecting the columns of interest; this cell drops *columns* instead.
# Left as-is because later cells and outputs depend on it -- confirm intent.
data = data.dropna(axis="columns")

# Summarize the remaining columns, their non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79931 entries, 0 to 79930
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gameString       79931 non-null  object 
 1   gameDate         79931 non-null  object 
 2   visitor          79931 non-null  object 
 3   home             79931 non-null  object 
 4   inning           79931 non-null  int64  
 5   side             79931 non-null  object 
 6   balls            79931 non-null  int64  
 7   strikes          79931 non-null  int64  
 8   outs             79931 non-null  int64  
 9   batterId         79931 non-null  int64  
 10  batterName       79931 non-null  object 
 11  batterHand       79931 non-null  object 
 12  batterPosition   79931 non-null  object 
 13  pitcherId        79931 non-null  int64  
 14  pitcherName      79931 non-null  object 
 15  pitcherHand      79931 non-null  object 
 16  timesFaced       79931 non-null  int64  
 17  catcherId   

In [12]:
# Columns used as model inputs: game-situation counters plus the pitch's
# release velocity, location and movement measurements.
feature_columns = [
    'inning', 'balls', 'strikes', 'outs', 'timesFaced',
    'releaseVelocity', 'locationHoriz', 'locationVert',
    'movementHoriz', 'movementVert',
]

X = data[feature_columns]  # feature matrix
y = data['pitchType']      # classification target

### Reduce the dimensionality of the data using PCA to two components.

Don't forget to scale.

In [27]:
# Standardize the features first: PCA is driven by variance, so columns on
# larger numeric scales would otherwise dominate the components.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Project the standardized data onto two principal components.
# `random_state` pins the result: with the default svd_solver='auto',
# scikit-learn may choose the randomized solver (depending on version and
# data shape), which is stochastic unless seeded.
pca = PCA(n_components=2, random_state=1)
pca_comp = pca.fit_transform(X_std)

### Compute the explained variance for the new data set.

In [28]:
# Per-component explained variance of the currently fitted `pca` object.
# These are absolute variances, not fractions of the total; the fractions
# are available separately as `pca.explained_variance_ratio_`.
pca.explained_variance_

array([2.02793766, 1.38269509])

### Generate a scatter plot that visualizes how the component values are distributed.

In [39]:
# Scatter every pitch in the plane spanned by the two principal components.
# Axis labels and a title are set explicitly so the figure is readable on
# its own (the default axis names would just be "x" and "y").
fig = px.scatter(
    x=pca_comp[:, 0],
    y=pca_comp[:, 1],
    labels={"x": "Principal Component 1", "y": "Principal Component 2"},
    title="Pitches projected onto the first two principal components",
)
fig.show()

### Create a line plot that shows the explained variance by number of components.

In [40]:
# Refit PCA keeping ten components (one per input feature) so the full
# explained-variance profile is available, then project the scaled data.
pca = PCA(n_components=10).fit(X_std)
X_pca = pca.transform(X_std)

In [41]:
# Running total of the fraction of variance explained as components are added.
exp_var_cumul = pca.explained_variance_ratio_.cumsum()
component_counts = range(1, len(exp_var_cumul) + 1)

px.area(
    x=component_counts,
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
)

### Iteratively train and score a Random Forest Classifier (to predict `pitchType`).

* In each iteration:
    * Increase the number of principal components (start with `2`)
    * Store the number of components and model score

In [77]:
# Training accuracy of a random forest for each number of principal
# components; position k in the list corresponds to k + 1 components.
# NOTE(review): the assignment text says to start with 2 components; the
# sweep starts at 1 because later cells convert list position to component
# count by adding 1 -- confirm before changing the start.
scores = []

for n_components in range(1, X_std.shape[1] + 1):
    # Seed PCA so runs are repeatable even when scikit-learn selects the
    # randomized SVD solver for the low-component fits.
    pca = PCA(n_components=n_components, random_state=1)
    X_pca = pca.fit_transform(X_std)

    rf = RandomForestClassifier(max_depth=4, random_state=1)
    rf.fit(X_pca, y)

    # Score once and reuse the value; this is accuracy on the same data the
    # model was trained on, not a held-out estimate.
    score = rf.score(X_pca, y)
    print(score)
    scores.append(score)

0.5379014399919931
0.533847943851572
0.5467841012873603
0.5505373384544169
0.5484980795936495
0.5576684890718244
0.5623850571117589
0.5656879058187687
0.576809998623813
0.5736697901940424


### Create a line plot that shows the training scores across the different numbers of principal components.

In [78]:
# Plot the random forest's training accuracy against the number of principal
# components. The y-axis label names the plotted quantity (model score); it
# previously read "Explained Variance", carried over from the earlier plot.
px.area(
    x=range(1, len(scores) + 1),
    y=scores,
    labels={"x": "# Components", "y": "Training Accuracy"},
    title="Random forest training accuracy by number of principal components",
)

### Based on the analysis you conducted, what is the optimal number of principal components?

In [83]:
# Position of the first highest score; list positions are zero-based while
# component counts start at one, hence the + 1 when reporting.
best_position = scores.index(max(scores))
print(f'The optimal number of principal components is {best_position + 1}')

The optimal number of principal components is 9
