# Principal Component Analysis Assignment

In [243]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).

In [244]:
# Source of the PitchFX pitch-by-pitch data (Google Sheets CSV export).
PITCHFX_CSV_URL = 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv'
data = pd.read_csv(PITCHFX_CSV_URL)

### Keep only the pitch type and the numeric columns (excluding ID fields).

* Drop any remaining records that contain null values.
* Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.

In [245]:
# Print each column's dtype and non-null count to see which columns are
# numeric and which contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79931 entries, 0 to 79930
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gameString          79931 non-null  object 
 1   gameDate            79931 non-null  object 
 2   visitor             79931 non-null  object 
 3   home                79931 non-null  object 
 4   inning              79931 non-null  int64  
 5   side                79931 non-null  object 
 6   balls               79931 non-null  int64  
 7   strikes             79931 non-null  int64  
 8   outs                79931 non-null  int64  
 9   batterId            79931 non-null  int64  
 10  batterName          79931 non-null  object 
 11  batterHand          79931 non-null  object 
 12  batterPosition      79931 non-null  object 
 13  pitcherId           79931 non-null  int64  
 14  pitcherName         79931 non-null  object 
 15  pitcherHand         79931 non-null  object 
 16  time

In [246]:
# Keep every column whose dtype is not `object` (i.e. drop the string columns).
excluded_dtypes = ['object']
df = data.select_dtypes(exclude=excluded_dtypes)

In [247]:
# The target column is a string column, so it was removed by the dtype filter;
# pull it out of the raw frame separately.
pitch = data['pitchType']

In [248]:
# Re-attach the pitch type as the last column of the numeric frame.
parts = [df, pitch]
df_ = pd.concat(parts, axis='columns')
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79931 entries, 0 to 79930
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   inning              79931 non-null  int64  
 1   balls               79931 non-null  int64  
 2   strikes             79931 non-null  int64  
 3   outs                79931 non-null  int64  
 4   batterId            79931 non-null  int64  
 5   pitcherId           79931 non-null  int64  
 6   timesFaced          79931 non-null  int64  
 7   catcherId           79931 non-null  int64  
 8   umpireId            79931 non-null  int64  
 9   probCalledStrike    79921 non-null  float64
 10  releaseVelocity     79931 non-null  float64
 11  spinRate            73734 non-null  float64
 12  spinDir             79620 non-null  float64
 13  locationHoriz       79931 non-null  float64
 14  locationVert        79931 non-null  float64
 15  movementHoriz       79931 non-null  float64
 16  move

In [249]:
# Remove the ID fields, which the task statement excludes from the features.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
ID_COLUMNS = ['batterId', 'pitcherId', 'catcherId', 'umpireId']
df_ = df_.drop(columns=ID_COLUMNS, errors='ignore')

In [250]:
# Confirm the ID columns are gone and see which remaining columns have nulls.
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79931 entries, 0 to 79930
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   inning              79931 non-null  int64  
 1   balls               79931 non-null  int64  
 2   strikes             79931 non-null  int64  
 3   outs                79931 non-null  int64  
 4   timesFaced          79931 non-null  int64  
 5   probCalledStrike    79921 non-null  float64
 6   releaseVelocity     79931 non-null  float64
 7   spinRate            73734 non-null  float64
 8   spinDir             79620 non-null  float64
 9   locationHoriz       79931 non-null  float64
 10  locationVert        79931 non-null  float64
 11  movementHoriz       79931 non-null  float64
 12  movementVert        79931 non-null  float64
 13  battedBallAngle     14499 non-null  float64
 14  battedBallDistance  14499 non-null  float64
 15  pitchType           79931 non-null  object 
dtypes: f

In [251]:
# DataFrame.dropna() returns a new frame and leaves the original untouched, so
# the result must be assigned back. Without the assignment the rows containing
# nulls stay in df_ and flow into X, and PCA then fails on NaN input.
df_ = df_.dropna()
df_

Unnamed: 0,inning,balls,strikes,outs,timesFaced,probCalledStrike,releaseVelocity,spinRate,spinDir,locationHoriz,locationVert,movementHoriz,movementVert,battedBallAngle,battedBallDistance,pitchType
2,1,0,2,0,1,0.968,96.5,2127.170,198.816,0.389,2.266,-5.22,9.79,27.78,323.03,FF
5,1,0,2,1,1,0.321,98.3,2038.060,206.732,-0.206,1.430,-8.30,7.96,-15.32,121.39,FT
8,1,1,1,2,1,0.944,96.3,1909.360,203.540,-0.280,1.990,-6.03,7.98,-34.27,97.07,FF
11,1,1,1,0,1,0.753,92.5,2424.900,140.242,-0.523,1.858,9.10,10.75,-32.44,257.08,FT
23,1,2,2,0,1,0.974,92.1,2319.270,138.209,-0.666,2.667,6.46,9.35,2.02,305.16,FF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79902,8,1,0,1,1,0.720,84.1,692.172,109.742,0.482,1.579,3.63,1.25,38.89,51.92,SL
79907,8,1,2,2,1,0.000,84.0,551.530,131.627,1.374,1.263,2.27,1.97,16.58,104.32,SL
79911,9,0,2,0,1,0.297,87.9,896.178,129.103,0.622,3.149,3.66,2.93,-105.31,61.77,FC
79915,9,1,2,1,1,0.043,86.3,544.542,108.367,1.178,2.103,2.77,0.88,32.34,34.43,FC


In [116]:
# Split the cleaned frame into the target (pitch type) and the feature matrix.
target_column = 'pitchType'
y = df_[target_column]
X = df_.drop(columns=[target_column])

In [118]:
# Sanity check: features and target must have the same number of rows.
for part in (X, y):
    print(part.shape)

(13438, 15)
(13438,)


### Reduce the dimensionality of the data using PCA to two components.

Don't forget to scale.

In [145]:
# Standardize every feature so that no single column dominates the principal
# components through its scale alone.
scale = StandardScaler()
scaled_values = scale.fit_transform(X)
scaled = pd.DataFrame(scaled_values, index=df_.index)

In [252]:
# Fit a two-component PCA on the standardized features and project the data
# onto those two components in one step.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(scaled)

### Compute the explained variance for the new data set.

In [253]:
# Absolute variance captured by each component, then the same as a fraction
# of the total variance.
for variance_stat in (pca.explained_variance_, pca.explained_variance_ratio_):
    print(variance_stat)

[2.38766003 1.6238814 ]
[0.15916549 0.1082507 ]


### Generate a scatter plot that visualizes how the component values are distributed.

In [256]:
# Wrap the projected values in a frame that shares the row index of the
# scaled features.
row_index = scaled.index
components = pd.DataFrame(data=X_pca, index=row_index)

In [257]:
# Show the two component columns (labelled 0 and 1) for a quick visual check.
components

Unnamed: 0,0,1
2,-1.334168,-0.995767
5,-1.131012,1.452652
8,-1.398263,0.598154
11,0.277485,-1.146975
23,-0.108249,-1.558939
...,...,...
79902,1.660935,1.683845
79907,2.265141,2.650377
79911,1.196412,1.406667
79915,2.061587,2.079788


In [169]:
# Scatter the first principal component against the second, coloured by
# pitch type.
pitch_labels = df_['pitchType']
fig = px.scatter(components, x=0, y=1, color=pitch_labels)
fig.show()

### Create a line plot that shows the explained variance by number of components.

In [155]:
# Fit a PCA that keeps every component, then accumulate the per-component
# variance ratios to get the variance explained by the first k components.
pcaline = PCA().fit(scaled)
exp_var_cum = pcaline.explained_variance_ratio_.cumsum()

In [160]:
# Cumulative explained variance against the number of components kept
# (component counts start at 1).
component_counts = range(1, len(exp_var_cum) + 1)
variance_axis_labels = {'x': 'Number of components', 'y': 'Explained variance'}
px.area(x=component_counts, y=exp_var_cum, labels=variance_axis_labels)

### Iteratively train and score a Random Forest Classifier (to predict `pitchType`).

* In each iteration:
    * Increase the number of principal components (start with `2`)
    * Store the number of components and model score

In [238]:
# Component counts to evaluate: 2 up to, but not including, the number of
# feature columns (the range's upper bound is exclusive).
n_features = len(X.columns)
comps = [k for k in range(2, n_features)]
scores, components_ = [], []


In [239]:
# Project the standardized data once for every candidate component count.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot leave duplicate projections behind and misalign `components_`
# with `comps`.
components_ = [
    PCA(n_components=component).fit_transform(scaled)
    for component in comps
]

In [178]:
# NOTE(review): `model` is not referenced by any later cell visible here; the
# scoring loop builds its own classifier with the same settings.
model = RandomForestClassifier(max_depth=4, random_state=1)

In [240]:
# Train and score one random forest per projection. The score is computed on
# the same data the model was fitted on, i.e. it is a training score.
# `scores` is reset here so that re-running this cell does not append a second
# set of results and break the one-to-one pairing with `comps`.
scores = []
for comp in components_:
    rfc = RandomForestClassifier(max_depth=4, random_state=1)
    rfc.fit(comp, y)
    scores.append(rfc.score(comp, y))
  

In [260]:
# Pair each component count with its score. Building the list directly from
# zip() avoids loop variables: the previous loop used `y` as a loop variable,
# which overwrote the target Series `y` with a float and broke any later (or
# re-run) cell that fits a model on `y`.
pairs = list(zip(comps, scores))

In [261]:
# Show the (number of components, training score) pairs.
pairs

[(2, 0.5541747283821997),
 (3, 0.5496353624051198),
 (4, 0.5601279952373865),
 (5, 0.5634767078434291),
 (6, 0.5611698169370442),
 (7, 0.5520166691471945),
 (8, 0.5628813811579104),
 (9, 0.5909361512129782),
 (10, 0.5773924691174281),
 (11, 0.5696532222056854),
 (12, 0.5638487870218782),
 (13, 0.5503795207620181),
 (14, 0.5456913231135586)]

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

[0.5541747283821997,
 0.5496353624051198,
 0.5601279952373865,
 0.5634767078434291,
 0.5611698169370442,
 0.5520166691471945,
 0.5628813811579104,
 0.5909361512129782,
 0.5773924691174281,
 0.5696532222056854,
 0.5638487870218782,
 0.5503795207620181,
 0.5456913231135586]

### Create a line plot that shows the training scores across the different numbers of principal components.

In [266]:
# Training score of the random forest for each number of principal components.
score_axis_labels = {'x': 'Number of components', 'y': 'Scores'}
px.area(x=comps, y=scores, labels=score_axis_labels)

### Based on the analysis you conducted, what is the optimal number of principal components?

In [None]:
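# The optimal number of components is 9: it gives the highest training score
# (about 0.591) among the component counts tried (2 through 14).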