# Principal Component Analysis Assignment

In [32]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).

In [2]:
data = pd.read_csv('https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')

In [3]:
data

Unnamed: 0,gameString,gameDate,visitor,home,inning,side,balls,strikes,outs,batterId,batterName,batterHand,batterPosition,pitcherId,pitcherName,pitcherHand,timesFaced,catcherId,catcher,umpireId,umpire,probCalledStrike,pitchResult,pitchType,releaseVelocity,spinRate,spinDir,locationHoriz,locationVert,movementHoriz,movementVert,battedBallType,battedBallAngle,battedBallDistance,paResult
0,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,0,0,452655,Denard Span,L,CF,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,0.975,SL,FF,94.2,2044.22,205.477,-0.374,2.933,-6.93,8.28,,,,
1,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,1,0,452655,Denard Span,L,CF,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,0.745,SL,FT,97.1,1966.32,220.143,0.342,3.223,-7.48,7.35,,,,
2,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,2,0,452655,Denard Span,L,CF,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,0.968,IP,FF,96.5,2127.17,198.816,0.389,2.266,-5.22,9.79,FB,27.78,323.03,IP_OUT
3,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,0,1,605412,Joe Panik,L,2B,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,1.000,SL,FT,95.6,1947.11,198.734,-0.004,2.380,-7.24,8.40,,,,
4,gid_2016_04_04_sfnmlb_milmlb_1,2016-04-04,SF,MIL,1,T,0,1,1,605412,Joe Panik,L,2B,503449,Wily Peralta,R,1,518960,Jonathan Lucroy,427192,Brian Gorman,1.000,F,FF,95.6,1903.08,205.503,0.272,2.421,-6.79,9.37,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79926,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,1,1,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.989,F,FF,92.5,1802.52,170.921,0.124,2.782,1.44,8.95,,,,
79927,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,1,2,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.000,B,FF,93.5,1591.22,173.371,-1.577,3.571,0.92,7.85,,,,
79928,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,2,2,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.939,F,FC,88.3,743.92,151.086,0.357,2.942,1.91,3.42,,,,
79929,gid_2016_09_26_oakmlb_anamlb_1,2016-09-26,OAK,LAA,9,T,2,2,2,475174,Yonder Alonso,L,1B,457732,Andrew Bailey,R,1,542908,Jett Bandy,427315,Alfonso Marquez,0.826,F,FF,93.6,2239.31,173.146,0.565,2.027,1.35,11.15,,,,


In [4]:
data.pitchType.value_counts()

FF    29027
SL    12068
FT    10416
CH     8473
CU     6917
SI     5308
FC     4081
KC     1714
FS     1043
KN      604
UN      268
EP       11
SC        1
Name: pitchType, dtype: int64

### Keep only the pitch type and the numeric columns (exluding ID fields).

* Drop any remaining records that contain null values.
* Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.

In [5]:
print(data.shape)
data2 = data.loc[:,~data.columns.str.endswith('Id')]
print(data2.shape)
data3 = pd.concat([data2.pitchType, data2.select_dtypes(include='number')], axis=1).dropna(axis=0)
print(data3.shape)

(79931, 35)
(79931, 31)
(13438, 16)


In [6]:
y = data3.pitchType
X = data3.drop(columns=['pitchType'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1)

### Reduce the dimensionality of the data using PCA to two components.

Don't forget to scale.

In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

### Compute the explained variance for new data set.

In [15]:
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

[2.38765913 1.62380707]
[0.15916543 0.10824575]


### Generate a scatter plot that visualizes how the component values are distributed.

In [34]:
fig = px.scatter(pca_components, x=0, y=1)
fig.show()

### Create a line plot that shows the explained variance by number of components.

In [36]:
import plotly.express as px

pca = PCA()
pca.fit(X_scaled)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

### Iteratively train and score a Random Forest Classifier (to predict `pitchType`).

* In each iteration:
    * Increase the number of principal components (start with `2`)
    * Store the number of components and model score

In [28]:
model = RandomForestClassifier(max_depth=4, random_state=1)

In [80]:
scores = []

for i in range(2, 13):
  pca = PCA(n_components= i+1)
  pca.fit(X_scaled)
  X_pca = pca.transform(X_scaled)

  model = RandomForestClassifier(max_depth=4, random_state=1)
  model.fit(X_pca, y)
  scores.append(model.score(X_pca, y))

In [75]:
table = pd.DataFrame(scores, columns=['Model_scores'])
table

Unnamed: 0,Model_scores
0,0.548742
1,0.560277
2,0.563477
3,0.56117
4,0.552017
5,0.562881
6,0.590936
7,0.577392
8,0.569653
9,0.563849


### Create a line plot that shows the training scores across the different numbers of principal components.

In [84]:
px.area(
    x=range(1, len(scores) + 1),
    y=scores,
    labels={"x": "# Components", "y": "Explained Variance"}
)

### Based on the analysis you conducted, what is the optimal number of principal components?

The optimal n_components appears to be 7