# Project T Final: PCA and CCA

By Jai Bansal, Abhinav Gopal, Grace Kull, William McEachen, Shrey Vasavada

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge



Run the cell below only once per session

In [2]:
# Create a symbolic link named utils.py in the working directory, presumably so
# that the later `import utils` resolves. When utils.py is already present the
# command just reports "File exists".
# NOTE(review): the link target (./utils.py) and the link name (utils.py) refer to
# the same path -- confirm the intended source location of utils.py.
!ln -s ./utils.py utils.py

ln: utils.py: File exists


In [3]:
import utils

## Part 1: Initial Data Cleaning

In [4]:
# Load the raw baseball table; the path is relative to the working directory.
baseball = pd.read_csv("baseball_data.csv")

In [5]:
# Preview the first rows of the raw table to inspect the available columns.
baseball.head()

Unnamed: 0,id,yearid,teamid,lgid,namefirst,namelast,salary,pos,g.x,gs,...,sh,sf,gidp,years,cab,ch,chr,cr,crbi,cbb
0,abreubo01,2012.0,LAA,AL,Bobby,Abreu,9000000.0,OF,54,50,...,0,1,7,17,8347,2437,287,1441,1349,1456
1,abreuto01,,,,,,,23,21,17,...,0,1,1,4,437,110,4,40,46,16
2,ackledu01,2012.0,SEA,AL,Dustin,Ackley,2100000.0,12,153,147,...,1,1,3,2,940,228,18,123,86,99
3,adamsma01,,,,,,,1B,24,23,...,0,0,3,1,86,21,2,8,13,5
4,allenbr01,2012.0,OAK,AL,Brandon,Allen,482500.0,O1,6,5,...,0,0,0,4,344,70,12,44,41,42


In [6]:
# Initial data cleaning:
#   1. drop the identifier columns, which are not relevant enough to predict salaries
#      ('id', 'yearid', 'teamid', 'lgid', 'namefirst', 'namelast');
#   2. keep only rows with a positive salary. A missing salary (NaN) compares
#      False against 0, so rows with no salary information are dropped as well.
id_columns = ['id', 'yearid', 'teamid', 'lgid', 'namefirst', 'namelast']
data = baseball.drop(columns=id_columns)
# .copy() makes `data` an independent frame rather than a boolean-mask slice, so the
# one-hot cell below can add columns to it without raising SettingWithCopyWarning.
data = data[data['salary'] > 0].copy()

In [7]:
# (rows, columns) of the table after the cleaning step.
data.shape

(421, 34)

In [8]:
# Have students do the one-hot encoding themselves, similar to the code below, and give them detailed instructions so that they can do it.

In [9]:
# One-hot encoding for each of the different positions:
# for every distinct value in the 'pos' column, add a float column named after
# that value holding 1.0 where the row has that position and 0.0 elsewhere.
positions = np.unique(data['pos'])
temp_array = np.array(data['pos'])
for position in positions:
    # Element-wise comparison builds the whole indicator column at once
    # instead of looping over the rows in Python.
    data[position] = (temp_array == position).astype(float)

# 'pos' is now represented by the indicator columns; 'g_batting' is dropped with it.
data_one_hot = data.drop(columns = ['pos', 'g_batting'])

# Then replace all NaN values with 0.0
cleaned = data_one_hot.fillna(value=0.0)

In [10]:
# Cleaned data from one-hot encoding the positions.
# Display the first 10 rows to check the new per-position indicator columns.
cleaned.head(10)

Unnamed: 0,salary,g.x,gs,innouts,po,a,e,dp,g.y,ab,...,C1,CF,LF,O1,O2,O3,OC,OF,RF,SS
0,9000000.0,54,50,1133,70,2,1,1,100,219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2100000.0,153,147,3953,289,398,8,96,153,607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,482500.0,6,5,129,29,2,0,1,10,20,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1400000.0,149,144,3829,1269,96,12,77,155,549,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,483000.0,147,142,3680,257,410,11,83,147,576,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2200000.0,145,143,3819,73,264,27,23,149,525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,481000.0,107,64,1783,120,128,4,22,106,275,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,1300000.0,127,107,3036,195,306,13,71,127,384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,2625000.0,153,150,3999,233,414,16,91,158,629,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13,1250000.0,62,37,1146,115,2,2,0,68,158,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Part 2: Initial Linear Regression, PCA and CCA Observations

In [11]:
# Students will run PCA and CCA on the cleaned data to see which one predicts the salaries better
# They will also make plots for both PCA and CCA

# Have students implement PCA and CCA without the libraries.
# Ask which one performs better, and ask why CCA is almost as good as regression
# Also include an intro to linear regression

In [18]:
# Build the salary series via the opaque helper utils.noised_predictions and display it.
# NOTE(review): in the recorded output the first rows (9000000.0, 2100000.0, 482500.0,
# 1400000.0, 483000.0) are identical to the `salary` column of `cleaned` -- confirm
# that noised_predictions actually perturbs the salaries.
noised_salaries = utils.noised_predictions(cleaned)
noised_salaries

0       9000000.0
2       2100000.0
4        482500.0
5       1400000.0
6        483000.0
          ...    
621     6750000.0
622      481000.0
624    16174974.0
625    12000000.0
626     4687300.0
Name: salary, Length: 421, dtype: float64

## Part 3: Dirty Data

You will now run the following cell in order to generate a noised version of the above dataset. The function <code>dirty_data</code> is a black box function from the utils.py file that adds random amounts of noise to the above dataset, and will output a different result every time it's run.

In [111]:
# Generate the noised version of `cleaned` with the black-box helper utils.dirty_data.
# NOTE(review): per the text above, the result differs on every run and no seed is set
# in this notebook, so downstream numbers will not be reproducible across sessions.
dirtied_data = utils.dirty_data(cleaned)

In [112]:
# Display the noised frame; note the missing entries and the altered column labels.
dirtied_data

Unnamed: 0,'dp',3B,2B,ibb,'H','C','12',OC,chr,'RF',...,O1,'gidp',sf,crbi,3S,hr,C1,YEARS,ab,g.y
193,,1.0,,2.0,147.0,0.0,0.0,0.0,35.0,0.0,...,0.0,19.0,2.0,177.0,0.0,20.0,,4.0,501,144.0
14,2,0.0,0.0,1.0,150.0,0.0,0.0,0.0,10.0,0.0,...,0.0,6.0,5.0,50.0,0.0,,0.0,1.0,520,151.0
431,41,0.0,,2.0,147.0,0.0,0.0,,7.0,0.0,...,0.0,13.0,4.0,68.0,0.0,5.0,0.0,,475,132.0
296,7,0.0,0.0,0.0,18.0,0.0,0.0,0.0,95.0,0.0,...,0.0,4.0,0.0,398.0,0.0,4.0,0.0,10.0,87,38.0
56,0,,0.0,1.0,171.0,,0.0,0.0,22.0,0.0,...,0.0,2.0,4.0,215.0,0.0,9.0,0.0,7.0,624,155.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,0,0.0,0.0,5.0,142.0,0.0,0.0,0.0,23.0,0.0,...,0.0,9.0,3.0,82.0,0.0,23.0,0.0,1.0,487,129.0
60,11,,0.0,0.0,2.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,67.0,0.0,0.0,0.0,5.0,21,16.0
322,3,0.0,0.0,7.0,128.0,0.0,0.0,0.0,134.0,0.0,...,0.0,11.0,6.0,519.0,0.0,30.0,0.0,8.0,506,141.0
539,0,0.0,0.0,,92.0,0.0,0.0,0.0,65.0,0.0,...,0.0,4.0,3.0,233.0,0.0,14.0,0.0,6.0,383,125.0


Now run PCA on your noised dataset (`dirtied_data`), using 2, 5, and 8 components, and find the MSE for your predictions. How does the performance compare to PCA of the original cleaned dataset?

In [101]:
# YOUR CODE HERE #

# Solution #
# Fit a 2-component PCA on the noised dataset produced by utils.dirty_data.
# The frame is named `dirtied_data` in this notebook (`noised_data` is never
# defined, so the original call fails on a fresh kernel). PCA cannot handle
# missing values, so the NaNs are replaced with 0.0 before fitting.
pca2 = PCA(n_components=2)
pca2.fit(dirtied_data.fillna(value=0.0))
# End Solution #

PCA(n_components=2)