# PCA-1 - Intro

Dimension reduction of data

## Step 1: Load the Data

In [1]:
import os
import urllib.request

data_url = 'https://elephantscale-public.s3.amazonaws.com/data/cars/mtcars_header.csv'
data_location = "../data/cars/mtcars_header.csv"

# Lookup order: the shared ../data copy first, then a copy sitting next to the
# notebook; only when neither exists is the file fetched from data_url.
shared_copy_found = os.path.exists(data_location)
if not shared_copy_found:
    # Fall back to just the file name, i.e. the current working directory.
    data_location = os.path.basename(data_location)
    local_copy_found = os.path.exists(data_location)
    if not local_copy_found:
        print("Downloading : ", data_url)
        urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

Downloading :  https://elephantscale-public.s3.amazonaws.com/data/cars/mtcars_header.csv
data_location: mtcars_header.csv


In [2]:
import pandas as pd
# Render floats in every table below with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# Read the CSV located (or downloaded) by the previous cell.
dataset = pd.read_csv(data_location)
print ('dataset.shape : ', dataset.shape)

# Preview 10 random rows. The fixed random_state makes the preview identical on
# every "Restart Kernel -> Run All" instead of changing on each run.
dataset.sample(10, random_state=42)

dataset.shape :  (32, 12)


Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
14,Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,0,3,4
22,AMC Javelin,15.2,8,304.0,150,3.15,3.44,17.3,0,0,3,2
10,Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.88,17.02,0,1,4,4
15,Lincoln Continental,10.4,8,460.0,215,3.0,5.42,17.82,0,0,3,4
11,Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
23,Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,0,3,4
18,Honda Civic,30.4,4,75.7,52,4.93,1.61,18.52,1,1,4,2
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.21,19.44,1,0,3,1


## Step 2 - Explore Data

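As we can see, this data has 12 columns: the car `model` name plus 11 numeric features.  We cannot plot 11 dimensions directly, so we will reduce them to a few principal components (3 in Step 4) that we can plot.  The `model` column is a text label rather than a feature, so we drop it first.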

In [3]:
# Keep only the numeric feature columns: 'model' is a text label and cannot be
# scaled or fed to PCA.
dataset2 = dataset.drop(columns=['model'])
dataset2

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.88,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.21,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


## Step 3 - Scale Data

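We need to scale the data before PCA: PCA looks for the directions of maximum variance, so features measured on larger scales (e.g. `disp`, `hp`) would otherwise dominate the components.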

There are 2 options
1. Using Standard scaler
2. Using built-in pandas functions

In [12]:
## Option 1 - scikit-learn StandardScaler

from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation from the feature table ...
scaler = StandardScaler()
scaler.fit(dataset2)

# ... then standardise the same table with those statistics.
scaled = scaler.transform(dataset2)

# The scaler returns a plain array, so re-attach the original column names.
scaled_data = pd.DataFrame(data=scaled, columns=dataset2.columns)
scaled_data.head(10)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,0.15,-0.11,-0.58,-0.54,0.58,-0.62,-0.79,-0.88,1.21,0.43,0.75
1,0.15,-0.11,-0.58,-0.54,0.58,-0.36,-0.47,-0.88,1.21,0.43,0.75
2,0.46,-1.24,-1.01,-0.8,0.48,-0.93,0.43,1.13,1.21,0.43,-1.14
3,0.22,-0.11,0.22,-0.54,-0.98,-0.0,0.9,1.13,-0.83,-0.95,-1.14
4,-0.23,1.03,1.06,0.42,-0.85,0.23,-0.47,-0.88,-0.83,-0.95,-0.51
5,-0.34,-0.11,-0.05,-0.62,-1.59,0.25,1.35,1.13,-0.83,-0.95,-1.14
6,-0.98,1.03,1.06,1.46,-0.73,0.37,-1.14,-0.88,-0.83,-0.95,0.75
7,0.73,-1.24,-0.69,-1.25,0.18,-0.03,1.22,1.13,-0.83,0.43,-0.51
8,0.46,-1.24,-0.74,-0.77,0.61,-0.07,2.87,1.13,-0.83,0.43,-0.51
9,-0.15,-0.11,-0.52,-0.35,0.61,0.23,0.26,1.13,-0.83,0.43,0.75


In [13]:
## Option 2 
# Plain pandas arithmetic, left commented out; Option 1 above is the one in use.
# NOTE: DataFrame.std() defaults to the sample standard deviation (ddof=1),
# whereas StandardScaler divides by the population standard deviation (ddof=0).
# The result therefore differs slightly from Option 1 unless
# dataset2.std(ddof=0) is used.

#scaled_data = (dataset2 - dataset2.mean()) / dataset2.std()
#scaled_data.head(10)

## Step 4 - PCA

In [19]:
from sklearn.decomposition import PCA

# Number of principal components to keep. Three are kept so the result can be
# drawn as a 3D scatter plot in Step 5.
n_components = 3

# Fit PCA on the scaled features and project every row onto the components.
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(scaled_data)

# Derive the column names (pc1 .. pcN) from n_components so they cannot drift
# out of sync with the number of components actually computed.
pc_columns = [f'pc{i}' for i in range(1, n_components + 1)]
pca_df = pd.DataFrame(pca_data, columns=pc_columns)
pca_df.head(10)

Unnamed: 0,pc1,pc2,pc3
0,0.66,1.74,-0.6
1,0.63,1.55,-0.38
2,2.78,-0.15,-0.24
3,0.31,-2.36,-0.14
4,-1.97,-0.75,-1.13
5,0.06,-2.79,0.16
6,-3.0,0.33,-0.36
7,2.06,-1.47,0.94
8,2.29,-1.98,1.8
9,0.53,-0.16,1.49


## Step 5 - Plot

Now let's plot the reduced data

In [29]:
import matplotlib.pyplot as plt
import plotly.express as px

# plt.scatter only draws 2D points (its third positional argument is the marker
# size, not a z coordinate), so the three components are shown with plotly's
# interactive 3D scatter instead.
fig = px.scatter_3d(
    pca_df,
    x='pc1',
    y='pc2',
    z='pc3',
    # Keys must be column names of pca_df for the axis labels to be applied.
    labels={'pc1': 'PC 1', 'pc2': 'PC 2', 'pc3': 'PC 3'},
    title='mtcars projected onto the first three principal components',
)
fig.show()

## Step 6 - Compare Correlation Matrices

Now let's compare correlation matrix of original data and PCA data.

TODO: Can you explain the difference?

In [23]:
# Pairwise (Pearson) correlation between the scaled original features
scaled_data.corr(method='pearson')

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
mpg,1.0,-0.85,-0.85,-0.78,0.68,-0.87,0.42,0.66,0.6,0.48,-0.55
cyl,-0.85,1.0,0.9,0.83,-0.7,0.78,-0.59,-0.81,-0.52,-0.49,0.53
disp,-0.85,0.9,1.0,0.79,-0.71,0.89,-0.43,-0.71,-0.59,-0.56,0.39
hp,-0.78,0.83,0.79,1.0,-0.45,0.66,-0.71,-0.72,-0.24,-0.13,0.75
drat,0.68,-0.7,-0.71,-0.45,1.0,-0.71,0.09,0.44,0.71,0.7,-0.09
wt,-0.87,0.78,0.89,0.66,-0.71,1.0,-0.17,-0.55,-0.69,-0.58,0.43
qsec,0.42,-0.59,-0.43,-0.71,0.09,-0.17,1.0,0.74,-0.23,-0.21,-0.66
vs,0.66,-0.81,-0.71,-0.72,0.44,-0.55,0.74,1.0,0.17,0.21,-0.57
am,0.6,-0.52,-0.59,-0.24,0.71,-0.69,-0.23,0.17,1.0,0.79,0.06
gear,0.48,-0.49,-0.56,-0.13,0.7,-0.58,-0.21,0.21,0.79,1.0,0.27


In [24]:
# Pairwise (Pearson) correlation between the principal-component columns
pca_df.corr(method='pearson')

Unnamed: 0,pc1,pc2,pc3
pc1,1.0,0.0,-0.0
pc2,0.0,1.0,-0.0
pc3,-0.0,-0.0,1.0


## Step 7 - Understanding PCA

[SKLearn PCA reference](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html?highlight=pca#sklearn.decomposition.PCA)

* `pca.components_`  : Principal axes in feature space, representing the directions of maximum variance in the data. The components are sorted by explained_variance_
* `pca.explained_variance_ratio_` : Fraction of the total variance explained by each of the selected components (a value between 0 and 1; multiply by 100 for a percentage).


In [25]:
# (label, value) pairs describing the fitted PCA, printed in order. print() is
# called with two arguments so the default single-space separator is kept.
pca_summary = [
    ('\n number of columns in original data : ', scaled_data.shape[1]),
    ('\n each pca component length : ', len(pca.components_[0])),
    ('\n pca.components_ :\n', pca.components_),
    ('\n pca.explained_variance_ratio_ : ', pca.explained_variance_ratio_),
]
for label, value in pca_summary:
    print(label, value)


 number of columns in original data :  11

 each pca component length :  11

 pca.components_ :
 [[ 0.3625305  -0.37391603 -0.3681852  -0.33005692  0.29415138 -0.34610332
   0.20045635  0.30651132  0.23494289  0.20691624 -0.21401766]
 [ 0.0161244   0.04374371 -0.04932413  0.24878402  0.27469408 -0.14303825
  -0.46337482 -0.23164699  0.42941765  0.46234863  0.41357106]
 [-0.22574419 -0.17531118 -0.06148414  0.14001476  0.16118879  0.34181851
   0.40316904  0.42881517 -0.20576657  0.28977993  0.52854459]]

 pca.explained_variance_ratio_ :  [0.60076366 0.24095163 0.05701793]


## Step 8 - Find 3 Principal Components 

In [11]:
## TODO - your code goes here
## Adjust your code from step-4
# NOTE(review): the Step 4 cell in this notebook already sets n_components = 3 -
# confirm whether this exercise is still meant to be left open.