In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.decomposition import PCA
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

<h2>Random Data with PCA</h2>
<h4>Demonstrate how PCA works with a completely random dataset</h4>
<br>
Features: 10 random columns<br>
Output: PCA Components<br>

Objective: <br><blockquote>There should not be much reduction in dimensions when using all random columns</blockquote>

In [3]:
# Fix the seed of NumPy's global generator so the same sample is drawn on every run
np.random.seed(5)
# 1000 rows x 10 columns of uniform values in [0, 1)
random_data = np.random.random_sample((1000, 10))

In [4]:
random_data.shape

(1000, 10)

In [5]:
# No column names are supplied, so the columns get the default integer labels 0-9
df = pd.DataFrame(random_data)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.221993,0.870732,0.206719,0.918611,0.488411,0.611744,0.765908,0.518418,0.296801,0.187721
1,0.080741,0.73844,0.441309,0.15831,0.879937,0.274086,0.414235,0.29608,0.628788,0.579838
2,0.599929,0.265819,0.284686,0.253588,0.327564,0.144164,0.165613,0.963931,0.960227,0.188415
3,0.024307,0.204556,0.699844,0.779515,0.022933,0.577663,0.001642,0.515473,0.639795,0.985624
4,0.259098,0.802497,0.870483,0.92275,0.002214,0.469488,0.981469,0.398945,0.813732,0.546456


In [7]:
# Pairwise correlation between columns; off-diagonal values near 0 mean there is
# no linear relationship between features for PCA to exploit
df.corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,-0.010171,-0.068103,-0.015572,-0.016316,-0.029882,0.009654,-0.01637,-0.007551,0.045091
1,-0.010171,1.0,-0.007099,-0.005478,0.031386,-0.035002,0.036971,0.009322,0.05114,-0.009717
2,-0.068103,-0.007099,1.0,0.047277,-0.006887,0.038011,-0.008302,-0.029974,0.014149,-0.066695
3,-0.015572,-0.005478,0.047277,1.0,0.004733,0.012298,0.023168,0.033222,-0.014268,-0.049929
4,-0.016316,0.031386,-0.006887,0.004733,1.0,0.020822,0.030233,-0.024307,0.011156,0.033403
5,-0.029882,-0.035002,0.038011,0.012298,0.020822,1.0,0.019708,-0.039218,0.06484,-0.044626
6,0.009654,0.036971,-0.008302,0.023168,0.030233,0.019708,1.0,-0.01595,0.004169,-0.023448
7,-0.01637,0.009322,-0.029974,0.033222,-0.024307,-0.039218,-0.01595,1.0,0.017539,0.03932
8,-0.007551,0.05114,0.014149,-0.014268,0.011156,0.06484,0.004169,0.017539,1.0,0.016253
9,0.045091,-0.009717,-0.066695,-0.049929,0.033403,-0.044626,-0.023448,0.03932,0.016253,1.0


In [8]:
df.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8    float64
9    float64
dtype: object

In [None]:
# Test PCA

In [14]:
# Two ways to configure PCA through n_components:
#   - an integer: the number of components wanted in the final output
#   - a float between 0 and 1: the fraction of the total variance that must be captured

pca = PCA(n_components=0.9) # fraction of variance to capture (0.9 = 90%)
#pca = PCA(n_components=2) # alternative: fixed number of components

In [15]:
# Fit on the raw columns; the imported StandardScaler is not applied here.
# NOTE(review): presumably acceptable because every column is drawn from the same [0, 1) range - confirm
pca.fit(df)

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [16]:
# number of components PCA came up with
pca.n_components_

9

In [17]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.221993,0.870732,0.206719,0.918611,0.488411,0.611744,0.765908,0.518418,0.296801,0.187721
1,0.080741,0.73844,0.441309,0.15831,0.879937,0.274086,0.414235,0.29608,0.628788,0.579838
2,0.599929,0.265819,0.284686,0.253588,0.327564,0.144164,0.165613,0.963931,0.960227,0.188415
3,0.024307,0.204556,0.699844,0.779515,0.022933,0.577663,0.001642,0.515473,0.639795,0.985624
4,0.259098,0.802497,0.870483,0.92275,0.002214,0.469488,0.981469,0.398945,0.813732,0.546456


In [18]:
# Project the rows onto the fitted components; the result is only displayed here, not stored
pca.transform(df)

array([[ 0.28785518, -0.08044738,  0.24957468, ...,  0.31448073,
         0.28302158, -0.32265232],
       [-0.00863067, -0.36817681,  0.04840754, ...,  0.08616637,
         0.0359108 ,  0.01067538],
       [-0.26406733,  0.10765481,  0.32950411, ...,  0.05930589,
         0.27002476,  0.43645724],
       ...,
       [ 0.21391975,  0.21794533, -0.57616844, ..., -0.1608504 ,
        -0.25210373,  0.30723125],
       [ 0.0187493 ,  0.52070042, -0.01391376, ..., -0.13804123,
         0.31326016,  0.06086482],
       [-0.00091793,  0.19966785, -0.16966791, ..., -0.25599951,
        -0.2042108 ,  0.01747299]])

In [19]:
def transform_with_pca(pca, df, columns):
    """Replace the given columns of ``df`` with their PCA projection, in place.

    Parameters
    ----------
    pca : fitted transformer
        Object exposing ``transform(X)`` and ``n_components_`` (for example a
        fitted ``sklearn.decomposition.PCA``).
    df : pandas.DataFrame
        Frame to modify. It is mutated in place: ``columns`` are removed and
        the component columns are appended after any remaining columns.
    columns : list-like of column labels
        Columns of ``df`` that are fed to ``pca.transform`` and then dropped.

    Returns
    -------
    list of str
        Names of the added columns: ``component_0`` ... ``component_{k-1}``
        where ``k`` is ``pca.n_components_``.
    """
    # Snapshot the labels so the same fixed list is used for both the
    # selection and the drop, whatever list-like the caller passed in.
    source_columns = list(columns)

    # np.asarray keeps the values positional even if the transformer returns
    # something other than a plain ndarray.
    transformed_data = np.asarray(pca.transform(df[source_columns]))

    tcols = ['component_' + str(i) for i in range(pca.n_components_)]
    print('components:', tcols)

    # Reuse df's own index: column assignment aligns on index labels, so a
    # fresh 0..n-1 index would turn the components into NaN for any frame
    # whose index is not the default RangeIndex.
    df_transformed = pd.DataFrame(transformed_data, columns=tcols, index=df.index)

    # Drop the inputs before adding the outputs so that an input column that
    # happens to be named 'component_<i>' cannot delete a freshly added one.
    df.drop(source_columns, inplace=True, axis=1)

    for col in tcols:
        df[col] = df_transformed[col]

    return tcols

In [20]:
# Replaces the original columns of df with the PCA components, in place.
# NOTE: not re-runnable as is - afterwards df no longer holds the columns the PCA
# was fitted on, so rebuild df from random_data before running this cell again.
transform_with_pca(pca,df, df.columns)

components: ['component_0', 'component_1', 'component_2', 'component_3', 'component_4', 'component_5', 'component_6', 'component_7', 'component_8']


['component_0',
 'component_1',
 'component_2',
 'component_3',
 'component_4',
 'component_5',
 'component_6',
 'component_7',
 'component_8']

In [21]:
df.head()

Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
0,0.287855,-0.080447,0.249575,-0.48268,0.089045,0.142066,0.314481,0.283022,-0.322652
1,-0.008631,-0.368177,0.048408,-0.029248,-0.483864,-0.443418,0.086166,0.035911,0.010675
2,-0.264067,0.107655,0.329504,0.43355,0.299702,-0.107044,0.059306,0.270025,0.436457
3,0.117525,0.374161,0.303687,0.720273,-0.184429,-0.011993,-0.136169,-0.221403,-0.56113
4,0.464811,-0.151338,0.343986,-0.134883,0.43345,0.000799,-0.021157,-0.568048,-0.407248


## Summary

1.	With random datasets, not much reduction is possible
2.	We asked PCA to capture 90% of the variance, and it still came up with 9 components out of the original 10
3.	In the next demo, let’s look at a dataset that has related features.
