In [None]:
# Drop data columns with missing values
# Drop data columns with low variance: after normalization, remove columns whose variance is below a defined threshold
# Drop data columns that are highly correlated with each other; only one column from each correlated set needs to be kept
# Apply a statistical technique - Principal Component Analysis (PCA)
# It transforms the original dataset into a new set of coordinates,
# keeping the highest possible variance to ensure there is no significant loss of information

### PCA
#### PCA is a linear dimensionality reduction method which uses singular value decomposition (SVD) of the data and keeps only the most significant singular vectors to project the data to a lower-dimensional space.
#### Primarily used to compress or reduce the data.
#### Tries to capture the variance, which helps it pick up interesting features.
#### PCA is used to reduce dimensionality in the dataset and to build our feature vector.
#### The principal axes in the feature space represent the directions of maximum variance in the data.
![image.png](attachment:image.png)

In [None]:
# keep only the most significant singular vectors
# sklearn.decomposition.PCA(n_components=None,copy=True,whiten=False)

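# n_components: if not set, all the components are retained
# copy: rarely changed from its default (True); with copy=False the data passed to fit is overwritten, which is seldom wanted
# whiten: can sometimes improve the predictive accuracy of downstream estimators, at the cost of discarding the components' relative variance scales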

![image.png](attachment:image.png)

In [1]:
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs

In [3]:
# Data-generation settings, kept together so they are easy to tune.
random_state = 20  # seed value intended for the synthetic-data generator
n_sample = 20      # number of samples (rows) to generate

In [5]:
# Generate a synthetic dataset with 10 features (dimensions).
# The configured seed is passed through so that re-running the notebook
# regenerates the same data; random_state=None would draw a different
# dataset on every run and make the results below unreproducible.
X, y = make_blobs(n_samples=n_sample, n_features=10, random_state=random_state)
# Goal: reduce the number of features to a lower, desired value.

In [6]:
# Show the shapes of X and y, one per line.
print(X.shape, y.shape, sep='\n')

(20, 10)
(20,)


In [7]:
# Build the PCA estimator; n_components is the number of dimensions
# the data will be reduced to.
n_components = 3
pca = PCA(n_components=n_components)

In [8]:
# Fit the PCA estimator on X (all 20 samples, 10 features).
# Left as the cell's last expression so the fitted estimator's repr is displayed.
pca.fit(X)

PCA(n_components=3)

In [9]:
# Fraction of the total variance explained by each of the selected components
# (the values are ratios such as 0.57, not percentages).
pca.explained_variance_ratio_

array([0.57117241, 0.40434131, 0.00772105])

In [13]:
# Print each principal axis (one row of pca.components_), numbered from 1.
# Iterating over components_ directly avoids a hard-coded count that would
# have to be kept in sync with the n_components value given to PCA.
for number, axis in enumerate(pca.components_, start=1):
    print('Component', number, axis)

Component 1 [-0.46905776  0.18493554  0.03993829  0.03887402 -0.03424601 -0.13928605
  0.28665518  0.5961203  -0.48492589  0.22230676]
Component 2 [ 0.18567036 -0.72377295  0.2133897  -0.34704938 -0.16549317 -0.18828205
 -0.10561441 -0.02100628 -0.4305362   0.12611735]
Component 3 [-0.16670151  0.07176912 -0.33537186 -0.31403458 -0.82416794  0.16218536
  0.16684437 -0.11159397  0.09747328 -0.02489215]


In [14]:
# Apply the dimensionality reduction: project X onto the principal components
# learned by the earlier pca.fit(X) call.
pca_reduced = pca.transform(X)

In [18]:
# Check the result: the sample count is unchanged, only the feature axis shrinks.
pca_reduced.shape # features are successfully reduced from 10 to 3

(20, 3)