## Dimensionality Reduction Using Feature Extraction 

#### Reducing Features Using Principal Components Analysis

In [440]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets

# Load the handwritten digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# Standardize every column before running PCA.
digits_scaler = StandardScaler()
features = digits_scaler.fit_transform(digits.data)

# Configure PCA:
#   n_components=0.95  a float asks for as many components as are needed to
#                      retain that share of the variance (0.95 to 0.99 is
#                      the usual range, i.e. 95% to 99% of variance kept)
#   whiten=True        rescale each retained component to unit variance
# Side note: svd_solver='randomized' selects a stochastic algorithm
# (not used here).
pca = PCA(whiten=True, n_components=0.95)

# Fit the decomposition and project the data in one step.
features_pca = pca.fit_transform(features)

# Compare the number of features before and after the reduction.
print("original features numb:", features.shape[1])
print('pca on df', features_pca.shape[1])

original features numb: 64
pca on df 40


#### Reducing Features When Data Is Linearly Inseparable Using Kernel PCA

In [448]:
from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_circles

# Generate two noisy concentric circles: data that is not linearly separable.
features, _ = make_circles(
    n_samples=1000,
    noise=0.1,
    factor=0.1,
    random_state=420,
)

# Configure kernel PCA:
#   kernel='rbf'     Gaussian radial basis function kernel
#                    (kernel='linear' would behave like regular PCA)
#   gamma=15         hyperparameter of the kernel
#   n_components=1   number of components to keep
kernel_pca = KernelPCA(n_components=1, kernel='rbf', gamma=15)

# Fit the kernel PCA and project the data onto the single component.
features_pca = kernel_pca.fit_transform(features)

# Show the shape before and after the reduction.
print('original data shape:', features.shape)
print('kernel pca :', features_pca.shape)

# Reminder: PCA looks for the directions that maximize variance.

original data shape: (1000, 2)
kernel pca : (1000, 1)


#### Reducing Features by Maximizing Class Separability with LDA

In [451]:
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load the iris dataset and split it into feature matrix and class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Keep a single discriminant component.
lda = LinearDiscriminantAnalysis(n_components=1)

# LDA is supervised: fitting needs the class labels as well as the features.
features_lda = lda.fit_transform(features, target)

# Show the shape before and after the reduction.
print('original df : ', features.shape)
print('features lda: ', features_lda.shape)

# Note: LDA looks for axes that push the class means apart while keeping
# the variance inside each class small, which gives better separation.
# That focus on class means is the goal it has on top of PCA's.

# Amount of variance explained by each component (left as the last
# expression so that a notebook displays it).
lda.explained_variance_ratio_


original df :  (150, 4)
features lda:  (150, 1)


array([0.9912126])

#### Reducing Features with Non-Negative Matrix Factorization

In [452]:
from sklearn.decomposition import NMF
from sklearn import datasets

# Load the digits dataset; the raw values are used without standardizing
# because NMF only works on non-negative input.
digits = datasets.load_digits()
features = digits.data

# NMF is an unsupervised, linear dimensionality reduction technique that
# breaks the feature matrix up into latent variables.
# Here it is asked for 10 components, with a fixed seed.
nmf = NMF(random_state=420, n_components=10)

# Fit the factorization and get the reduced representation.
features_nmf = nmf.fit_transform(features)

# Compare the number of features before and after the reduction.
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_nmf.shape[1])

Original number of features: 64
Reduced number of features: 10


#### Reducing Features on Sparse Data Using TSVD (only non-zero elements are stored)

In [464]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn import datasets
import numpy as np

# Load the handwritten digits dataset.
digits = datasets.load_digits()

# Standardize the features.
features_standardized = StandardScaler().fit_transform(digits.data)

# Store the standardized matrix in compressed sparse row (CSR) format.
sparse_mat = csr_matrix(features_standardized)

# Truncated SVD keeping 10 components; it is applied to the sparse matrix.
# NOTE(review): no random_state is passed here, so the result may presumably
# vary slightly between runs if the default solver is randomized - confirm.
tsvd = TruncatedSVD(n_components=10)

# Fit on the sparse matrix and project it onto the 10 components.
features_sparse_tsvd = tsvd.fit_transform(sparse_mat)

# Compare feature counts at each stage.
print("features standardized:", features_standardized.shape[1])
print("sparse matrix features:", sparse_mat.shape[1])
print("Reduced number of features:", features_sparse_tsvd.shape[1])

# Share of the variance kept by the retained components; the remainder is
# the information lost by the reduction.
tsvd.explained_variance_ratio_.sum()

features standardized: 64
sparse matrix features: 64
Reduced number of features: 10


0.5887245646379315