In [1]:
import numpy as np
import os
import datetime
import pandas as pd
import random
# from tqdm import tqdm
from matplotlib import pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn import preprocessing
from sklearn import datasets

In [2]:
from sklearn import decomposition
from sklearn import discriminant_analysis
from scipy import sparse

### Dimensionality Reduction
- PCA: unsupervised, attempts to retain most variance, direction with the most variance as the first principal componenet, second most the second principal, whiten parameter transforms the values of each prinicpal component so that they have zero meena and unit variance
- http://www.math.union.edu/~jaureguj/PCA.pdf
- https://www.coursera.org/lecture/machine-learning/choosing-the-number-of-principal-components-S1bq1

In [10]:
# Load data
digits = datasets.load_digits()
len(digits), digits['data'][0].shape

(5, (64,))

In [13]:
digits['data'][0][:10]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0.])

In [14]:
# Standardize
features = preprocessing.StandardScaler().fit_transform(digits.data)
features[0][:10]

array([ 0.        , -0.33501649, -0.04308102,  0.27407152, -0.66447751,
       -0.84412939, -0.40972392, -0.12502292, -0.05907756, -0.62400926])

In [15]:
pca = decomposition.PCA(n_components=0.99, whiten=True)
features_pca = pca.fit_transform(features)

In [16]:
print("Original Features: {0}".format(features.shape))
print("PCA Features: {0}".format(features_pca.shape))

Original Features: (1797, 64)
PCA Features: (1797, 54)


### Kernel trick

In [17]:
# Creeate linearly inseparable data, make_circles: one class is urrounded on all sides by the other
nFeatures, _ = datasets.make_circles(n_samples=1000, random_state=1, noise=0.1, factor=0.1)
# Apply Kernel PCA with radius basis function kernel (RBF)
kpca = decomposition.KernelPCA(kernel='rbf', gamma=15, n_components=1)
features_kpca = kpca.fit_transform(nFeatures)

In [18]:
print("Original Features: {0}".format(nFeatures.shape))
print("PCA Features: {0}".format(features_kpca.shape))

Original Features: (1000, 2)
PCA Features: (1000, 1)


### Reduce Features and Maximize Class Spearability for Classification

In [19]:
# Load Data Set
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [21]:
# LDA Linear Discriminant Analysis to transform the ffeatures
lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)
print("Original Features: {0}".format(features.shape))
print("LDA Features: {0}".format(features_lda.shape))
print("Explained variance kept: {0}".format(lda.explained_variance_ratio_))

Original Features: (150, 4)
LDA Features: (150, 1)
Explained variance kept: [0.9912126]


In [22]:
# Run for all options to determine how many features needed to retain a certain variance by setting n_components to None
lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=None)
features_lda = lda.fit(features, target)
# Get variances
lda_var_ratios = lda.explained_variance_ratio_
lda_var_ratios

array([0.9912126, 0.0087874])

### Matrix Factorization
- feature matrix of non-negative values
- Unsupervised linear dimensionality reduction that factorizes, breaks up into multiple matrices whose product approximates the original
- V = WH : V is our feature matrix of (features x observations), W is (features x r) and H is an (r x n)  and adjust r to set the amount of dimensionality reduction desired
- No explained variance provided

In [23]:
digits = datasets.load_digits()
features = digits.data
features.shape

(1797, 64)

In [24]:
# Create, fit and apply NMF
nmf = decomposition.NMF(n_components=10, random_state=1)
features_nmf = nmf.fit_transform(features)
print("Original Features: {0}".format(features.shape))
print("NMF Features: {0}".format(features_nmf.shape))

Original Features: (1797, 64)
NMF Features: (1797, 10)


### Sparse Data Feature Reduction
- truncated Singular Value Decomposition
- provides variance retained

In [25]:
digits = datasets.load_digits()
features = preprocessing.StandardScaler().fit_transform(digits.data)
features.shape

(1797, 64)

In [27]:
# Sparse matrix
feature_sparse = sparse.csr_matrix(features)
# Create a TSVD
tsvd = decomposition.TruncatedSVD(n_components=10)
features_tsvd = tsvd.fit(feature_sparse).transform(feature_sparse)

In [28]:
print("Original Features: {0}".format(feature_sparse.shape))
print("TSVD Features: {0}".format(features_tsvd.shape))

Original Features: (1797, 64)
TSVD Features: (1797, 10)


In [29]:
tot_var = 0
for i, ev in enumerate(tsvd.explained_variance_ratio_):
    tot_var += ev
    print("Comp {0}: {1}".format(i, tot_var))

Comp 0: 0.12033916097680719
Comp 1: 0.21594970477833478
Comp 2: 0.30039385369524324
Comp 3: 0.3653779284288926
Comp 4: 0.4139794572255754
Comp 5: 0.4561205799248659
Comp 6: 0.4955413762181741
Comp 7: 0.5294342908372134
Comp 8: 0.5594131619738609
Comp 9: 0.5887330248066919


### Feature Selection
- filter: select best using satistical properties
- wrapper: tiral and error to find subset of features that produce models with highest quality predictions
- embedded: select best feature as part of algorithms learning/training process

In [30]:
from sklearn import feature_selection
from sklearn import linear_model

In [32]:
# keep only features that have a given variance
iris = datasets.load_iris()
features = iris.data
target = iris.target
# Create Threshold
thresh = feature_selection.VarianceThreshold(threshold=0.5)
features[0:3]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [34]:
# Create high variance feature matrix
feature_hv = thresh.fit_transform(features)
# view high variance feature matrix
feature_hv[0:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

- doesn't work when feature sets contain ddifferent units
- variance threshold is manual
- if features have been standardized then will not work correctly
- Categorical: remove those that are predominately 1 class

In [35]:
thresh.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [36]:
# Create data set: feature 0: 80% c0, feature 1: 80% c1, feature 2: 60 c0
my_fea = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0]]
thresholder = feature_selection.VarianceThreshold(threshold=(0.75*0.25))
thresholder.fit_transform(my_fea)

array([[0],
       [1],
       [0],
       [1],
       [0]])

### Highly Correlated Columns

In [38]:
df = pd.DataFrame(features)
df.head(3)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [40]:
corr_mat = df.corr().abs()
corr_mat

Unnamed: 0,0,1,2,3
0,1.0,0.11757,0.871754,0.817941
1,0.11757,1.0,0.42844,0.366126
2,0.871754,0.42844,1.0,0.962865
3,0.817941,0.366126,0.962865,1.0


In [43]:
upper = corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(np.bool))
# find columns to ddrop
[c for c in upper.columns if any(upper[c] > 0.95)]

[3]

In [44]:
df.drop([c for c in upper.columns if any(upper[c] > 0.95)], axis=1).head(3)

Unnamed: 0,0,1,2
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3


### Remove Irrelevant Features for Classification
- Calculate a chi-square statistic between each feature and the target: difference between the observed number of observations in each class and what we woul expect if the feature was independent
- quantitative features, compute the ANOVA F-value betwewen each featuer and target vector: are the means statistically different

In [54]:
# Load Data
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [55]:
# Convert to categorical
features = features.astype(int)

In [56]:
# select two with highest chi-square statistic
chi2_selector = feature_selection.SelectKBest(feature_selection.chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

In [58]:
print("Original Features: {0}".format(features.shape))
print("Chi Features: {0}".format(features_kbest.shape))

Original Features: (150, 4)
Chi Features: (150, 2)


In [59]:
# select 2 features with highest f-value
fvalue_selector = feature_selection.SelectKBest(feature_selection.f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

In [60]:
print("Original Features: {0}".format(features.shape))
print("Chi Features: {0}".format(features_kbest.shape))

Original Features: (150, 4)
Chi Features: (150, 2)


In [61]:
# select percentile of features with highest f-value
fvalue_selector = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

In [62]:
print("Original Features: {0}".format(features.shape))
print("Chi Features: {0}".format(features_kbest.shape))

Original Features: (150, 4)
Chi Features: (150, 3)


### Recursive Feature Elimination w/ Cross Validation
repeatedly train a model removing a featuer until model performance becomes worse

In [63]:
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [64]:
# Data Generation
features, target = datasets.make_regression(n_samples=1000, n_features=100, n_informative=3, random_state=1)
# model
ols = linear_model.LinearRegression()

In [65]:
# Recurrsion
rfecv = feature_selection.RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
rfecv.transform(features)



array([[-0.71242059, -1.42855575,  0.20271102],
       [ 0.63015866, -0.6579047 , -0.78209345],
       [-1.33953296,  0.86635759, -2.02573104],
       ...,
       [ 2.26534949,  0.15301017,  1.32665135],
       [-0.48969413,  1.95787748, -0.00832094],
       [-1.06119079,  0.11066107, -1.07171859]])

In [66]:
print("Number of best features: {0}".format(rfecv.n_features_))

Number of best features: 3


In [67]:
rfecv.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [68]:
rfecv.ranking_

array([76, 13, 61, 20, 83, 74, 58, 52, 62, 97, 80, 81, 59, 30, 72, 60, 98,
       35,  9, 27,  7, 92, 78, 34, 19, 70, 18, 43, 77, 53, 84, 47,  1, 89,
       38, 95, 10, 55, 63,  4, 16, 69,  6,  5,  3, 67, 85, 42, 65, 93, 82,
       26, 94, 31, 54, 75, 36, 64, 96, 17, 22, 33, 91, 39, 37, 46, 66, 40,
       45, 86, 51,  1, 25, 79, 57,  8, 88, 87, 24,  2, 90, 15, 50, 28, 14,
        1, 49, 71, 11, 44, 29, 68, 32, 41, 21, 56, 73, 23, 12, 48])