## Dimensionality Reduction Using Feature Selection 

#### Thresholding Numerical Feature Variance

In [466]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Load the iris data set: feature matrix and target vector
iris = datasets.load_iris()
features = iris.data
targets = iris.target

# Create a selector that drops every feature whose variance is not above 0.5
thresholder = VarianceThreshold(threshold=.5)

# Fit the selector and build the reduced feature matrix
feature_mat = thresholder.fit_transform(features)

# View the per-feature variances. `variances_` is populated by the
# fit_transform call above, so there is no need to fit a second time.
print("Feature variances:", thresholder.variances_)

# Premise: features with low variance are likely less informative than
# features with high variance.
# Formula: Var(x) = (1/n) * sum((x_i - mean)^2), i.e. the mean of the
# squared deviations from the mean.
# Caveat: do not standardize the features before variance thresholding,
# because standardized features all have variance 1 and the threshold
# can no longer tell them apart.

# View the first three rows of the reduced matrix. It must be the last
# expression of the cell to be rendered as the cell output.
feature_mat[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

#### Thresholding Binary Feature Variance: select the subset of features whose Bernoulli variance is above a threshold

In [468]:
from sklearn.feature_selection import VarianceThreshold

# Binary feature matrix (one row per observation):
#   feature 0 equals 1 in 20% of the rows,
#   feature 1 equals 1 in 80% of the rows,
#   feature 2 equals 1 in 40% of the rows.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# The variance of a Bernoulli random variable is p * (1 - p).
# The cutoff used here is that variance evaluated at p = 0.75.
bernoulli_cutoff = .75 * (1 - .75)
threshold = VarianceThreshold(threshold=bernoulli_cutoff)

# Fit the selector, then keep only the columns whose variance is above
# the cutoff (the transformed matrix is the cell output)
threshold.fit(features)
threshold.transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

#### Remove highly correlated features with a correlation matrix

In [475]:
# Imported here so the cell also runs on a fresh kernel
import numpy as np
import pandas as pd

# Create a feature matrix in which columns 0 and 1 are highly correlated
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

# Convert the feature matrix into a DataFrame
df = pd.DataFrame(features)

# Absolute pairwise correlation matrix
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair of features is examined once; every other entry becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed
# from NumPy.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Labels of the columns whose correlation with an earlier column is
# above 0.95 (NaN entries compare as False)
col_indices_to_drop = [column for column in upper.columns
                       if (upper[column] > 0.95).any()]

# Drop the columns by label, so this also works when the column labels
# are not the default 0..n-1 positions
df_reduced = df.drop(columns=col_indices_to_drop)
df_reduced.head()

# Note: corr() exposes the highly correlated variables; within each
# correlated group we keep one feature and remove the others because
# the information they carry is redundant.

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1


#### Remove Irrelevant Features for Classification with the chi-square statistic and SelectKBest for categorical features

In [482]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, f_classif

# Load the iris data set: feature matrix and target vector
iris = load_iris()
features = iris.data
targets = iris.target

# The chi-square test is meant for non-negative categorical / count
# features. The iris measurements are continuous, so they are truncated
# to integers here to mimic categorical data for the demonstration.
features = features.astype(int)

# Method 1 - categorical features: keep the two features with the
# highest chi-square statistic
chi2_selector = SelectKBest(chi2, k=2)
features_chi2_kbest = chi2_selector.fit_transform(features, targets)

# Method 2 - quantitative features: keep the two features with the
# highest ANOVA F-value (f_classif)
fvalue_kbest_selector = SelectKBest(f_classif, k=2)
features_fvalue_kbest = fvalue_kbest_selector.fit_transform(features, targets)

# Method 3 - keep the top 75 percent of features ranked by ANOVA F-value.
# Each method stores its result under its own name so that no result is
# silently overwritten; `fvalue_selector` / `features_kbest` hold this
# percentile-based selection.
fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, targets)

# Show results
print("Original number of features:", features.shape[1])
print("chi2 SelectKBest(k=2) number of features:", features_chi2_kbest.shape[1])
print("f_classif SelectKBest(k=2) number of features:", features_fvalue_kbest.shape[1])
# Percentile-based selection (method 3)
print("Reduced number of features:", features_kbest.shape[1])

# Notes:
# - The goal is to remove uninformative features.
# - The chi-square statistic is computed between each feature and the
#   target vector to test whether the feature is independent of the
#   class. A feature that is independent of the class is uninformative.
# - If a feature and the target are highly dependent, the feature is
#   informative when training the model.
# - ANOVA (f_classif) compares the mean of a feature across the classes
#   and tells whether those means differ significantly.
# - SelectPercentile is an alternative to SelectKBest: it keeps a
#   percentage of the features instead of a fixed number.


Original number of features: 4
Reduced number of features: 3


#### Remove Irrelevant Features for Classification with f_classif ANOVA and SelectPercentile for quantitative features

In [None]:
# Load the iris data set: feature matrix and target vector
iris = load_iris()
features = iris.data
targets = iris.target

# No integer cast here: the ANOVA F-test is designed for quantitative
# features, so the continuous measurements are used as they are.
# Truncating them to integers would only throw information away.

# Keep the two features with the highest ANOVA F-value (f_classif)
fvalue_kbest_selector = SelectKBest(f_classif, k=2)
features_fvalue_kbest = fvalue_kbest_selector.fit_transform(features, targets)

# Keep the top 75 percent of features ranked by ANOVA F-value.
# Stored under its own name so the k-best result above is not overwritten.
fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, targets)

# Show results
print("Original number of features:", features.shape[1])
print("f_classif SelectKBest(k=2) number of features:", features_fvalue_kbest.shape[1])
# Percentile-based selection
print("Reduced number of features:", features_kbest.shape[1])


#### Recursively remove features with RFECV: recursive feature elimination with cross validation

In [485]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Silence the harmless "internal gelsd" warning emitted from scipy
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd",
)

# Build a synthetic regression problem: 10000 samples and 100 features,
# of which only 2 are informative (fixed seed for reproducibility)
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=420,
)

# Estimator used to rank the features
lin_reg = linear_model.LinearRegression()

# Recursive feature elimination with cross-validation: one feature is
# removed per step and candidates are scored by negative mean squared error
rfecv = RFECV(
    estimator=lin_reg,
    step=1,
    scoring='neg_mean_squared_error',
)

# Fit the selector, then reduce the matrix to the selected features
# (the transformed matrix is the cell output)
rfecv.fit(features, target)
rfecv.transform(features)

# Note: the model is trained repeatedly and its weakest feature is
# removed each time; cross-validation is used to decide how many
# features to keep, i.e. the point after which removing more features
# makes the model score worse.

array([[ 0.39766312,  0.35157785],
       [-1.34432116, -0.16255151],
       [ 0.11539519, -0.91775481],
       ...,
       [-0.50487478, -0.07594816],
       [ 0.12172277,  1.98293771],
       [-0.04197372,  1.16751964]])