In [15]:
from warnings import filterwarnings

***10.Dimensionality Reduction Using Features Selection***

In [16]:
#10.1 Thresholding Numerical Features Variance
from sklearn.feature_selection import VarianceThreshold
from sklearn import datasets
iris=datasets.load_iris()
features=iris.data
target=iris.target
thresholder=VarianceThreshold(threshold=.5)
features_high_variance=thresholder.fit_transform(features)
features_high_variance[0:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [17]:
thresholder.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [18]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
features_scaled=scaler.fit_transform(features)  
selector=VarianceThreshold(threshold=0.5)
selector.fit(features_scaled).variances_

array([1., 1., 1., 1.])

In [19]:
#10.2 Thresholding Binary Features  Variance
#Bernoulli random variable variance = p(1-p)
features=[[0,1,0],[0,1,1],[0,1,0],[0,1,1],[1,0,0]]
thresholder=VarianceThreshold(threshold=(.75*(1-.75)))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [20]:
#10.3 Handling Highly Correlated Features
import numpy as np
import pandas as pd
features=np.array([[2.5,2.4],[0.5,0.7],[2.2,2.9],[1.9,2.2],[3.1,3.0],[2.3,2.7],[2,1.6],[1,1.1],[1.5,1.6],[1.1,0.9]])
df=pd.DataFrame(features)
corr_matrix=df.corr().abs()
upper_tri=corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool_))
to_drop=[column for column in upper_tri.columns if any(upper_tri[column]>0.95)]
df.drop(df.columns[to_drop],axis=1).head(3)

Unnamed: 0,0,1
0,2.5,2.4
1,0.5,0.7
2,2.2,2.9


In [21]:
df.corr()

Unnamed: 0,0,1
0,1.0,0.925929
1,0.925929,1.0


In [22]:
upper_tri

Unnamed: 0,0,1
0,,0.925929
1,,


In [23]:
#10.4 Removing Irrelevant Features for Classification
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
data=load_breast_cancer()
features=data.data
target=data.target
features=features.astype(int)
chi2_selector=SelectKBest(chi2,k=2)
features_kbeast_chi2=chi2_selector.fit_transform(features,target)
features_kbeast_chi2[0:3]
    

array([[1001, 2019],
       [1326, 1956],
       [1203, 1709]])

In [27]:
fvalue_selector=SelectKBest(f_classif,k=2)
features_kbeast_fvalue=fvalue_selector.fit_transform(features,target)
features_kbeast_fvalue[1]



  f = msb / msw


array([ 24, 158])

In [26]:
from sklearn.feature_selection import SelectPercentile
fvalue_selector=SelectPercentile(f_classif,percentile=10)
features_kbeast_fvalue=fvalue_selector.fit_transform(features,target)
features_kbeast_fvalue.shape[1]

  f = msb / msw


3

In [28]:
#10.5 Recursive Feature Elimination
import warnings 
from sklearn.datasets import make_regression 
from sklearn.feature_selection import RFECV
from  sklearn import datasets,linear_model
warnings.filterwarnings(action='ignore',module='scipy',message='^internal gelsd')
features,target=make_regression(n_samples=10000,n_features=100,n_informative=2,random_state=1)
ols=linear_model.LinearRegression()
rfecv=RFECV(estimator=ols,step=1,cv=5,scoring='neg_mean_squared_error')
rfecv.fit(features,target)
rfecv.transform(features)


array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]])

In [29]:
rfecv.n_features_

np.int64(2)

In [30]:
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [31]:
rfecv.ranking_

array([94, 78, 93, 92, 82,  1, 56, 18, 12, 19, 80, 76,  2, 77, 42, 70, 16,
       74, 60, 87, 89, 27, 38, 44,  9, 83, 75,  8, 48, 30, 13, 99,  3, 53,
       64, 29, 52, 79, 23,  1, 84, 31, 81, 85, 54, 49, 28, 51, 63, 34, 45,
       73, 39, 17, 95, 21, 58, 47, 96, 43, 26, 71, 88, 91, 68, 59, 15, 50,
       90, 46,  4, 86, 97, 35, 40, 11, 67, 36, 10, 72, 55, 22, 66, 62, 61,
        6,  5, 24, 33, 57, 65, 20, 69, 37, 98,  7, 41, 32, 25, 14])