# Week 3 Lab: Feature Selection and Resampling

## 1. Univariate Selection with `SelectKBest`

`SelectKBest` is used to find the features that have the strongest relationship with the output variable. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
data = pd.read_csv("diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
X = data.values[:,0:8]
Y = data.values[:,8]

In [7]:
# feature extraction
test = SelectKBest(score_func=chi2, k=4) # using the chi-squared statistical test for non-negative features
fit = test.fit(X,Y)
# summarise scores
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarise selected features
print(features[0:5,:])

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


In [20]:
# Find which features are selected by SelectKBest
fit.get_support()

array([False,  True, False, False,  True,  True, False,  True])

## 2. Recursive Feature Elimination (RFE) `RFE`
* It works by recursively removing attributes and building a model on those attributes that remain.
* Uses model accuracy to identify which attributes contribute the most to predicting the output.

In [21]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# feature extraction
model = LogisticRegression()
rfe = RFE(model,3)
fit = rfe.fit(X,Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


## 3. Data reduction using PCA `PCA`

In [22]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 3)
fit = pca.fit(X)

print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


## 4. Feature Importance - `ExtraTreesClassifier`

Use bagged decision trees like Random Forest and Extra Trees to estimate the importance of features

#### Random Forest vs Extra Tree

* Extra Tree is good for noisy dataset - i.e. when not all features are relevant to the output.
* Extra Tree is faster than RF.
* Random Forest is good when all features are importance. 

In [25]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,Y)
print(model.feature_importances_) # the larger the score, the more important the attribute

[0.1   0.253 0.096 0.074 0.075 0.142 0.121 0.138]


## 5. Resampling

* Train and Test Sets
* k-fold Cross Validation
* Leave One Out Cross Validation
* Repeated Random Test-Train Splits

### Train and Test Set `train_test_split`
* Ideal for very large datasets, fast computation. 
* Downside is that it can have a high variance. - i.e. differences in the training and test dataset can result in meaningful differences in the accuracy. 

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size = 0.33
seed = 7

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = test_size,random_state = seed)
model = LogisticRegression()
model.fit(X_train,Y_train)
result = model.score(X_test,Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

Accuracy: 75.591%


### K-fold Cross Validation `KFold`
* Less variance than a single train-test set split
* For modest sized dataset (e.g. thousands or tens of thousands of records), k values of 3, 5 and 10 are common.

In [29]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

num_folds = 10
seed = 7
kfold = KFold(n_splits = num_folds,random_state = seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv = kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0,results.std()*100.0)) # accuracy, sd

Accuracy: 76.951% (4.841%)


### Leave One Out Cross Validation (LOOCV) `LeaveOneOut`

* Gives more reliable estimate on model accuracy
* Downside is that it is computationally more expensive

In [30]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

loocv = LeaveOneOut()
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv = loocv)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0,results.std()*100.0)) # accuracy, sd

Accuracy: 76.823% (42.196%)


### Repeated Random Test-Train Splits `ShuffleSplit`

* Fast speed of using a train/test split and reduction in variance
* Downside is that repetitions may include much of the same data in the train or the test split

In [31]:
# 67%/33% split repeated 10 times
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

n_splits = 10
test_size = 0.33
seed = 7

kfold = ShuffleSplit(n_splits = n_splits, test_size = test_size,random_state = seed)
results = cross_val_score(model, X, Y, cv = kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0,results.std()*100.0)) # accuracy, sd

Accuracy: 76.496% (1.698%)
