## Imports

In [1]:
import scipy.io
import numpy as np
from scipy.stats import linregress
from statsmodels.sandbox.stats.multicomp import multipletests 

# seaborn can be used to "prettify" default matplotlib plots by importing and setting as default
import seaborn as sns
sns.set() # Set seaborn as the default plot style

## Load dataset

In [3]:
# Read the MATLAB file and pull out the design matrix ('X') and the
# response ('Y'), flattening the response to one dimension.
mat = scipy.io.loadmat('sand.mat')
X, y = mat['X'], mat['Y'].ravel()

# n = number of rows of X, p = number of columns of X.
n, p = X.shape

def centerData(data):
    """Subtract the mean along axis 0 from ``data``.

    Parameters
    ----------
    data : array-like
        Values to center; the mean is taken along the first axis.

    Returns
    -------
    centered : ndarray
        ``data`` with the axis-0 mean removed.
    mu : ndarray or scalar
        The mean that was subtracted.
    """
    axis0_mean = np.mean(data, axis=0)
    return data - axis0_mean, axis0_mean

def normalize(X):
    """Scale ``X`` along axis 0 to unit Euclidean (L2) norm.

    Parameters
    ----------
    X : array-like
        Data whose L2 norms are computed along axis 0.

    Returns
    -------
    X_scaled : ndarray
        ``X`` divided by its axis-0 norms. Where a norm is exactly zero the
        entries are left at zero instead of becoming NaN.
    d : ndarray or scalar
        The axis-0 L2 norms exactly as computed (zeros are reported as zeros).
    """
    d = np.linalg.norm(X, ord=2, axis=0)
    # A zero norm means every entry along axis 0 is zero (e.g. a constant
    # feature after centering). Dividing by 1 there avoids 0/0 = NaN.
    safe_d = np.where(d == 0, 1.0, d)
    return X / safe_d, d

# Fixed seed value. It is not referenced in the cells visible here;
# presumably intended for seeding random number generators - TODO confirm.
seed = 42

### 3 Perform univariate feature selection for the sand data using:

> (a) Bonferroni correction to control the family-wise error rate (FWER). Use FWER = 0.05.

In [37]:

# Calculate the p-value for each feature one at a time, because a joint OLS fit
# breaks down with this many features.
# scipy.stats.linregress is used since its result already includes the p-value.
# Otherwise check https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression
# which explains how to extend the sklearn class to calculate it.
X_centered, _ = centerData(X)
X_norm, _ = normalize(X_centered)
y_norm, _ = centerData(y)

# One simple regression of y on each column; only the p-value is kept.
pvals = np.array([linregress(X_norm[:, j], y_norm).pvalue for j in range(p)])

# Bonferroni: include feature j when its p-value is below FWER / p.
# The mask is built from the *unsorted* p-values so that position j in the
# mask refers to column j of X; sorting first would turn the selection into
# the meaningless indices 0..k-1.
fwer = 0.05
features_to_include = np.flatnonzero(pvals < fwer / p)

In [38]:
# Number of features that pass the Bonferroni threshold
len(features_to_include)

72

> (b) Benjamini-Hochberg’s algorithm for controlling the false discovery rate (FDR). Use an acceptable fraction of mistakes, $q = 0.15$.

In [43]:
# Benjamini-Hochberg FDR control at level q.
# multipletests is given the same q that is used for selection, so the
# returned rejection mask and the selected features agree. With
# is_sorted=False / returnsorted=False its outputs are in the original
# feature order.
q = 0.15
reject, pvals_corrected, _, _ = multipletests(pvals, alpha=q, method='fdr_bh', is_sorted=False, returnsorted=False)

# Include the features whose null hypothesis is rejected at FDR level q.
# The mask is not sorted, so the resulting values are real column indices of X.
features_to_include = np.flatnonzero(reject)

In [44]:
# Number of features kept by the Benjamini-Hochberg selection
len(features_to_include)

721

Compare the solutions in terms of number of selected features and selected features.

*FDR control clearly allows more features to be kept in the model, and as a consequence the chance of including false discoveries is higher. This trade-off is deliberate: it reduces the risk of discarding truly significant features, whereas the Bonferroni correction might remove some significant features because of its more stringent cutoff.*