## Imports

In [11]:
import scipy.io
import numpy as np
from scipy.stats import linregress
from statsmodels.sandbox.stats.multicomp import multipletests 

# seaborn can be used to "prettify" default matplotlib plots by importing and setting as default
import seaborn as sns
sns.set() # Set seaborn as default

## Load dataset

In [12]:
# Read the MATLAB file into a dict keyed by variable name.
mat = scipy.io.loadmat('sand.mat')

# 'X' is the data matrix; 'Y' is flattened to a 1-D response vector.
X, y = mat['X'], mat['Y'].ravel()

# n = number of rows of X, p = number of columns of X.
n, p = X.shape

def centerData(data):
    '''
    Center the columns (variables) of a data matrix by subtracting their means.

    Input  (data) -----> The data matrix to be centered
    Output (centered) -> The data with the column means subtracted
    Output (mu) -------> Array with the column means that were subtracted
    '''
    column_means = np.mean(data, axis=0)
    return data - column_means, column_means

def normalize(X):
    '''
    Scale every column (variable) of a data matrix to unit Euclidean length.

    Input  (X) --------> The data matrix to be normalized
    Output (X_pre)-----> The data matrix with each column divided by its length
    Output (d) --------> Array with the Euclidean lengths used as divisors
                         (columns of length 0 are reported with divisor 1)
    '''
    lengths = np.linalg.norm(X, ord=2, axis=0)  # L2 norm of each column

    # An all-zero column would cause a division by zero; divide it by 1 instead.
    zero_length = lengths == 0
    lengths[zero_length] = 1

    return X / lengths, lengths

### 3 Perform univariate feature selection for the sand data using:

> (a) Bonferroni correction to control the family-wise error rate (FWER). Use FWER = 0.05.

In [13]:
ALPHA_FWER = 0.05          # family-wise error rate to control
n_tests = X.shape[1]       # one hypothesis test per feature (column of X)

# Univariate screening: regress y on each feature separately and keep the
# p-value of the slope. scipy.stats.linregress is used because it reports
# that p-value directly.
# Otherwise check https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression
# which explains how to extend the sklearn class to calculate it.
PValues = np.zeros(n_tests)
for j in range(n_tests):
    Xsub = X[:, j]
    PValues[j] = linregress(Xsub, y).pvalue

# Sort p-values in ascending order. The sorted values get their own name so
# that `p` keeps meaning "number of features" and this cell can be re-run.
idx1 = np.argsort(PValues)
p_sorted = PValues[idx1]

# Bonferroni: a feature is kept if its p-value is below alpha / (number of tests).
remaining_features_bonf = np.count_nonzero(p_sorted < ALPHA_FWER / n_tests)
print(f'Remaining features after correcting with Bonferroni correction: {remaining_features_bonf}.')

Remaining features after correcting with Bonferroni correction: 72.


> (b) Benjamini-Hochberg’s algorithm for FDR. Use an acceptable fraction of mistakes,
q = 0.15.

In [14]:
Q_FDR = 0.15  # acceptable fraction of false discoveries (q)

# Benjamini-Hochberg adjusted p-values are the second value returned by
# multipletests. `alpha` only drives the reject mask (first return value),
# which is not used here; it is set to q so the call states the same level
# as the threshold applied below.
FDR = multipletests(PValues, alpha=Q_FDR, method="fdr_bh")[1]

# Adjusted p-values in ascending order (idx2 maps back to feature indices).
idx2 = np.argsort(FDR)
fdr = FDR[idx2]

# A feature is kept if its adjusted p-value is below q.
remaining_features_fdr = np.count_nonzero(fdr < Q_FDR)
print(f'Remaining features after applying DFR: {remaining_features_fdr}.')

Remaining features after applying DFR: 721.


Compare the solutions in terms of number of selected features and selected features.

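*FDR control is clearly less conservative: Benjamini-Hochberg keeps 721 features, whereas Bonferroni keeps only 72. The price is a higher chance of false discoveries, because FDR only bounds the expected fraction of false positives among the selected features (here q = 0.15) instead of the probability of making even a single one. In return, fewer truly significant features are lost, whereas Bonferroni might remove some significant features because of its more stringent cutoff. Since both procedures threshold the same ordered p-values and the Bonferroni cutoff is the stricter one, every feature selected by Bonferroni is also selected by Benjamini-Hochberg.*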