In [1]:
# Move the working directory into ./lib so that `project_5` (lib/project_5.py)
# can be imported by the next cell.
# NOTE(review): the path is relative, so this cell is not idempotent -- running it
# a second time in the same kernel attempts to enter ./lib/lib. Restart the kernel
# before re-running the notebook from the top.
from os import chdir 
chdir('./lib')

In [2]:
from project_5 import general_process, load_data_from_database, make_data_dict, general_model, general_transformer

# Step 2 - Identify Salient Features Using an $\ell_1$ Penalty

**NOTE: EACH OF THESE SHOULD BE WRITTEN SOLELY WITH REGARD TO STEP 2 - Identify Features**

### Domain and Data

MADELON is an artificial dataset, which was part of the NIPS 2003 feature selection challenge. This is a two-class classification problem with continuous input variables. The difficulty is that the problem is multivariate and highly non-linear.
MADELON is an artificial dataset containing data points grouped in 32 clusters placed on the vertices of a five-dimensional hypercube and randomly labeled +1 or -1. The five dimensions constitute 5 informative features. 15 linear combinations of those features were added to form a set of 20 (redundant) informative features. Based on those 20 features, one must separate the examples into the 2 classes (corresponding to the ±1 labels). A number of distractor features called 'probes', which have no predictive power, were also added. The order of the features and patterns was randomized.

### Problem Statement

Now that we have a benchmark established, we can make some tweaks in order to filter our features: the goal of this step is to identify which features are salient and which can be discarded.

### Solution Statement

We will do this by fitting a Logistic Regression with the LASSO ($\ell_1$) penalty and counting how many coefficients the penalty shrinks to zero for several values of `C`.

### Metric

Our metric will be the same as in the benchmark step: the accuracy score on our train and test data.

### Benchmark

Our benchmark was a 78.8% training score and 51.6% test score.

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/identify_features.png" width="600px">

In [3]:
# Load the dataset into a DataFrame with the helper from project_5,
# then display its (rows, columns) shape as a quick sanity check
df = load_data_from_database()
df.shape

(2000, 502)

In [4]:
# Structural overview of the frame: index range, column count, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 502 entries, index to label
dtypes: int64(502)
memory usage: 7.7 MB


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,index,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499,label
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,999.5,481.7225,483.4525,510.166,483.3845,501.6125,479.259,480.1095,476.565,486.7935,...,478.8115,486.3565,496.5655,493.4995,510.893,478.2195,483.309,507.977,490.266,0.0
std,577.494589,6.421769,30.186294,38.899165,9.059895,41.389418,6.795956,40.575925,1.384461,15.043836,...,4.011735,23.967366,127.635442,34.81902,37.459353,5.880613,13.559847,37.224297,25.825273,1.00025
min,0.0,462.0,381.0,370.0,453.0,371.0,459.0,334.0,471.0,430.0,...,463.0,391.0,130.0,368.0,398.0,457.0,435.0,363.0,403.0,-1.0
25%,499.75,477.0,464.0,485.0,477.0,475.0,475.0,452.75,476.0,477.0,...,476.0,471.0,404.0,470.0,486.0,474.0,474.0,482.0,473.0,-1.0
50%,999.5,482.0,483.0,510.5,483.0,500.0,479.0,480.0,477.0,487.0,...,479.0,486.0,504.0,492.0,511.0,478.0,483.0,508.0,490.0,0.0
75%,1499.25,486.0,503.0,536.0,490.0,528.0,484.0,506.25,477.0,496.25,...,481.0,502.0,586.0,517.0,535.0,482.0,492.0,533.0,507.25,1.0
max,1999.0,503.0,600.0,654.0,519.0,688.0,505.0,611.0,481.0,536.0,...,497.0,566.0,920.0,615.0,661.0,500.0,535.0,644.0,583.0,1.0


In [6]:
# Build the data dict, standardise the features, and fit a Logistic Regression
# with a LASSO (L1) penalty at C=1.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
this_dd = make_data_dict(df, random_state=0)
this_scale = general_transformer(StandardScaler(), this_dd)
# The solver is named explicitly: penalty='l1' needs a solver that supports it
# (liblinear or saga). Older scikit-learn releases defaulted to liblinear, newer
# ones default to lbfgs, which rejects an L1 penalty with a ValueError.
this_model = general_model(LogisticRegression(C=1, penalty='l1', solver='liblinear'), this_scale)
# NOTE(review): this displays the whole results dict, including the full
# X_train / X_test arrays; the scores and coefficients are inspected in later cells.
this_model


{'X_test': array([[ 1.5789349 , -0.11224769,  0.07458629, ...,  1.31747113,
         -1.29695092, -0.07002267],
        [-0.08967286,  1.00174163, -1.65498372, ..., -0.08346638,
         -0.54096321, -1.05091555],
        [ 0.48336675, -0.58967168, -0.40019763, ...,  0.21146783,
          0.59301836, -0.30543696],
        ..., 
        [-0.33700303, -0.74881301, -1.21411293, ..., -0.3784006 ,
          0.16102538,  0.16539162],
        [-1.47785698, -1.54451966, -1.48541803, ...,  0.13773428,
         -0.97295619,  1.30322735],
        [ 0.93970833,  0.8426003 ,  1.36328552, ...,  0.5801356 ,
         -0.43296496, -0.22696553]]),
 'X_train': array([[ 0.92403248,  0.04689365,  0.14241257, ...,  0.21146783,
         -0.21696847, -0.61932268],
        [-1.55275273, -0.11224769, -0.56976332, ..., -0.45213415,
         -0.43296496, -1.83562985],
        [ 0.74114749, -1.226237  ,  0.99024101, ..., -0.3784006 ,
          1.16000915,  2.00947023],
        ..., 
        [-0.23075556,  0.524317

In [12]:
# (train accuracy, test accuracy) for the L1-penalised Logistic Regression fitted above
this_model['train_score'], this_model['test_score']

(0.79066666666666663, 0.53800000000000003)

In [13]:

# Pull the estimator stored under 'model' out of the results dict and show its
# coefficient array. NOTE: despite its name, `num_of_coefs` holds the estimator
# object itself, not a count.
num_of_coefs = this_model['model']
coeff = num_of_coefs.coef_
coeff

array([[ -5.67131112e-02,   5.42340695e-03,   4.33116140e-02,
         -1.29593179e-02,  -1.45597340e-02,   1.47154076e-01,
         -6.55420325e-02,   2.44978869e-02,   3.43546451e-02,
          3.48974649e-03,  -3.58889113e-02,  -2.31413235e-01,
         -3.58752073e-02,   8.63717866e-02,  -1.96585647e-02,
         -7.62270086e-02,   3.45752047e-02,   1.52351812e-02,
         -9.37061613e-02,   9.86350531e-02,  -1.07301354e-01,
          0.00000000e+00,   8.58390890e-02,   9.28661899e-02,
          1.42005398e-02,   2.11142041e-01,   0.00000000e+00,
          9.52500180e-02,   1.73637736e-01,  -1.93696907e-01,
         -4.48882420e-02,  -3.14394854e-02,  -6.44491270e-02,
         -3.21191723e-02,   8.16966045e-02,  -1.53102627e-01,
          1.50988721e-01,  -5.55835791e-02,   1.03251694e-01,
         -1.47349059e-01,   7.00193082e-02,   9.60039141e-02,
          3.01132051e-01,  -1.59337426e-01,  -1.38810165e-02,
         -1.24331867e-02,  -5.89960071e-02,   1.42235094e-01,
        

In [16]:
# Eliminated features: count the coefficients whose magnitude is below .001,
# i.e. the ones the L1 penalty shrank to (effectively) zero.
count = sum(1 for x in coeff.flat if abs(x) < .001)
# A single pre-formatted string prints identically under Python 2 and Python 3
# (the old `print a, b` statement is a SyntaxError on Python 3).
print("Eliminated Features:  %d" % count)
        

Eliminated Features:  34


We eliminated 34 features using C = 1. Let's do this a few more times with smaller C values, which apply a stronger penalty.

In [15]:
# Refit with a smaller C (C=.1), i.e. a stronger L1 penalty, to see how many
# more coefficients drop out.

this_dd = make_data_dict(df, random_state=0)
this_scale = general_transformer(StandardScaler(), this_dd)
# The solver is named explicitly: penalty='l1' needs a solver that supports it
# (liblinear or saga). Older scikit-learn releases defaulted to liblinear, newer
# ones default to lbfgs, which rejects an L1 penalty with a ValueError.
this_model = general_model(LogisticRegression(C=.1, penalty='l1', solver='liblinear'), this_scale)
# NOTE(review): this displays the whole results dict, including the full
# X_train / X_test arrays.
this_model

{'X_test': array([[ 1.04825975,  0.21247572,  0.59782959, ..., -0.91398562,
          1.73432724, -0.71142192],
        [ 0.6714777 , -0.09695494,  1.78727721, ..., -0.46743708,
          0.35499994, -0.71142192],
        [-1.64130427,  0.98605235,  0.79607086, ...,  0.20238572,
         -0.06940846, -1.2559767 ],
        ..., 
        [-1.48503522, -0.87053157, -1.54978417, ...,  0.79778377,
         -0.99780183, -1.45046055],
        [-0.94156618,  0.05776039, -0.42641697, ..., -0.16973806,
         -1.26305708, -0.43914453],
        [ 0.11932705,  1.75962898, -0.2281757 , ..., -0.6162866 ,
          0.03669364,  0.57217149]]),
 'X_train': array([[-0.16716621, -1.95353886, -0.19513549, ..., -0.83956086,
         -0.20203608, -1.45046055],
        [ 0.81211985, -1.6441082 , -0.4924974 , ..., -0.6162866 ,
         -0.59991896, -1.10038962],
        [ 0.1002275 , -1.48939288, -1.68194502, ..., -0.31858757,
          0.83245939, -1.29487347],
        ..., 
        [-0.01263348,  1.450198

In [17]:
# (train accuracy, test accuracy) for the C=.1 model. Test accuracy is only
# marginally higher than in the C=1 run, so on its own it is not conclusive.
this_model['train_score'], this_model['test_score']

(0.75600000000000001, 0.55200000000000005)

In [18]:
# Pull the estimator stored under 'model' out of the results dict and show its
# coefficient array. NOTE: despite its name, `num_of_coefs` holds the estimator
# object itself, not a count.
num_of_coefs = this_model['model']
coeff = num_of_coefs.coef_
coeff

array([[  0.00000000e+00,   0.00000000e+00,   1.79239948e-02,
          0.00000000e+00,   0.00000000e+00,   2.20504893e-02,
         -2.54727143e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,  -8.50564532e-02,
          0.00000000e+00,   4.47917328e-02,  -5.68927692e-03,
         -5.54605836e-02,   5.79897622e-02,  -1.43637793e-02,
          0.00000000e+00,   4.88612719e-02,  -1.32508409e-01,
         -6.60661103e-03,   5.94849190e-02,   0.00000000e+00,
          0.00000000e+00,   3.32440486e-02,   0.00000000e+00,
          5.43423005e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -7.96358477e-02,   1.23289400e-02,   0.00000000e+00,
          0.00000000e+00,  -4.10518915e-02,  -1.13902794e-01,
          6.08687615e-02,   0.00000000e+00,   8.69847094e-02,
        

In [19]:
# Eliminated features: count the coefficients whose magnitude is below .001,
# i.e. the ones the L1 penalty shrank to (effectively) zero.
count = sum(1 for x in coeff.flat if abs(x) < .001)
# A single pre-formatted string prints identically under Python 2 and Python 3
# (the old `print a, b` statement is a SyntaxError on Python 3).
print("Eliminated Features:  %d" % count)
        

Eliminated Features:  251


Far more features were eliminated this time: 251. Let's try it one more time with an even smaller C value.

In [24]:
# Refit once more with an even smaller C (C=.01), i.e. a still stronger L1 penalty.

this_dd = make_data_dict(df, random_state=0)
this_scale = general_transformer(StandardScaler(), this_dd)
# The solver is named explicitly: penalty='l1' needs a solver that supports it
# (liblinear or saga). Older scikit-learn releases defaulted to liblinear, newer
# ones default to lbfgs, which rejects an L1 penalty with a ValueError.
this_model = general_model(LogisticRegression(C=.01, penalty='l1', solver='liblinear'), this_scale)
# NOTE(review): this displays the whole results dict, including the full
# X_train / X_test arrays.
this_model

{'X_test': array([[-0.58977923, -1.35750959,  0.23926797, ..., -0.74916165,
         -1.90783583, -1.19841145],
        [-1.37358866,  1.28154618,  0.00930791, ...,  0.35500035,
          0.31398007,  0.42189762],
        [ 1.12035044, -0.11560099,  1.29051399, ...,  1.60638396,
         -0.3991954 , -1.81567205],
        ..., 
        [-1.29538151,  1.12630761, -0.58201798, ..., -0.45471845,
          0.28655025,  1.03915822],
        [ 1.41927554,  0.50535331,  1.19195968, ...,  0.94388676,
         -0.07003749, -0.96693873],
        [-0.5741378 ,  0.03963758,  0.60063379, ...,  1.60638396,
          1.38374329,  0.80768549]]),
 'X_train': array([[-1.42920264, -0.89179387,  1.15910824, ...,  0.50222195,
         -1.16723053,  2.19652184],
        [ 0.77797692,  1.12630761, -1.04193811, ...,  0.50222195,
         -2.12727443, -1.27556902],
        [ 1.67475223,  0.35011473,  1.1262568 , ...,  0.13416795,
         -0.75578314,  1.30920973],
        ..., 
        [ 1.41058586, -0.581316

In [25]:
# (train accuracy, test accuracy) for the C=.01 model. Compare both numbers with
# the previous runs: here the train score falls to roughly the level of the test score.
this_model['train_score'], this_model['test_score']

(0.61333333333333329, 0.62)

In [26]:
# Pull the estimator stored under 'model' out of the results dict and show its
# coefficient array. NOTE: despite its name, `num_of_coefs` holds the estimator
# object itself, not a count.
num_of_coefs = this_model['model']
coeff = num_of_coefs.coef_
coeff

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [27]:
# Eliminated features: count the coefficients whose magnitude is below .001,
# i.e. the ones the L1 penalty shrank to (effectively) zero.
count = sum(1 for x in coeff.flat if abs(x) < .001)
# A single pre-formatted string prints identically under Python 2 and Python 3
# (the old `print a, b` statement is a SyntaxError on Python 3).
print("Eliminated Features:  %d" % count)
        

Eliminated Features:  500


## Conclusion:

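We eliminated 500 features this time. (The 502 figure is the DataFrame's total column count, which includes the `index` and `label` columns, so the exact denominator should be checked against `coeff.size`.) We may have penalized a little too much with our C value this time: training accuracy fell to about 61% with almost every coefficient driven to zero. I think somewhere between .1 and .01 would be a good choice for a C value.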