In [1]:
from os import chdir, getcwd
from os.path import basename

# Move into ./lib so that `project_5` can be imported by the next cell.
# Guard on the current directory name: without it, re-running this cell in a
# live kernel would try to enter ./lib/lib and raise FileNotFoundError.
if basename(getcwd()) != 'lib':
    chdir('./lib')

In [2]:
# Print the shell's working directory to confirm we are now inside lib/
!pwd

/Users/ginodefalco/dsi/dsi_repo/DSI_SM_3/projects/project-05/lib


In [3]:
from project_5 import general_process, load_data_from_database, make_data_dict, general_model, general_transformer

# Step 1 - Benchmarking

**NOTE: EACH OF THESE SHOULD BE WRITTEN SOLELY WITH REGARD TO STEP 1 - BENCHMARKING**

### Domain and Data


MADELON is an artificial dataset, which was part of the NIPS 2003 feature selection challenge. This is a two-class classification problem with continuous input variables. The difficulty is that the problem is multivariate and highly non-linear.
MADELON is an artificial dataset containing data points grouped in 32 clusters placed on the vertices of a five-dimensional hypercube and randomly labeled +1 or -1. The five dimensions constitute 5 informative features. 15 linear combinations of those features were added to form a set of 20 (redundant) informative features. Based on those 20 features one must separate the examples into the 2 classes (corresponding to the ±1 labels). A number of distractor features called 'probes', which have no predictive power, were also added. The order of the features and patterns was randomized.

### Problem Statement

We need to demonstrate a capacity to identify relevant features of this data set using machine learning. However, in this first step, we are establishing a benchmark.

### Solution Statement


In order to establish a benchmark for our problem, we need to connect to the MADELON database and query the relevant data into a dataframe. From this dataframe we will make a data dictionary by splitting the data into a feature matrix and a target vector using train_test_split. After that, we will write a function that will transform the data; in this case we will use StandardScaler. Finally, we will write another function that will fit a general model (here, a logistic regression) on the transformed data from the previous step.

### Metric

Our metric is classification accuracy, reported on both the training data and the test data.

### Benchmark

Our benchmark is the accuracy score from our general logistic regression with a high C value (100000) in order to perform minimal regularization: 77.2% train score and 50.2% test score.

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/benchmarking.png" width="600px">

In [4]:
# Load the MADELON data into a DataFrame with our load_data_from_database helper,
# then check its dimensions as (rows, columns).
df = load_data_from_database()
df.shape

(2000, 502)

In [5]:
# Summarize the frame: index range, number of columns, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 502 entries, index to label
dtypes: int64(502)
memory usage: 7.7 MB


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,index,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499,label
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,999.5,481.7225,483.4525,510.166,483.3845,501.6125,479.259,480.1095,476.565,486.7935,...,478.8115,486.3565,496.5655,493.4995,510.893,478.2195,483.309,507.977,490.266,0.0
std,577.494589,6.421769,30.186294,38.899165,9.059895,41.389418,6.795956,40.575925,1.384461,15.043836,...,4.011735,23.967366,127.635442,34.81902,37.459353,5.880613,13.559847,37.224297,25.825273,1.00025
min,0.0,462.0,381.0,370.0,453.0,371.0,459.0,334.0,471.0,430.0,...,463.0,391.0,130.0,368.0,398.0,457.0,435.0,363.0,403.0,-1.0
25%,499.75,477.0,464.0,485.0,477.0,475.0,475.0,452.75,476.0,477.0,...,476.0,471.0,404.0,470.0,486.0,474.0,474.0,482.0,473.0,-1.0
50%,999.5,482.0,483.0,510.5,483.0,500.0,479.0,480.0,477.0,487.0,...,479.0,486.0,504.0,492.0,511.0,478.0,483.0,508.0,490.0,0.0
75%,1499.25,486.0,503.0,536.0,490.0,528.0,484.0,506.25,477.0,496.25,...,481.0,502.0,586.0,517.0,535.0,482.0,492.0,533.0,507.25,1.0
max,1999.0,503.0,600.0,654.0,519.0,688.0,505.0,611.0,481.0,536.0,...,497.0,566.0,920.0,615.0,661.0,500.0,535.0,644.0,583.0,1.0


In [7]:
# Build the data dictionary with our make_data_dict helper, which runs a
# train_test_split on the data (random_state=0 is presumably forwarded to the
# split for reproducibility — confirm in lib/project_5.py).
# NOTE(review): `df` is rebound from a DataFrame to a dict here, so re-running
# this cell on its own feeds the dict back into make_data_dict; re-run from the
# load cell instead.
# NOTE(review): the X_test shown below still contains the `index` column, i.e.
# the row id is treated as a feature — confirm this is intended.
df = make_data_dict(df, random_state=0)
df

{'X_test':       index  feat_000  feat_001  feat_002  feat_003  feat_004  feat_005  \
 349     349       480       488       498       481       481       476   
 433     433       474       491       506       489       503       483   
 1863   1863       473       482       438       484       511       478   
 157     157       475       516       531       476       490       496   
 1295   1295       489       481       511       485       472       475   
 220     220       488       484       497       466       529       474   
 499     499       474       503       572       472       491       483   
 961     961       497       493       537       505       542       479   
 856     856       477       503       547       492       483       468   
 644     644       494       503       526       471       444       475   
 870     870       485       464       436       474       502       478   
 1025   1025       483       489       516       479       486       490   
 9

In [8]:
# Standardize the features by passing a StandardScaler to our general_transformer helper.
# NOTE(review): the output shows X_train and X_test as scaled arrays; presumably the
# scaler is fit on the training split only — confirm in lib/project_5.py.
from sklearn.preprocessing import StandardScaler
df = general_transformer(StandardScaler(), df)
df

{'X_test': array([[-1.13953285, -0.27435433,  0.11906386, ..., -1.1556245 ,
         -1.84664861,  0.73678577],
        [-0.99429601, -1.20087927,  0.21810034, ..., -0.4053489 ,
          1.31542093,  2.14088061],
        [ 1.47818828, -1.3553001 , -0.0790091 , ..., -1.00556938,
          0.44312588, -0.31628536],
        ..., 
        [-0.96317383, -0.42877515, -0.0790091 , ...,  1.39531255,
         -0.10205852,  0.03473835],
        [ 0.88340884,  0.65217061, -0.64021582, ..., -0.48037646,
         -0.23835462, -0.94032752],
        [-1.5804304 , -0.11993351,  1.70364753, ..., -0.25529378,
          0.57942198, -0.00426429]]),
 'X_train': array([[ 0.48054951, -0.7376168 ,  0.51520977, ..., -1.75584499,
         -0.72902059, -1.25234859],
        [ 0.31629356, -0.58319598,  0.84533137, ...,  0.3449267 ,
         -0.64724292,  2.72592013],
        [-0.5240053 , -0.11993351,  0.54822193, ..., -1.08059694,
         -1.05613123,  0.11274362],
        ..., 
        [-0.53610837,  1.733116

In [9]:
# Fit a simple logistic regression through our general_model helper to set a benchmark score.
# C is the inverse of the regularization strength in scikit-learn, so a very large
# value (C=100000) applies minimal regularization.
from sklearn.linear_model import LogisticRegression
df = general_model(LogisticRegression(C=100000), df)

In [10]:
# Inspect the dictionary returned by general_model.
# NOTE(review): this dumps the full arrays; displaying only the keys or scores would be easier to read.
df

{'X_test': array([[-1.13953285, -0.27435433,  0.11906386, ..., -1.1556245 ,
         -1.84664861,  0.73678577],
        [-0.99429601, -1.20087927,  0.21810034, ..., -0.4053489 ,
          1.31542093,  2.14088061],
        [ 1.47818828, -1.3553001 , -0.0790091 , ..., -1.00556938,
          0.44312588, -0.31628536],
        ..., 
        [-0.96317383, -0.42877515, -0.0790091 , ...,  1.39531255,
         -0.10205852,  0.03473835],
        [ 0.88340884,  0.65217061, -0.64021582, ..., -0.48037646,
         -0.23835462, -0.94032752],
        [-1.5804304 , -0.11993351,  1.70364753, ..., -0.25529378,
          0.57942198, -0.00426429]]),
 'X_train': array([[ 0.48054951, -0.7376168 ,  0.51520977, ..., -1.75584499,
         -0.72902059, -1.25234859],
        [ 0.31629356, -0.58319598,  0.84533137, ...,  0.3449267 ,
         -0.64724292,  2.72592013],
        [-0.5240053 , -0.11993351,  0.54822193, ..., -1.08059694,
         -1.05613123,  0.11274362],
        ..., 
        [-0.53610837,  1.733116

In [11]:
# Benchmark scores as (train, test); presumably stored in the dictionary by general_model
df['train_score'], df['test_score']

(0.77200000000000002, 0.502)

# Step 1 Conclusion:

We built a pipeline to perform a naive logistic regression as a baseline model
and set a high C value in order to perform minimal regularization. Our benchmark scores were:
 - 77.2% on the train set
 - 50.2% on the test set
 - The model overfits: the test score is essentially chance level for this balanced two-class problem, far below the train score