In [1]:
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_regression
import seaborn as sns
%matplotlib inline

## Importing and sampling the UCI data / basic EDA

### FIRST BATCH OF 2200


In [3]:
X = pd.read_pickle('../data/first_batch_X.p')

In [4]:
y = pd.read_pickle('../data/first_batch_labels.p')

In [5]:
X.head()

Unnamed: 0_level_0,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_990,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68411,0.020855,0.578265,-0.13612,-0.410538,-1.772283,1.187936,0.402231,1.176466,-0.792536,-1.821266,...,-1.492082,-0.302283,-0.259279,-1.326535,-1.993,0.540589,-1.205038,-0.581775,-0.613431,0.365626
118486,-0.66898,0.411098,-0.818625,-0.68011,1.437646,0.638755,0.201362,0.467585,-0.347586,0.962761,...,0.337309,0.404857,0.594332,0.80181,0.108186,0.218923,-0.4331,-1.383996,1.760135,0.256725
26213,-0.255765,0.04038,1.29295,-0.478335,-0.688653,-0.094722,0.387218,-0.938971,-1.000622,1.521842,...,-0.475559,0.576874,2.402998,0.330567,1.089679,1.599995,0.173667,-0.705471,0.473086,-0.595255
121169,-0.118851,-1.572511,0.104159,1.283724,0.024256,-0.257493,0.374294,-0.893251,-0.098985,0.223367,...,0.102171,-0.256557,0.064411,-1.307146,-1.028803,-0.813137,1.492722,1.395631,1.233597,0.769766
87524,-0.269251,1.187003,-1.003767,1.106124,1.524958,0.18264,0.801729,-1.49161,-0.143916,0.399354,...,2.051286,-0.200767,-1.2924,-0.173536,0.93777,2.080136,-0.41223,2.234453,-0.906541,1.336438


In [6]:
y.head()

_id
68411     1
118486    1
26213     1
121169    0
87524     0
Name: target, dtype: int64

In [7]:
X.shape, y.shape

((2200, 1000), (2200,))

## SELECTKBEST for Feature Selection

Used Y Labels instead of dropping features

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = .3,
                                                    )

In [10]:
skb = SelectKBest(k=20)

skb.fit(X_train,y_train)

SelectKBest(k=20, score_func=<function f_classif at 0x7f270e67f400>)

In [11]:
skb_feats = np.where(skb.get_support())[0]
skb_feats

array([  2,  52,  78, 269, 315, 341, 507, 524, 611, 680, 681, 701, 736,
       745, 769, 808, 829, 891, 907, 920])

## SELECT FROM MODEL for Feature Selection

Used Y Labels instead of dropping features

In [12]:
sfm = SelectFromModel(LogisticRegression(), threshold='mean')


In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
sfm.fit(X_train_scaled, y_train)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        prefit=False, threshold='mean')

In [15]:
sfm_feats = np.where(sfm.get_support())[0]
sfm_feats

array([  1,   2,  11,  13,  18,  19,  22,  23,  27,  29,  31,  33,  34,
        36,  37,  41,  48,  49,  51,  53,  57,  61,  62,  63,  65,  67,
        73,  77,  80,  81,  85,  90,  93,  96,  98,  99, 106, 107, 108,
       111, 112, 114, 115, 118, 124, 126, 128, 129, 133, 135, 146, 148,
       149, 150, 151, 155, 156, 158, 159, 162, 163, 168, 180, 182, 185,
       189, 190, 191, 193, 194, 196, 199, 202, 203, 205, 207, 211, 212,
       213, 214, 216, 217, 223, 225, 229, 234, 235, 241, 242, 245, 246,
       251, 254, 257, 261, 262, 263, 269, 273, 274, 276, 277, 278, 281,
       282, 283, 285, 294, 296, 300, 301, 305, 306, 308, 310, 314, 323,
       330, 331, 334, 335, 344, 345, 346, 347, 348, 353, 354, 358, 359,
       360, 363, 366, 368, 375, 376, 378, 380, 382, 386, 387, 388, 392,
       393, 394, 400, 404, 408, 411, 412, 413, 414, 416, 421, 422, 424,
       425, 427, 428, 430, 431, 433, 435, 436, 438, 439, 441, 445, 446,
       448, 450, 451, 452, 453, 457, 458, 462, 464, 465, 466, 46

## PEARSON CORRELATION MASKING For Feature Selection

In [17]:
corr_df = X.corr()

In [18]:
corrs = list(corr_df[corr_df[corr_df.abs() >.5].count() > 1].index)

In [19]:
corrs

['feat_257',
 'feat_269',
 'feat_308',
 'feat_315',
 'feat_336',
 'feat_341',
 'feat_395',
 'feat_504',
 'feat_526',
 'feat_639',
 'feat_681',
 'feat_701',
 'feat_724',
 'feat_736',
 'feat_769',
 'feat_808',
 'feat_829',
 'feat_867',
 'feat_920',
 'feat_956']

### PEARSON CORRELATION MASKING technique returns the following features that corroborate with SelectKBest:

341, 681, 701, 736, 769, 808, 829, 920

### PEARSON CORRELATION MASKING technique returns the following features that corroborate with SelectFromModel:

257, 269, 308, 504, 681, 701, 769, 808, 829, 920, 956

In total, SelectKBest found 8 informative/redundant features that were produced by the pearson correlation technique, while SelectFromModel confirmed 11. 

### SECOND BATCH OF 2200


In [32]:
X_2 = pd.read_pickle('../data/second_batch_X.p')

In [33]:
y_2 = pd.read_pickle('../data/second_batch_labels.p')

## SELECTKBEST for Feature Selection

Used Y Labels instead of dropping features

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_2,
                                                    y_2,
                                                    test_size = .3,
                                                    )

In [35]:
skb = SelectKBest(k=20)

skb.fit(X_train,y_train)

SelectKBest(k=20, score_func=<function f_classif at 0x7f270e67f400>)

In [36]:
skb_feats = np.where(skb.get_support())[0]
skb_feats

array([ 54, 199, 269, 315, 341, 504, 526, 546, 639, 681, 701, 720, 736,
       769, 805, 808, 829, 867, 907, 920])

## SELECT FROM MODEL for Feature Selection

Used Y Labels instead of dropping features

In [37]:
sfm = SelectFromModel(LogisticRegression(), threshold='mean')

In [38]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
sfm.fit(X_train_scaled, y_train)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        prefit=False, threshold='mean')

In [40]:
sfm_feats = np.where(sfm.get_support())[0]
sfm_feats

array([  0,   3,   4,   5,   6,   8,  11,  12,  13,  20,  25,  26,  29,
        30,  32,  37,  39,  41,  45,  46,  48,  53,  54,  55,  58,  59,
        62,  63,  65,  68,  69,  71,  72,  73,  78,  80,  81,  82,  85,
        87,  89,  92,  94,  98,  99, 100, 103, 105, 106, 112, 113, 116,
       117, 119, 120, 124, 125, 129, 130, 132, 135, 138, 139, 143, 144,
       146, 147, 151, 153, 158, 160, 162, 164, 166, 167, 168, 171, 175,
       176, 180, 181, 186, 187, 188, 189, 190, 192, 197, 199, 201, 202,
       203, 205, 206, 207, 209, 213, 215, 217, 222, 229, 232, 234, 235,
       240, 242, 243, 245, 248, 249, 255, 257, 263, 266, 269, 270, 271,
       272, 277, 278, 280, 284, 285, 287, 289, 294, 295, 296, 297, 298,
       299, 300, 301, 302, 305, 306, 310, 317, 318, 319, 322, 323, 326,
       330, 331, 333, 340, 342, 343, 345, 350, 354, 355, 364, 367, 371,
       373, 375, 377, 378, 381, 382, 385, 391, 392, 397, 399, 405, 411,
       415, 416, 417, 418, 419, 421, 422, 423, 424, 425, 426, 42

### The following features are corroborated in both SelectKBest and SelectFromModel

54, 199, 269, 504, 526, 546, 681, 701, 720, 769, 805, 808, 829, 907, 920

In [42]:
corr_df_2 = X_2.corr()

In [43]:
corrs_2 = list(corr_df_2[corr_df_2[corr_df_2.abs() >.5].count() > 1].index)

In [44]:
corrs_2

['feat_257',
 'feat_269',
 'feat_308',
 'feat_315',
 'feat_336',
 'feat_341',
 'feat_395',
 'feat_504',
 'feat_526',
 'feat_639',
 'feat_681',
 'feat_701',
 'feat_724',
 'feat_736',
 'feat_769',
 'feat_808',
 'feat_829',
 'feat_867',
 'feat_920',
 'feat_956']

### PEARSON CORRELATION MASKING technique returns the following features that corroborate with both SelectKBest and SelectFromModel:

269, 504, 526, 681, 701, 769, 808, 829, 920

Note: All of the above features were also confirmed in the first sample

### THIRD BATCH OF 2200


In [45]:
X_3 = pd.read_pickle('../data/third_batch_X.p')

In [46]:
y_3 = pd.read_pickle('../data/third_batch_labels.p')

## SELECTKBEST for Feature Selection

Used Y Labels instead of dropping features

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_3,
                                                    y_3,
                                                    test_size = .3,
                                                    )

In [48]:
skb = SelectKBest(k=20)

skb.fit(X_train,y_train)

SelectKBest(k=20, score_func=<function f_classif at 0x7f270e67f400>)

In [49]:
skb_feats = np.where(skb.get_support())[0]
skb_feats

array([137, 269, 315, 336, 341, 395, 504, 571, 649, 681, 701, 724, 736,
       769, 792, 808, 826, 829, 838, 920])

## SELECT FROM MODEL for Feature Selection

Used Y Labels instead of dropping features

In [50]:
sfm = SelectFromModel(LogisticRegression(), threshold='mean')

In [51]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [52]:
sfm.fit(X_train_scaled, y_train)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        prefit=False, threshold='mean')

In [None]:
[137, 269, 315, 336, 341, 395, 504, 571, 649, 681, 701, 724, 736,
       769, 792, 808, 826, 829, 838, 920])

137, 269, 336, 504, 571, 649, 681, 701, 736, 769, 792, 808, 826, 838, 

In [53]:
sfm_feats = np.where(sfm.get_support())[0]
sfm_feats

array([  2,   3,   5,   8,   9,  10,  13,  14,  15,  17,  18,  19,  21,
        26,  29,  31,  32,  34,  38,  40,  41,  43,  44,  54,  62,  63,
        64,  66,  75,  76,  78,  79,  84,  85,  87,  90,  91,  92,  93,
        94,  96,  97, 103, 105, 107, 108, 109, 115, 116, 117, 119, 120,
       124, 127, 128, 129, 130, 132, 134, 135, 136, 137, 138, 141, 143,
       144, 148, 149, 150, 151, 153, 155, 156, 158, 162, 164, 165, 166,
       167, 168, 169, 170, 178, 179, 180, 183, 184, 187, 188, 192, 193,
       194, 198, 201, 202, 205, 210, 211, 214, 215, 216, 218, 222, 224,
       231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 243, 244, 245,
       247, 251, 253, 257, 258, 260, 262, 264, 267, 268, 269, 270, 279,
       280, 281, 284, 285, 287, 289, 290, 292, 298, 299, 300, 305, 311,
       312, 313, 316, 320, 322, 329, 335, 336, 338, 340, 342, 343, 344,
       346, 347, 353, 356, 358, 359, 363, 373, 374, 375, 377, 378, 380,
       381, 385, 386, 394, 397, 399, 402, 403, 405, 406, 407, 42

### The following features are corroborated in both SelectKBest and SelectFromModel

137, 269, 336, 504, 571, 649, 681, 701, 736, 769, 792, 808, 826, 838, 

In [54]:
corr_df_3 = X_3.corr()

In [55]:
corrs_3 = list(corr_df_3[corr_df_3[corr_df_3.abs() >.5].count() > 1].index)

In [56]:
corrs_3

['feat_257',
 'feat_269',
 'feat_308',
 'feat_315',
 'feat_336',
 'feat_341',
 'feat_395',
 'feat_504',
 'feat_526',
 'feat_639',
 'feat_681',
 'feat_701',
 'feat_724',
 'feat_736',
 'feat_769',
 'feat_808',
 'feat_829',
 'feat_867',
 'feat_920',
 'feat_956']

### PEARSON CORRELATION MASKING technique returns the following features that corroborate with both SelectKBest and SelectFromModel:

504, 681, 701, 769, 808

Note: All of the above features were also confirmed in the first and second samples