In [3]:
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_regression
import seaborn as sns
%matplotlib inline

## Importing and sampling the UCI data / basic EDA

In [4]:
df_train_data = pd.read_csv('madelon_train.data.csv', delimiter=' ', header=None).drop(500, axis=1)

In [5]:
df_train_labels = pd.read_csv('madelon_train.labels.csv', delimiter=' ', header=None)

In [6]:
train_data_10pct_1 = df_train_data.sample(200)

In [8]:
y = df_train_labels[0]

In [9]:
y_10pct_1 = y.sample(200)

In [10]:
df_train_data.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496
1,483,458,460,487,587,475,526,479,485,469,...,463,478,487,338,513,486,483,492,510,517
2,487,542,499,468,448,471,442,478,480,477,...,487,481,492,650,506,501,480,489,499,498
3,480,491,510,485,495,472,417,474,502,476,...,491,480,474,572,454,469,475,482,494,461


In [11]:
y_10pct_1.head(4)

525    -1
485     1
1735   -1
199     1
Name: 0, dtype: int64

In [13]:
train_data_10pct_1.shape, y_10pct_1.shape

((200, 500), (200,))

## SELECTKBEST for Feature Selection

Used Y Labels instead of dropping features

In [21]:
X_train, X_test, y_train, y_test = train_test_split(train_data_10pct_1,
                                                    y_10pct_1,
                                                    test_size = .3,
                                                    )

In [22]:
skb = SelectKBest(k=20)

skb.fit(X_train,y_train)

SelectKBest(k=20, score_func=<function f_classif at 0x7f93086c3400>)

In [23]:
skb_feats = np.where(skb.get_support())[0]
skb_feats

array([ 47, 100, 115, 173, 183, 189, 211, 234, 245, 249, 254, 279, 286,
       303, 345, 349, 370, 371, 400, 416])

## SELECT FROM MODEL for Feature Selection

Used Y Labels instead of dropping features

In [28]:
sfm = SelectFromModel(LogisticRegression(), threshold='mean')


In [29]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
sfm.fit(X_train_scaled, y_train)


SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        prefit=False, threshold='mean')

In [31]:
sfm_feats = np.where(sfm.get_support())[0]
sfm_feats

array([  1,   4,   8,  10,  11,  12,  13,  18,  21,  23,  28,  30,  31,
        32,  33,  34,  41,  42,  43,  45,  47,  48,  49,  51,  52,  53,
        54,  57,  58,  59,  60,  61,  63,  68,  69,  75,  78,  79,  83,
        85,  92,  96, 100, 103, 104, 107, 108, 109, 111, 113, 114, 115,
       116, 118, 121, 122, 123, 125, 127, 129, 131, 140, 142, 143, 146,
       147, 148, 155, 156, 157, 159, 163, 167, 168, 169, 171, 173, 175,
       176, 177, 181, 182, 183, 185, 186, 187, 189, 190, 192, 193, 197,
       199, 200, 201, 202, 203, 205, 206, 211, 215, 219, 220, 221, 223,
       224, 225, 226, 234, 236, 240, 241, 242, 243, 245, 246, 249, 251,
       254, 256, 257, 258, 262, 264, 268, 269, 270, 272, 273, 277, 278,
       279, 280, 284, 286, 287, 293, 294, 295, 296, 298, 299, 303, 304,
       308, 309, 311, 312, 313, 315, 317, 318, 324, 327, 328, 333, 334,
       337, 342, 344, 345, 349, 351, 352, 355, 356, 357, 358, 363, 368,
       369, 370, 371, 372, 374, 378, 379, 384, 389, 391, 395, 39

Note: All the features in the SelectFromModel technique exist in SelectKBest. 

## PEARSON CORRELATION MASKING For Feature Selection

In [32]:
corr_df = df_train_data.corr()

In [33]:
corrs = list(corr_df[corr_df[corr_df.abs() >.5].count() > 1].index)

In [34]:
corrs

[28,
 48,
 64,
 105,
 128,
 153,
 241,
 281,
 318,
 336,
 338,
 378,
 433,
 442,
 451,
 453,
 455,
 472,
 475,
 493]

In [35]:
imp_feat_df = df_train_data.iloc[:, corrs]

In [36]:
imp_feat_df.head()

Unnamed: 0,28,48,64,105,128,153,241,281,318,336,338,378,433,442,451,453,455,472,475,493
0,459,440,648,181,452,575,434,517,414,658,628,419,533,568,463,471,630,515,401,485
1,475,499,488,431,473,404,551,435,469,469,528,526,442,463,474,311,582,465,549,338
2,491,460,485,593,487,585,474,535,506,465,431,464,569,503,481,606,424,485,454,650
3,472,529,415,698,493,591,569,526,458,398,377,553,565,447,472,545,456,457,602,572
4,472,429,387,451,475,448,538,456,462,385,509,424,462,536,472,426,465,500,560,435


## JOSH'S METHOD WITH KNN For Feature Selection

In [46]:
def calculate_r_2_for_feature_with_KNN(data, feature):
    new_data = data.drop(feature, axis=1)

    X_train, \
    X_test,  \
    y_train, \
    y_test = train_test_split(
        new_data,data[feature],test_size=0.25
    )
    
#     scaler = StandardScaler()
#     scaler.fit(X_train, y_train)
#     X_tr_sc = scaler.transform(X_train, y_train)
#     X_ts_sc = scaler.transform(X_test, y_test)

    regressor = KNeighborsRegressor()
    regressor.fit(X_train, y_train)

    score = regressor.score(X_test, y_test)
    return score

In [47]:
def mean_r2_for_feature_knn(data, feature):
    scores = []
    for _ in range(100):
        scores.append(calculate_r_2_for_feature_with_KNN(data, feature))
        
    scores = np.array(scores)
    return scores.mean()

In [49]:
r2_knn_scores = []
r2_knn_means = []

for column in train_data_10pct_1.columns:
#     score_function_calculations = calculate_r_2_for_feature_with_KNN(train_data_10pct_1, column)
#     r2_knn_scores.append(score_function_calculations)
    score_function_means = mean_r2_for_feature_knn(train_data_10pct_1, column)
    r2_knn_means.append((column, score_function_means))
    print(column, score_function_means)

0 -0.245046979295
1 -0.238925676168
2 -0.195839297271
3 -0.207651775921
4 -0.290965546836
5 -0.186031961279
6 -0.216777881343
7 -0.22852013329
8 -0.221416913619
9 -0.22328209138
10 -0.318268885236
11 -0.219157795786
12 -0.178603747929
13 -0.245168102209
14 -0.239491599692
15 -0.256422742041
16 -0.293247965137
17 -0.289946144582
18 -0.270950427331
19 -0.209296180575
20 -0.242068306785
21 -0.361302534394
22 -0.252615732
23 -0.168248916448
24 -0.288563021175
25 -0.177480884715
26 -0.237122757349
27 -0.256043158178
28 0.496747259215
29 -0.2271373858
30 -0.234383160187
31 -0.323866301254
32 -0.22766524785
33 -0.243838775844
34 -0.177611404275
35 -0.197676629037
36 -0.393927081165
37 -0.289477672842
38 -0.256457505459
39 -0.202866432586
40 -0.174622036957
41 -0.229961633427
42 -0.231382713907
43 -0.292839034093
44 -0.135473120734
45 -0.260518894539
46 -0.271942387684
47 -0.201314072851
48 0.34547806554
49 -0.216840391025
50 -0.272817121185
51 -0.286391120442
52 -0.263805116557
53 -0.22970983

419 -0.216614244267
420 -0.197812214397
421 -0.0726315781183
422 -0.302908780955
423 -0.230106991028
424 -0.246172336024
425 -0.204131454895
426 -0.304711116094
427 -0.287979246557
428 -0.205795615942
429 -0.318337552124
430 -0.166955894222
431 -0.240398578975
432 -0.261686703119
433 0.798098582639
434 -0.169287524157
435 -0.199091267318
436 -0.441972723868
437 -0.165549468867
438 -0.319340552263
439 -0.260873112526
440 -0.287983412437
441 -0.264810872006
442 0.712780509012
443 -0.115768470997
444 -0.183097393634
445 -0.184623986647
446 -0.282303665309
447 -0.294685903823
448 -0.190612301519
449 -0.172727257191
450 -0.222374199754
451 0.499461060497
452 -0.141401135303
453 0.835594995345
454 -0.235244294889
455 0.807623712136
456 -0.213264400708
457 -0.206018246566
458 -0.207071577527
459 -0.18353907984
460 -0.220604481363
461 -0.163191493801
462 -0.348617201664
463 -0.240562510195
464 -0.163951298414
465 -0.17174593314
466 -0.330330649475
467 -0.187806612566
468 -0.194532830712
469 -0

In [50]:
print("{} {}".format("28: ", mean_r2_for_feature_knn(train_data_10pct_1, 28)))
print("{} {}".format("48: ", mean_r2_for_feature_knn(train_data_10pct_1, 48)))
print("{} {}".format("64: ", mean_r2_for_feature_knn(train_data_10pct_1, 64)))
print("{} {}".format("105: ", mean_r2_for_feature_knn(train_data_10pct_1, 105)))
print("{} {}".format("128: ", mean_r2_for_feature_knn(train_data_10pct_1, 128)))
print("{} {}".format("153: ", mean_r2_for_feature_knn(train_data_10pct_1, 153)))
print("{} {}".format("241: ", mean_r2_for_feature_knn(train_data_10pct_1, 241)))
print("{} {}".format("281: ", mean_r2_for_feature_knn(train_data_10pct_1, 281)))
print("{} {}".format("318: ", mean_r2_for_feature_knn(train_data_10pct_1, 318)))
print("{} {}".format("336: ", mean_r2_for_feature_knn(train_data_10pct_1, 336)))
print("{} {}".format("338: ", mean_r2_for_feature_knn(train_data_10pct_1, 338)))
print("{} {}".format("378: ", mean_r2_for_feature_knn(train_data_10pct_1, 378)))
print("{} {}".format("433: ", mean_r2_for_feature_knn(train_data_10pct_1, 433)))
print("{} {}".format("442: ", mean_r2_for_feature_knn(train_data_10pct_1, 442)))
print("{} {}".format("451: ", mean_r2_for_feature_knn(train_data_10pct_1, 451)))
print("{} {}".format("453: ", mean_r2_for_feature_knn(train_data_10pct_1, 453)))
print("{} {}".format("455: ", mean_r2_for_feature_knn(train_data_10pct_1, 455)))
print("{} {}".format("472: ", mean_r2_for_feature_knn(train_data_10pct_1, 472)))
print("{} {}".format("475: ", mean_r2_for_feature_knn(train_data_10pct_1, 475)))
print("{} {}".format("493: ", mean_r2_for_feature_knn(train_data_10pct_1, 493)))

28:  0.4970997525375522
48:  0.35864011297371406
64:  0.7673534545982738
105:  0.6884552637255332
128:  0.822643186968861
153:  0.7414818390725066
241:  0.7657560974466524
281:  0.807669537504913
318:  0.4280255542212325
336:  0.7471183291068563
338:  0.8282715333617031
378:  0.2552889342339038
433:  0.8043099619520632
442:  0.7048083922251225
451:  0.49965241847853037
453:  0.8429839060805597
455:  0.8106141167086496
472:  0.8001661139014282
475:  0.7463076827738361
493:  0.8316823985811691



The highest r2 scores returned using Josh's method corroborate with the Pearson Correlation Masking technique results. Both produce the same informative/redundant features. Other methods used such as SelectKBest and SelectFromModel did not produce the same features and were reliant on the labels in both the y_train and y_test, which makes them less reliable.