In [3]:

import pandas as pd
import numpy as np
from sklearn.feature_selection import (SelectKBest,
                                       SelectFpr,
                                       SelectFdr,
                                       SelectFwe,
                                       SelectPercentile,
                                       chi2, mutual_info_regression,
                                       f_classif)
from sklearn.decomposition import PCA

In [2]:
# Load the ARFF dataset. The context manager closes the file handle once
# parsing is done instead of leaving it open for the life of the kernel.
# NOTE(review): `arff` is not imported in any visible cell — presumably the
# liac-arff package (`import arff`); confirm and add it to the import cell.
with open('./pol.arff') as arff_file:
    data = arff.load(arff_file)

In [3]:
# The first element of every attribute entry is used as the column name.
attrs = [attr[0] for attr in data['attributes']]

# Build the frame from the parsed rows, then split features from the
# 'foo' column, which is used as the target everywhere below.
df = pd.DataFrame(data=data['data'], columns=attrs)
X, y = df.drop(columns=['foo']), df['foo']

In [4]:
# Persist the full frame (features and the 'foo' column) as CSV; the row
# index is not written.
df.to_csv('pol.csv', index=False)

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f40,f41,f42,f43,f44,f45,f46,f47,f48,foo
0,110.0,100.0,100.0,100.0,60.0,108.0,76.0,71.0,94.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,110.0,100.0,100.0,100.0,130.0,77.0,76.0,71.0,94.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,110.0,100.0,100.0,100.0,110.0,89.0,76.0,71.0,94.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110.0,100.0,100.0,100.0,13.0,126.0,89.0,72.0,94.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110.0,100.0,100.0,100.0,15.0,119.0,78.0,71.0,94.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
def getBest(cls, X, y, score_func=chi2, **kwargs):
    """Fit a feature selector and return the columns it keeps.

    Parameters
    ----------
    cls : selector class
        Instantiated as ``cls(score_func=score_func, **kwargs)``. The object
        returned by its ``fit(X, y)`` must provide
        ``get_support(indices=True)`` and a ``scores_`` attribute.
    X : DataFrame-like
        Feature table; must support positional ``.iloc[:, indices]``.
    y : array-like
        Target values, passed straight through to ``fit``.
    score_func : callable, default ``chi2``
        Scoring function handed to the selector.
    **kwargs
        Extra constructor arguments for ``cls`` (the call sites in this
        notebook pass ``k``, ``alpha`` or ``percentile``).

    Returns
    -------
    tuple
        ``(selected, ranked_scores)`` where ``selected`` is ``X`` restricted
        to the columns the selector kept, and ``ranked_scores`` is
        ``np.sort`` applied to the selector's ``scores_``.

    Notes
    -----
    ``ranked_scores`` holds one score per *input* feature (not only the
    selected ones) and is sorted ascending, so its positions no longer line
    up with the columns of ``X`` or of ``selected``. In the recorded outputs
    any NaN scores appear at the end of the sorted array.
    """
    selector = cls(score_func=score_func, **kwargs).fit(X, y)
    kept_columns = selector.get_support(indices=True)
    selected = X.iloc[:, kept_columns]
    ranked_scores = np.sort(selector.scores_)
    return selected, ranked_scores

In [7]:
# SelectKBest with k=20 and getBest's default score function (chi2).
# `scores` is rebound by every selection cell, so it only reflects the most
# recently executed one.
X_kbest, scores = getBest(SelectKBest, X, y, k=20)
print(X_kbest.columns)
print(scores)

Index(['f5', 'f6', 'f7', 'f13', 'f14', 'f17', 'f18', 'f19', 'f20', 'f21',
       'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f31', 'f32'],
      dtype='object')
[4.73689685e-28 9.41213885e-28 9.41213885e-28 9.41213885e-28
 2.07344314e+02 1.12310353e+03 1.48235614e+03 1.78823111e+03
 1.83720659e+03 1.84092215e+03 2.17391040e+03 2.42336316e+03
 2.89305615e+03 3.46633835e+03 3.65883050e+03 3.79672458e+03
 4.00323526e+03 4.31052678e+03 5.04738046e+03 5.41658773e+03
 6.85831130e+03 7.98744389e+03 9.81389266e+03 1.01448908e+04
 1.07506392e+04 1.62243918e+04 1.71386796e+04 1.88642809e+04
 1.97927348e+04 2.90857449e+04            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan]


In [8]:
# SelectFpr at alpha=0.01. Passing score_func=chi2 is the same as getBest's
# default; it is spelled out here only for explicitness.
# The recorded warning below comes from the selector comparing p-values
# against alpha.
X_fpr, scores = getBest(SelectFpr, X, y, score_func=chi2, alpha=0.01)
print(X_fpr.columns)
print(scores)

Index(['f5', 'f6', 'f7', 'f8', 'f9', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18',
       'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28',
       'f29', 'f30', 'f31', 'f32', 'f33'],
      dtype='object')
[4.73689685e-28 9.41213885e-28 9.41213885e-28 9.41213885e-28
 2.07344314e+02 1.12310353e+03 1.48235614e+03 1.78823111e+03
 1.83720659e+03 1.84092215e+03 2.17391040e+03 2.42336316e+03
 2.89305615e+03 3.46633835e+03 3.65883050e+03 3.79672458e+03
 4.00323526e+03 4.31052678e+03 5.04738046e+03 5.41658773e+03
 6.85831130e+03 7.98744389e+03 9.81389266e+03 1.01448908e+04
 1.07506392e+04 1.62243918e+04 1.71386796e+04 1.88642809e+04
 1.97927348e+04 2.90857449e+04            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan]


  return self.pvalues_ < self.alpha


In [9]:
# SelectFdr at alpha=0.01, again with chi2 (identical to getBest's default).
X_fdr, scores = getBest(SelectFdr, X, y, score_func=chi2, alpha=0.01)
print(X_fdr.columns)
print(scores)

Index(['f5', 'f6', 'f7', 'f8', 'f9', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18',
       'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28',
       'f29', 'f30', 'f31', 'f32', 'f33'],
      dtype='object')
[4.73689685e-28 9.41213885e-28 9.41213885e-28 9.41213885e-28
 2.07344314e+02 1.12310353e+03 1.48235614e+03 1.78823111e+03
 1.83720659e+03 1.84092215e+03 2.17391040e+03 2.42336316e+03
 2.89305615e+03 3.46633835e+03 3.65883050e+03 3.79672458e+03
 4.00323526e+03 4.31052678e+03 5.04738046e+03 5.41658773e+03
 6.85831130e+03 7.98744389e+03 9.81389266e+03 1.01448908e+04
 1.07506392e+04 1.62243918e+04 1.71386796e+04 1.88642809e+04
 1.97927348e+04 2.90857449e+04            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan]


  np.arange(1, n_features + 1)]
  return self.pvalues_ <= selected.max()


In [10]:
# SelectFwe with the selector's own default alpha and getBest's default
# score function (chi2).
X_fwe, scores = getBest(SelectFwe, X, y)
# Show the columns kept by *this* selector; the previous version printed
# X_fdr.columns, i.e. the result of the preceding cell.
print(X_fwe.columns)
print(scores)

Index(['f5', 'f6', 'f7', 'f8', 'f9', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18',
       'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28',
       'f29', 'f30', 'f31', 'f32', 'f33'],
      dtype='object')
[4.73689685e-28 9.41213885e-28 9.41213885e-28 9.41213885e-28
 2.07344314e+02 1.12310353e+03 1.48235614e+03 1.78823111e+03
 1.83720659e+03 1.84092215e+03 2.17391040e+03 2.42336316e+03
 2.89305615e+03 3.46633835e+03 3.65883050e+03 3.79672458e+03
 4.00323526e+03 4.31052678e+03 5.04738046e+03 5.41658773e+03
 6.85831130e+03 7.98744389e+03 9.81389266e+03 1.01448908e+04
 1.07506392e+04 1.62243918e+04 1.71386796e+04 1.88642809e+04
 1.97927348e+04 2.90857449e+04            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan]


  return (self.pvalues_ < self.alpha / len(self.pvalues_))


In [11]:
# SelectPercentile with percentile=50 and getBest's default score function
# (chi2).
X_percentile, scores = getBest(SelectPercentile, X, y, percentile=50)
print(X_percentile.columns)
print(scores)

Index(['f5', 'f6', 'f7', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19',
       'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29',
       'f30', 'f31', 'f32', 'f33'],
      dtype='object')
[4.73689685e-28 9.41213885e-28 9.41213885e-28 9.41213885e-28
 2.07344314e+02 1.12310353e+03 1.48235614e+03 1.78823111e+03
 1.83720659e+03 1.84092215e+03 2.17391040e+03 2.42336316e+03
 2.89305615e+03 3.46633835e+03 3.65883050e+03 3.79672458e+03
 4.00323526e+03 4.31052678e+03 5.04738046e+03 5.41658773e+03
 6.85831130e+03 7.98744389e+03 9.81389266e+03 1.01448908e+04
 1.07506392e+04 1.62243918e+04 1.71386796e+04 1.88642809e+04
 1.97927348e+04 2.90857449e+04            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan
            nan            nan            nan            nan]


In [4]:
# Fit PCA on the feature table. 0.95 is PCA's first argument, n_components,
# named explicitly here.
# This cell needs `X` from the data-loading cell; the recorded NameError
# comes from running it in a kernel where `X` had not been defined yet.
pca = PCA(n_components=0.95)
pca.fit(X)
print(pca.explained_variance_)

NameError: name 'X' is not defined