In [1]:
import sklearn.feature_selection as fs
import numpy as np 

# Toy binary matrix: 6 samples x 3 features.
X = np.array([
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
])

# Filter features by variance with a 0.2 threshold; fitting and transforming
# in one call leaves `var` fitted exactly as a separate fit() would.
var = fs.VarianceThreshold(threshold=0.2)
X_trans = var.fit_transform(X)

# Show the matrix before and after the filter, each under its own heading.
print("The original data", X, sep="\n")
print("The processed data by variance threshold", X_trans, sep="\n")

The original data
[[0 0 1]
 [0 1 0]
 [1 0 0]
 [0 1 1]
 [0 1 0]
 [0 1 1]]
The processed data by variance threshold
[[0 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]]


In [2]:
import sklearn.datasets as datasets

# Synthetic classification data: 300 samples, 10 features, 4 informative.
# A fixed seed keeps the generated data identical across kernel restarts,
# matching the seeded datasets used in the other cells.
X, y = datasets.make_classification(n_samples=300,
                                    n_features=10,
                                    n_informative=4,
                                    random_state=42)

# Score features with the ANOVA F-test (f_classif) and keep the 3 best.
# `k` must be passed by keyword: scikit-learn >= 1.0 rejects it positionally.
bk = fs.SelectKBest(fs.f_classif, k=3)
bk.fit(X, y)
X_trans = bk.transform(X)




In [4]:
import sklearn.feature_selection as fs
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Synthetic classification data: 500 samples, 20 features, 8 informative.
X, y = datasets.make_classification(n_samples=500,
                                    n_features=20,
                                    n_informative=8,
                                    random_state=42)

# Split once, BEFORE any feature selection. Fitting the selector on the full
# data set would let the held-out labels influence which features are kept
# (data leakage) and make the reported F1 scores optimistic. The split only
# depends on the number of rows and the seed, so hoisting it out of the loop
# yields the same train/test rows for every k.
train_x, test_x, train_y, test_y = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# Candidate numbers of features to keep; shared by the sweep and the plot.
k_values = range(1, 15)

f1_list = []
for k in k_values:
    # `k` must be passed by keyword: scikit-learn >= 1.0 rejects it positionally.
    bk = fs.SelectKBest(fs.f_classif, k=k)
    # Learn the feature ranking from the training rows only, then apply the
    # same column mask to the test rows.
    train_x_sel = bk.fit_transform(train_x, train_y)
    test_x_sel = bk.transform(test_x)

    lr = LogisticRegression()
    lr.fit(train_x_sel, train_y)
    y_pred = lr.predict(test_x_sel)
    f1 = metrics.f1_score(test_y, y_pred)
    f1_list.append(f1)

# Plot held-out F1 against the number of selected features and save to disk.
fig, axe = plt.subplots(dpi = 300)
axe.plot(k_values, f1_list)
axe.set_xlabel("best k features")
axe.set_ylabel("F1-score")
fig.savefig("../data/fe_img.png")
# Release the figure so it is not kept alive (or re-rendered) by the kernel.
plt.close(fig)



In [5]:
import sklearn.feature_selection as fs
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as metrics

# Synthetic classification data: 500 samples, 20 features, 6 informative.
X, y = datasets.make_classification(
    n_samples=500, n_features=20, n_informative=6, random_state=21
)

# Train a 20-tree gradient-boosting classifier; fit() returns the estimator,
# so construction and training can be chained.
gb = GradientBoostingClassifier(n_estimators=20).fit(X, y)
print("The feature importances of GBDT", gb.feature_importances_, sep="\n")

# Wrap the already-fitted model (prefit=True) as a feature selector. No
# explicit threshold is given, so the library default decides which
# features are kept.
model = fs.SelectFromModel(gb, prefit=True)
X_trans = model.transform(X)

# Report how many columns survive the selection.
original_shape, transformed_shape = X.shape, X_trans.shape
print("The shape of original data is {}".format(original_shape))
print("The shape of transformed data is {}".format(transformed_shape))

The feature importances of GBDT
[0.         0.00493847 0.         0.00910409 0.         0.13488926
 0.1601943  0.         0.00134611 0.05031481 0.04073724 0.04862839
 0.0078042  0.         0.005109   0.         0.53693415 0.
 0.         0.        ]
The shape of original data is (500, 20)
The shape of transformed data is (500, 4)
