In [1]:
import pandas as pd
import numpy as np

# Load a capped number of rows from each CSV: at most 10000 from the
# training file and at most 100 from the test file.
train_data = pd.read_csv(
    "data/train_all.csv",
    nrows=10000,
)
test_data = pd.read_csv(
    "data/test_all.csv",
    nrows=100,
)

In [2]:
# Model inputs are every training column except 'user_id' and 'label'.
features_columns = [
    name
    for name in train_data.columns
    if name not in ('user_id', 'label')
]

# Both feature matrices are built from the same column list, so their
# columns line up; the 'label' column of the training frame is the target.
train = train_data[features_columns].values
test = test_data[features_columns].values
target = train_data['label'].values

In [4]:
from sklearn.impute import SimpleImputer
# Fit the median imputer on the training matrix only, then apply that same
# fitted imputer to both splits so the test split is filled with statistics
# learned from training data.
imputer = SimpleImputer(strategy='median').fit(train)
train_imputer = imputer.transform(train)
test_imputer = imputer.transform(test)

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def feature_selection(train, train_sel, target):
    """Print cross-validated scores for the full and the reduced feature matrix.

    The same random-forest configuration (100 trees, depth 2, fixed seed)
    is scored with 5-fold cross-validation on each matrix, and one summary
    line per matrix is printed: the mean fold score followed by twice the
    standard deviation of the fold scores.

    Args:
        train: Feature matrix before feature selection.
        train_sel: Feature matrix after feature selection.
        target: Labels passed to cross-validation for both matrices.

    Returns:
        None. The summary is written to standard output.
    """
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=2, random_state=0, n_jobs=-1
    )

    # Score both matrices before printing anything, so a failure on either
    # one produces no partial report.
    fold_scores = [
        cross_val_score(forest, matrix, target, cv=5)
        for matrix in (train, train_sel)
    ]

    templates = (
        "No select Accuracy: %0.2f (+/-%0.2f)",
        "Features select Accuracy: %0.2f (+/-%0.2f)",
    )
    for template, per_fold in zip(templates, fold_scores):
        print(template % (per_fold.mean(), per_fold.std() * 2))

In [10]:
# Remove features with low variance
from sklearn.feature_selection import VarianceThreshold

# Fit the variance filter on the training matrix and apply the resulting
# column mask to both splits. The cutoff is written as p * (1 - p) with
# p = 0.8 (about 0.16).
# NOTE(review): this filters the raw `train`/`test` matrices, not the
# `train_imputer`/`test_imputer` arrays computed earlier — confirm that
# is intended.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
train_sel = sel.fit_transform(train)
test_sel = sel.transform(test)

# Report how many columns survive the filter.
print("No select shape, ", train.shape)
print("select shape, ", train_sel.shape)

No select shape,  (2000, 229)
select shape,  (2000, 24)


In [9]:
# Compare cross-validated scores on the full matrix and on the
# variance-filtered matrix.
feature_selection(train=train, train_sel=train_sel, target=target)

No select Accuracy: 0.94 (+/-0.00)
Features select Accuracy: 0.94 (+/-0.00)
