In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/train_all.csv", nrows=10000)
test_data = pd.read_csv("data/test_all.csv", nrows=100)

In [2]:
features_columns = [col for col in train_data.columns if col not in ['user_id', 'label']]
train = train_data[features_columns].values
test = test_data[features_columns].values
target = train_data['label'].values

In [4]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer.fit(train)
train_imputer = imputer.transform(train)
test_imputer = imputer.transform(test)

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def feature_selection(train, train_sel, target):
    clf = RandomForestClassifier(n_estimators=100,
                                 max_depth=2,
                                 random_state=0,
                                 n_jobs=-1)
    scores = cross_val_score(clf, train, target, cv=5)
    scores_sel = cross_val_score(clf, train_sel, target, cv=5)

    print("No select Accuracy: %0.2f (+/-%0.2f)" %
          (scores.mean(), scores.std()*2))
    print("Features select Accuracy: %0.2f (+/-%0.2f)" %
          (scores_sel.mean(), scores_sel.std()*2))

In [10]:
# 删除方差较小的特征
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(.8 * (1-.8)))
sel = sel.fit(train)
train_sel = sel.transform(train)
test_sel = sel.transform(test)
print("No select shape, ", train.shape)
print("select shape, ", train_sel.shape)

No select shape,  (2000, 229)
select shape,  (2000, 24)


In [9]:
feature_selection(train, train_sel, target)

No select Accuracy: 0.94 (+/-0.00)
Features select Accuracy: 0.94 (+/-0.00)


In [11]:
# 单变量特征选择（基于单变量统计检验）
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

sel = SelectKBest(mutual_info_classif, k=2)
sel = sel.fit(train, target)
train_sel = sel.transform(train)
test_sel = sel.transform(test)
print("No select shape, ", train.shape)
print("select shape, ", train_sel.shape)

No select shape,  (2000, 229)
select shape,  (2000, 2)


In [12]:
feature_selection(train, train_sel, target)

No select Accuracy: 0.94 (+/-0.00)
Features select Accuracy: 0.94 (+/-0.00)


In [13]:
# 递归功能消除（每次把评分低的特征去除）
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10,
                            max_depth=2,
                            random_state=0,
                            n_jobs=-1)
selector = RFECV(clf, step=1, cv=2)
selector = selector.fit(train, target)
train_sel = selector.transform(train)
test_sel = selector.transform(test)
print("No select shape, ", train.shape)
print("select shape, ", train_sel.shape)

No select shape,  (2000, 229)
select shape,  (2000, 1)


In [14]:
feature_selection(train, train_sel, target)

No select Accuracy: 0.94 (+/-0.00)
Features select Accuracy: 0.94 (+/-0.00)


In [15]:
# 使用模型选择特征
# 利用LR L2范数
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
normalizer = normalizer.fit(train)
train_norm = normalizer.transform(train)
test_norm = normalizer.transform(test)

LR = LogisticRegression(penalty='l2', C=5)
LR = LR.fit(train_norm, target)
model = SelectFromModel(LR, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print("No select shape, ", train.shape)
print("select shape, ", train_sel.shape)

No select shape,  (2000, 229)
select shape,  (2000, 19)


In [16]:
LR.coef_[0][:10]

array([ 0.17120186, -0.00894058,  0.00097069,  0.47386776, -0.57425936,
       -0.04520622, -0.53460298,  0.15390687, -0.05802753,  0.00405779])

In [17]:
feature_selection(train, train_sel, target)

No select Accuracy: 0.94 (+/-0.00)
Features select Accuracy: 0.94 (+/-0.00)


In [20]:
# 基于树模型
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(train, target)

model = SelectFromModel(clf, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print("No select shape, ", train.shape)
print("select shape, ", train_sel.shape)

No select shape,  (2000, 229)
select shape,  (2000, 58)


In [21]:
feature_selection(train, train_sel, target)

No select Accuracy: 0.94 (+/-0.00)
Features select Accuracy: 0.94 (+/-0.00)


In [23]:
clf.feature_importances_[:10]

array([0.07270456, 0.01945938, 0.01229773, 0.01658408, 0.01875145,
       0.01560426, 0.02044343, 0.01840991, 0.01523613, 0.01053317])

In [25]:
import lightgbm
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.4, random_state=0
)

clf = lightgbm
train_matrix = clf.Dataset(X_train, label=y_train)
test_matrix = clf.Dataset(X_test, label=y_test)
params = {
          'boosting_type': 'gbdt',
          #'boosting_type': 'dart',
          'objective': 'multiclass',
          'metric': 'multi_logloss',
          'min_child_weight': 1.5,
          'num_leaves': 2**5,
          'lambda_l2': 10,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'learning_rate': 0.03,
          'tree_method': 'exact',
          'seed': 2017,
          "num_class": 2,
          'silent': True,
          }
num_round = 10000
early_stopping_rounds = 100
model = clf.train(params,
                  train_matrix,
                  num_round,
                  valid_sets=test_matrix,
                  early_stopping_rounds=early_stopping_rounds)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6016
[LightGBM] [Info] Number of data points in the train set: 1200, number of used features: 122
[LightGBM] [Info] Start training from score -0.060989
[LightGBM] [Info] Start training from score -2.827397
[1]	valid_0's multi_logloss: 0.261527
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.261457
[3]	valid_0's multi_logloss: 0.261348
[4]	valid_0's multi_logloss: 0.261564
[5]	valid_0's multi_logloss: 0.261454
[6]	valid_0's multi_logloss: 0.261396
[7]	valid_0's multi_logloss: 0.261591
[8]	valid_0's multi_logloss: 0.261887
[9]	valid_0's multi_logloss: 0.261977
[10]	valid_0's multi_logloss: 0.262117
[11]	valid_0's multi_logloss: 0.262126
[12]	valid_0's multi_logloss: 0.262056
[13]	valid_0's multi_logloss: 0.26229
[14]	valid_0's multi_logloss: 0.262298
[15]	valid_0's multi_logloss: 0.262397
[16]	valid_0's multi_logloss: 0.262582
[17]	valid_0's multi_logloss: 0

In [26]:
def lgb_transform(train, test, model, topK):
    train_df = pd.DataFrame(train)
    train_df.columns = range(train.shape[1])

    test_df = pd.DataFrame(test)
    test_df.columns = range(test.shape[1])

    features_import = pd.DataFrame()
    features_import['importance'] = model.feature_importance()
    features_import['col'] = range(train.shape[1])

    features_import = features_import.sort_values(['importance'],ascending=0).head(topK)
    sel_col = list(features_import.col)

    train_sel = train_df[sel_col]
    test_sel = test_df[sel_col]
    return train_sel, test_sel

In [27]:
train_sel, test_sel = lgb_transform(train, test, model, 20)
print('训练数据未特征筛选维度', train.shape)
print('训练数据特征筛选维度后', train_sel.shape)

训练数据未特征筛选维度 (2000, 229)
训练数据特征筛选维度后 (2000, 20)


In [28]:
model.feature_importance()[:10]

array([18,  4,  0, 11,  6, 11,  3,  3,  7,  0])

In [29]:
feature_selection(train, train_sel, target)

No select Accuracy: 0.94 (+/-0.00)
Features select Accuracy: 0.94 (+/-0.00)
