In [1]:
import pandas as pd
import numpy as np

In [2]:
# 读取全部数据
train_data = pd.read_csv('./dataset/train_all.csv', nrows=None)
test_data = pd.read_csv('./dataset/test_all.csv', nrows=None)

In [3]:
feature_columns = [col for col in train_data.columns if col not in ['user_id', 'label']]
train = train_data[feature_columns].values
test = test_data[feature_columns].values
target = train_data['label'].values

In [4]:
# 缺失值补全
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer = imputer.fit(train)
train_imputer = imputer.transform(train)
test_imputer = imputer.transform(test)

### 特征选择

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def feature_selection(train, train_sel, target):
    clf = RandomForestClassifier(n_estimators=100,
                                 max_depth=2,
                                 random_state=0,
                                 n_jobs=-1)
    scores = cross_val_score(clf, train, target, cv=5)
    scores_sel = cross_val_score(clf, train_sel, target, cv=5)

    print('No select accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
    print('Feature select accuracy: %0.2f (+/- %0.2f)' % (scores_sel.mean(), scores_sel.std() * 2))

In [7]:
# 1. 删除方差较小的特征
# VarianceThreshold 是一种简单的基线特征选择方法，会删除方差不符合某个阈值的所有特征。
# 在默认情况下，他会删除所有零方差特征，即在所有样本中具有相同值的特征。
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel = sel.fit(train)
train_sel = sel.transform(train)
test_sel = sel.transform(test)
print('训练集数据未特征筛选的维度：', train.shape)
print('训练集数据特征筛选后的维度：', train_sel.shape)

训练集数据未特征筛选的维度： (2000, 227)
训练集数据特征筛选后的维度： (2000, 23)


In [8]:
feature_selection(train, train_sel, target)

No select accuracy: 0.94 (+/- 0.00)
Feature select accuracy: 0.94 (+/- 0.00)


In [10]:
# 2. 单变量特征选择
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
sel = SelectKBest(mutual_info_classif, k=2)
sel = sel.fit(train, target)
train_sel = sel.transform(train)
test_sel = sel.transform(test)
print('训练集数据未特征筛选的维度：', train.shape)
print('训练集数据特征筛选后的维度：', train_sel.shape)

训练集数据未特征筛选的维度： (2000, 227)
训练集数据特征筛选后的维度： (2000, 2)


In [11]:
feature_selection(train, train_sel, target)

No select accuracy: 0.94 (+/- 0.00)
Feature select accuracy: 0.94 (+/- 0.00)


In [13]:
# 3. 递归功能消除
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10,
                             max_depth=2,
                             random_state=0,
                             n_jobs=-1)
selector = RFECV(clf, step=1, cv=2)
selector = selector.fit(train, target)
# 特征较多不方便打印
print(selector.support_)
print(selector.ranking_)

KeyboardInterrupt: 

In [15]:
# 4. 使用模型选择特征
# 使用LR拟合的参数进行变量选择，即L2范数进行特征选择，筛选对回归目标影响大的特征.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
normalizer = normalizer.fit(train)
train_norm = normalizer.transform(train)
test_norm = normalizer.transform(test)

LR = LogisticRegression(penalty='l2', C=5)
LR = LR.fit(train_norm, target)
model = SelectFromModel(LR, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('训练集数据未特征筛选的维度：', train.shape)
print('训练集数据特征筛选后的维度：', train_sel.shape)

训练集数据未特征筛选的维度： (2000, 227)
训练集数据特征筛选后的维度： (2000, 16)


In [16]:
# L2 范数选择参数：
LR.coef_[0][:10]

array([ 1.37163092e-01, -6.98754950e-03,  2.40857827e-04,  6.65627466e-01,
       -5.63401443e-01, -3.74640510e-02, -5.09345986e-01,  2.42581840e-01,
       -5.52420691e-02,  3.42213126e-03])

In [17]:
feature_selection(train, train_sel, target)

No select accuracy: 0.94 (+/- 0.00)
Feature select accuracy: 0.94 (+/- 0.00)


In [21]:
# 使用L1 范数进行特征选择。同样，LR模型采用拟合参数的形式进行变量选择，筛选对回归目标影响大对特征。
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
normalizer = normalizer.fit(train)

train_norm = normalizer.transform(train)
test_norm = normalizer.transform(test)

LR = LogisticRegression(penalty='l1', solver='liblinear', C=5)
LR = LR.fit(train_norm, target)
model = SelectFromModel(LR, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('训练集数据未特征筛选的维度：', train.shape)
print('训练集数据特征筛选后的维度：', train_sel.shape)

训练集数据未特征筛选的维度： (2000, 227)
训练集数据特征筛选后的维度： (2000, 5)


In [22]:
LR.coef_[0][:10]

array([0.       , 0.       , 0.       , 0.3234966, 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       ])

In [23]:
feature_selection(train, train_sel, target)

No select accuracy: 0.94 (+/- 0.00)
Feature select accuracy: 0.94 (+/- 0.00)


In [24]:
# 5. 基于树模型的特征选择
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(train, target)
model = SelectFromModel(clf, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('训练集数据未特征筛选的维度：', train.shape)
print('训练集数据特征筛选后的维度：', train_sel.shape)

训练集数据未特征筛选的维度： (2000, 227)
训练集数据特征筛选后的维度： (2000, 91)


In [25]:
clf.feature_importances_[:10]

array([0.07908459, 0.00582476, 0.00329156, 0.00427817, 0.00286365,
       0.00512337, 0.00453403, 0.00470917, 0.00450408, 0.00548654])

In [26]:
feature_selection(train, train_sel, target)

No select accuracy: 0.94 (+/- 0.00)
Feature select accuracy: 0.94 (+/- 0.00)


In [27]:
# 6. 基于LGB模型的特征选择
import lightgbm
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.4, random_state=0)

clf = lightgbm
train_matrix = clf.Dataset(X_train, label=y_train)
test_matrix = clf.Dataset(X_test, label=y_test)

params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'min_child_weight': 1.5,
    'num_leaves': 2**5,
    'lambda_l2': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'learning_rate': 0.03,
    'tree_method': 'exact',
    'seed': 2021,
    'num_class': 2,
    'silent': True
}
num_round = 10000
early_stopping_rounds = 100
model = clf.train(params,
                  train_matrix,
                  num_round,
                  valid_sets=test_matrix,
                  early_stopping_rounds=early_stopping_rounds)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30834
[LightGBM] [Info] Number of data points in the train set: 1200, number of used features: 220
[LightGBM] [Info] Start training from score -0.060989
[LightGBM] [Info] Start training from score -2.827397
[1]	valid_0's multi_logloss: 0.261534
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.261699
[3]	valid_0's multi_logloss: 0.26197
[4]	valid_0's multi_logloss: 0.261935
[5]	valid_0's multi_logloss: 0.262027
[6]	valid_0's multi_logloss: 0.262278
[7]	valid_0's multi_logloss: 0.262326
[8]	valid_0's multi_logloss: 0.262415
[9]	valid_0's multi_logloss: 0.262906
[10]	valid_0's multi_logloss: 0.263175
[11]	valid_0's multi_logloss: 0.263492
[12]	valid_0's multi_logloss: 0.263893
[13]	valid_0's multi_logloss: 0.263872
[14]	valid_0's multi_logloss: 0.264125
[15]	valid_0's multi_logloss: 0.264561
[16]	valid_0's multi_logloss: 0.264738
[17]	valid_0's multi_logloss: 

In [29]:
def lgb_transform(train, test, model, topK):
    train_df = pd.DataFrame(train)
    train_df.columns = range(train.shape[1])

    test_df = pd.DataFrame(test)
    test_df.columns = range(test.shape[1])

    features_import = pd.DataFrame()
    features_import['importance'] = model.feature_importance()
    features_import['col'] = range(train.shape[1])
    features_import = features_import.sort_values(['importance'], ascending=0).head(topK)
    sel_col = list(features_import.col)

    train_sel = train_df[sel_col]
    test_sel = test_df[sel_col]
    return train_sel, test_sel
train_sel, test_sel = lgb_transform(train,test, model, 20)
print('训练集数据未特征筛选的维度：', train.shape)
print('训练集数据特征筛选后的维度：', train_sel.shape)

训练集数据未特征筛选的维度： (2000, 227)
训练集数据特征筛选后的维度： (2000, 20)


In [30]:
model.feature_importance()[:10]

array([1, 0, 0, 0, 0, 0, 1, 2, 0, 0], dtype=int32)

In [31]:
feature_selection(train, train_sel, target)

No select accuracy: 0.94 (+/- 0.00)
Feature select accuracy: 0.94 (+/- 0.00)
