### 4.4.1 读取数据

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# pd.set_option('display.max_columns', None)
sns.set()
import warnings
warnings.filterwarnings('ignore')

reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
treeData = reduce_mem_usage(pd.read_csv('data_for_tree.csv'))

Memory usage of dataframe is 280000128.00 MB
Memory usage after optimization is: 69998696.00 MB
Decreased by 75.0%


In [4]:
X_train = treeData[treeData['price'].notnull()].drop('price', axis=1)
y_train = treeData['price'].dropna()
X_test = treeData[~treeData['price'].notnull()].drop('price', axis=1)
y_test = X_test[['SaleID']].copy()

print(X_train.shape)
X_train.head()

(150000, 174)


Unnamed: 0,SaleID,bodyType,brand,creatDate,fuelType,gearbox,kilometer,model,name,notRepairedDamage,...,city_in_model_prop,brand_target_mean,brand_target_std,brand_target_mad,model_target_mean,model_target_std,model_target_mad,city_target_mean,city_target_std,city_target_mad
0,0,1.0,6,2016-04-04,0.0,0.0,12.5,30.0,736,0.0,...,0.217773,3614.0,4700.0,3198.0,2774.0,3040.0,2298.0,6028.0,7736.0,5096.0
1,1,2.0,1,2016-03-09,0.0,0.0,15.0,40.0,2262,0.0,...,0.109131,9240.0,9360.0,6904.0,6768.0,6268.0,4724.0,5952.0,7516.0,5044.0
2,2,1.0,15,2016-04-02,0.0,0.0,12.5,115.0,14874,0.0,...,0.192993,9824.0,5332.0,4284.0,10904.0,5740.0,4744.0,5804.0,7264.0,4868.0
3,3,0.0,10,2016-03-12,0.0,1.0,15.0,109.0,71865,0.0,...,0.240112,8472.0,9048.0,6464.0,12848.0,12624.0,8920.0,5768.0,7248.0,4832.0
4,4,1.0,5,2016-03-13,0.0,0.0,5.0,110.0,111080,0.0,...,0.049103,3298.0,3350.0,2318.0,1573.0,1536.0,1246.0,6156.0,8068.0,5308.0


In [14]:
y_train_log = np.log1p(y_train)

In [6]:
# 将特征进行分类
cate_cols = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'city', 'regionCode']
num_cols = ['power_bin', 'kilometer', 'used_time'] + ['v_{}'.format(i) for i in range(15)]
date_cols = ['regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek', 'creatDate_year', 'creatDate_month', 'creatDate_day', 'creatDate_dayofweek']
cols = X_train.columns.tolist()

In [7]:
# 丢掉 ['v_1', 'v_3', 'v_5'] 进行训练，因为共线和与其他匿名特征高度相关
X_train.drop(['v_1', 'v_3', 'v_5'], axis=1, inplace=True)
X_test.drop(['v_1', 'v_3', 'v_5'], axis=1, inplace=True)

In [8]:
# 依次丢掉 ['v_1', 'v_3', 'v_5'] 进行训练
X_train.drop(date_cols, axis=1, inplace=True)
X_test.drop(date_cols, axis=1, inplace=True)

#### 4.4.4  模型调参

在此我们介绍了三种常用的调参方法如下：

  - 贪心算法 https://www.jianshu.com/p/ab89df9759c8
  - 网格调参 https://blog.csdn.net/weixin_43172660/article/details/83032029
  - 贝叶斯调参 https://blog.csdn.net/linxid/article/details/81189154

In [10]:
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor

import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor

from sklearn.model_selection import GridSearchCV

#### 4.4.4 - 2 Grid Search 调参

In [12]:
# 设定初始参数
params = {    
          'boosting_type': 'gbdt',
          'objective': 'mae',
          'metric': 'l1',
          'nthread': -1,
          'learning_rate': 0.2,
          'num_leaves': 63, 
          'max_depth': 6,   
          'subsample': 0.8, 
          'colsample_bytree': 0.8, 
}

#### 第一步：学习率和迭代次数

In [16]:
data_train = lgb.Dataset(X_train, y_train_log)
cv_results = lgb.cv(params, data_train, num_boost_round=3000, nfold=5, stratified=False, shuffle=True, metrics='l1', categorical_feature=cate_cols, early_stopping_rounds=50,seed=0)
print('best n_estimators:', len(cv_results['l1-mean']))
print('best cv score:', pd.Series(cv_results['l1-mean']).max())

best n_estimators: 2998
best cv score: 0.8185372109174794


### 第二步：确定max_depth和num_leaves

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

In [None]:
estimator = LGBMRegressor(boosting_type='gbdt',
                          objective='mae',
                          metrics='l1',
                          learning_rate=0.2,
                          n_estimators=2998,
                          max_depth=6,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          gpu_platform_id = 0,
                          gpu_device_id = 0,
                          random_state=2020)

params_test1={'max_depth': range(5, 8, 1), 'num_leaves':range(10, 100, 10)}
              
gsearch1 = GridSearchCV(estimator, param_grid=params_test1, scoring=make_scorer(mean_absolute_error), cv=5, n_jobs=-1)
gsearch1.fit(X_train,y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_