# 二手车价格预测
模型融合

# 数据探索

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
data_train = pd.read_csv('./used_car_train_20200313.csv', sep=' ')
data_submit = pd.read_csv('./used_car_testB_20200421.csv', sep=' ')

In [3]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             150000 non-null  int64  
 1   name               150000 non-null  int64  
 2   regDate            150000 non-null  int64  
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64  
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64  
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object 
 11  regionCode         150000 non-null  int64  
 12  seller             150000 non-null  int64  
 13  offerType          150000 non-null  int64  
 14  creatDate          150000 non-null  int64  
 15  price              150000 non-null  int64  
 16  v_

In [4]:
data_submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             50000 non-null  int64  
 1   name               50000 non-null  int64  
 2   regDate            50000 non-null  int64  
 3   model              50000 non-null  float64
 4   brand              50000 non-null  int64  
 5   bodyType           48496 non-null  float64
 6   fuelType           47076 non-null  float64
 7   gearbox            48032 non-null  float64
 8   power              50000 non-null  int64  
 9   kilometer          50000 non-null  float64
 10  notRepairedDamage  50000 non-null  object 
 11  regionCode         50000 non-null  int64  
 12  seller             50000 non-null  int64  
 13  offerType          50000 non-null  int64  
 14  creatDate          50000 non-null  int64  
 15  v_0                50000 non-null  float64
 16  v_1                500

In [5]:
# 变量类型
date_columns = ['creatDate', 'regDate']
useless_columns = ['SaleID', 'name', 'seller', 'offerType']
label_columns = ['price']
categorical_columns = ['brand', 'bodyType', 'fuelType', 'gearbox', 
                       'notRepairedDamage', 'model','regionCode']
numeric_columns = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 
                   'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 
                   'v_10', 'v_11', 'v_12','v_13', 'v_14']

In [6]:
# 训练集数据
features = data_train[numeric_columns + categorical_columns + date_columns + label_columns]
labels = data_train[label_columns]

In [7]:
# 测试集数据
sub_features = data_submit[numeric_columns + categorical_columns + date_columns]

# 数据清洗

In [8]:
# 非法值
features['notRepairedDamage'] = features['notRepairedDamage'].apply(lambda x: int(float(x)) if x != '-' else 0)
sub_features['notRepairedDamage'] = sub_features['notRepairedDamage'].apply(lambda x: int(float(x)) if x != '-' else 0)

In [9]:
# 缺失值(均值、众数补全)
null_columns = ['bodyType', 'fuelType', 'gearbox', 'model']
for col in null_columns:
    fill_value = features[col].mode().values[0]
    features.fillna(value={col: fill_value}, inplace=True)
    sub_features.fillna(value={col: fill_value}, inplace=True)

# 特征构造

In [10]:
# 时间特征
features['creatDate'] = pd.to_datetime(features['creatDate'], format='%Y%m%d')
features['regDate'] = features['regDate'].apply(lambda x: str(x))
features['regDate'] = features['regDate'].apply(lambda x: x[:4]+'01'+x[6:] if x[4:6] == '00' else x)
features['regDate'] = pd.to_datetime(features['regDate'], format='%Y%m%d')
features['new_date'] = features['creatDate'] - features['regDate']
features['new_date'] = features['new_date'].apply(lambda x: x.days)
features.drop(date_columns, axis=1, inplace=True)

sub_features['creatDate'] = pd.to_datetime(sub_features['creatDate'], format='%Y%m%d')
sub_features['regDate'] = sub_features['regDate'].apply(lambda x: str(x))
sub_features['regDate'] = sub_features['regDate'].apply(lambda x: x[:4]+'01'+x[6:] if x[4:6] == '00' else x)
sub_features['regDate'] = pd.to_datetime(sub_features['regDate'], format='%Y%m%d')
sub_features['new_date'] = sub_features['creatDate'] - sub_features['regDate']
sub_features['new_date'] = sub_features['new_date'].apply(lambda x: x.days)
sub_features.drop(date_columns, axis=1, inplace=True)

In [11]:
# 增加价格最大最小平均标准差
add_col = ['brand']
agg_method = ['max', 'min', 'mean', 'median']
for col in add_col:
    # 设置列名与获取列数据的字典形式
    col_name = ['{}_{}'.format(col, x) for x in agg_method]
    temp_dict = features.groupby(col)['price'].agg(agg_method).to_dict()
    
    # 建立列名
    features = pd.concat([features, pd.DataFrame(columns=col_name)], axis=1)    
    sub_features = pd.concat([sub_features, pd.DataFrame(columns=col_name)], axis=1)  
    
    # 填充数据
    for i, j in zip(col_name, temp_dict):
        features[i] = features[col].apply(lambda x: temp_dict[j][x])
        sub_features[i] = sub_features[col].apply(lambda x: temp_dict[j][x])

features.drop('price', axis=1, inplace=True)

In [12]:
# 独热编码
for col in ['brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage',]:
    # 训练独热编码
    onehot = OneHotEncoder()
    features_array = onehot.fit_transform(features[col].values.reshape(-1, 1))
    sub_features_array = onehot.transform(sub_features[col].values.reshape(-1, 1))
    
    # 列名设置
    col_name = ['{}_{}'.format(col,x) for x in range(features[col].nunique())]
    
    # 训练集填充
    df_temp = pd.DataFrame(features_array.toarray(), columns=col_name)
    features = pd.concat([features, df_temp], axis=1)
    features.drop(col, axis=1)
    
    # 测试集填充
    df_temp = pd.DataFrame(sub_features_array.toarray(), columns=col_name)
    sub_features = pd.concat([sub_features, df_temp], axis=1)
    sub_features.drop(col, axis=1)
    break

# 数据拆分

In [13]:
ss = StandardScaler()
x = ss.fit_transform(features)
y = np.log(labels)
sub_train = ss.transform(sub_features)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1234)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(120000, 69)
(120000, 1)
(30000, 69)
(30000, 1)


# 模型训练

In [14]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [20]:
# lightgbm
lgb = LGBMRegressor()
lgb.fit(x_train, y_train)
lgb_predict = np.exp(lgb.predict(x_test))
lgb_mae = mean_absolute_error(lgb_predict, np.exp(y_test))

# xgb
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
xgb_predict = np.exp(xgb.predict(x_test))
xgb_mae = mean_absolute_error(xgb_predict, np.exp(y_test))

print('lgb MAE', lgb_mae)
print('xgb MAE', xgb_mae)

lgb MAE 707.4735137234762
xgb MAE 661.6514194858869


In [21]:
# 设置权重，mae大的权重小
w_lgb = xgb_mae / (lgb_mae + xgb_mae)
w_xgb = lgb_mae / (lgb_mae + xgb_mae)

In [28]:
# 全量数据训练模型并通过模型融合预测
lgb_sub = LGBMRegressor()
lgb_sub.fit(x, y)
lgb_predict = np.exp(lgb.predict(sub_train))

xgb = XGBRegressor()
xgb.fit(x, y)
xgb_predict = np.exp(xgb.predict(sub_train))

result = lgb_predict * w_lgb + xgb_predict * w_xgb

In [30]:
# 结果保存
sub_csv = pd.read_csv('./used_car_sample_submit.csv')
sub_csv['price'] = result
sub_csv.to_csv('./xgb_lgb_model_predict.csv', index=False)
pd.read_csv('./xgb_lgb_model_predict.csv').head()

Unnamed: 0,SaleID,price
0,150000,1315.004431
1,150001,1857.325264
2,150002,7568.686097
3,150003,1087.552798
4,150004,2059.223986
