In [98]:
import pandas as pd
import datetime
import csv
import numpy as np
import os
import scipy as sp
import xgboost as xgb
import itertools
import operator
import warnings
warnings.filterwarnings("ignore")
from scipy import stats 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from matplotlib import pylab as plt
plot = True

goal = 'price'
myid = 'SaleID'


# 导入数据

In [205]:
train = pd.read_csv('../data/used_car_train_20200313.csv',sep=' ', parse_dates=['regDate', 'creatDate'])
train.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [180]:
testA = pd.read_csv('../data/used_car_testA_20200313.csv', sep=' ', parse_dates=['regDate', 'creatDate'])
testA.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,150000,66932,20111212,222.0,4,5.0,1.0,1.0,313,15.0,...,0.264405,0.1218,0.070899,0.106558,0.078867,-7.050969,-0.854626,4.800151,0.620011,-3.664654
1,150001,174960,19990211,19.0,21,0.0,0.0,0.0,75,12.5,...,0.261745,0.0,0.096733,0.013705,0.052383,3.679418,-0.729039,-3.796107,-1.54123,-0.757055
2,150002,5356,20090304,82.0,21,0.0,0.0,0.0,109,7.0,...,0.260216,0.112081,0.078082,0.062078,0.05054,-4.92669,1.001106,0.826562,0.138226,0.754033
3,150003,50688,20100405,0.0,0,0.0,0.0,1.0,160,7.0,...,0.260466,0.106727,0.081146,0.075971,0.048268,-4.864637,0.505493,1.870379,0.366038,1.312775
4,150004,161428,19970703,26.0,14,2.0,0.0,0.0,75,15.0,...,0.250999,0.0,0.077806,0.0286,0.081709,3.616475,-0.673236,-3.197685,-0.025678,-0.10129


# 数据加载

In [210]:
def load_data():
    """
        加载数据，设定数值型和非数值型数据
    """
    train = pd.read_csv('../data/used_car_train_20200313.csv',sep=' ', parse_dates=['regDate', 'creatDate'])
    test = pd.read_csv('../data/used_car_testA_20200313.csv', sep=' ', parse_dates=['regDate', 'creatDate'])
    train.drop(['regDate', 'creatDate', 'notRepairedDamage'], axis=1, inplace=True)
    test.drop(['regDate', 'creatDate', 'notRepairedDamage'], axis=1, inplace=True)
#     train.drop(['regDate', 'creatDate', 'notRepairedDamage', 'bodyType', 'fuelType', 'gearbox'], axis=1, inplace=True)
#     test.drop(['regDate', 'creatDate', 'notRepairedDamage', 'bodyType', 'fuelType', 'gearbox'], axis=1, inplace=True)
    
    train.dropna(axis=0, how='any', inplace=True)
    test.dropna(axis=0, how='any', inplace=True)
    features = test.columns.tolist()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features_numeric = test.select_dtypes(include=numerics).columns.tolist()
    features_non_numeric = [f for f in features if f not in features_numeric]
    return (train,test,features,features_non_numeric)

# 定义一些变换和评判准则

In [211]:
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w
 
def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe
 
def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

# 训练与分析

In [212]:
def XGB_native(train,test,features,features_non_numeric):
    depth = 13
    eta = 0.01
    ntrees = 10
    mcw = 3
    params = {"objective": "reg:linear",
              "booster": "gbtree",
              "eta": eta,
              "max_depth": depth,
              "min_child_weight": mcw,
              "subsample": 0.9,
              "colsample_bytree": 0.7,
              "silent": 1,
              "eval_metric":"mae"
              }
    print("Running with params: " + str(params))
    print("Running with ntrees: " + str(ntrees))
    print("Running with features: " + str(features))
 
    # Train model with local split
    tsize = 0.05
    X_train, X_test = train_test_split(train, test_size=tsize)
    dtrain = xgb.DMatrix(X_train[features], np.log(X_train[goal] + 1))
    dvalid = xgb.DMatrix(X_test[features], np.log(X_test[goal] + 1))
    watchlist = [(dvalid, 'vali-data'), (dtrain, 'train-data')]
    gbm = xgb.train(params, dtrain, ntrees, evals=watchlist, early_stopping_rounds=20, feval=rmspe_xg, verbose_eval=True)
    train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
#     print(train_probs)
    indices = train_probs < 0
    train_probs[indices] = 0
    error = rmspe(np.exp(train_probs) - 1, X_test[goal].values)
    print(error)
    

#     # Predict and Export
#     test_probs = gbm.predict(xgb.DMatrix(test[features]))
#     indices = test_probs < 0
#     test_probs[indices] = 0
#     submission = pd.DataFrame({myid: test[myid], goal: np.int64(np.exp(test_probs) - 1)})
#     if not os.path.exists('result/'):
#         os.makedirs('result/')
#     submission.to_csv("./result/dat-xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.csv" % (str(depth),str(eta),str(ntrees),str(mcw),str(tsize)) , index=False)
#     # Feature importance
#     if plot:
#         outfile = open('xgb.fmap', 'w')
#         i = 0
#         for feat in features:
#             outfile.write('{0}\t{1}\tq\n'.format(i, feat))
#             i = i + 1
#         outfile.close()
#         importance = gbm.get_fscore(fmap='xgb.fmap')
#         importance = sorted(importance.items(), key=operator.itemgetter(1))
#         df = pd.DataFrame(importance, columns=['feature', 'fscore'])
#         df['fscore'] = df['fscore'] / df['fscore'].sum()
#         # Plotitup
#         plt.figure()
#         df.plot()
#         df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
#         plt.title('XGBoost Feature Importance')
#         plt.xlabel('relative importance')
#         plt.gcf().savefig('Feature_Importance_xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.png' % (str(depth),str(eta),str(ntrees),str(mcw),str(tsize)))

In [213]:
print("=> 载入数据中...")
train,test,features,features_non_numeric = load_data()
print("=> 处理数据与特征工程...")
# train,test,features,features_non_numeric = process_data(train,test,features,features_non_numeric)
print("=> 使用XGBoost建模...")
XGB_native(train,test,features,features_non_numeric)

=> 载入数据中...
=> 处理数据与特征工程...
=> 使用XGBoost建模...
Running with params: {'objective': 'reg:linear', 'booster': 'gbtree', 'eta': 0.01, 'max_depth': 13, 'min_child_weight': 3, 'subsample': 0.9, 'colsample_bytree': 0.7, 'silent': 1, 'eval_metric': 'mae'}
Running with ntrees: 10
Running with features: ['SaleID', 'name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'regionCode', 'seller', 'offerType', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
[0]	vali-data-mae:7.57349	train-data-mae:7.53889	vali-data-rmspe:0.99955	train-data-rmspe:0.99953
Multiple eval metrics have been passed: 'train-data-rmspe' will be used for early stopping.

Will train until train-data-rmspe hasn't improved in 20 rounds.
[1]	vali-data-mae:7.49788	train-data-mae:7.46353	vali-data-rmspe:0.999482	train-data-rmspe:0.999459
[2]	vali-data-mae:7.42301	train-data-mae:7.38893	vali-data-rmspe:0.999411	train-data-rmspe:0.999385
[3]	vali-d