In [1]:
# 导入库
import numpy as np
import pandas as pd
from time import time
from IPython.display import display
import pandas_profiling

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

In [2]:
# 导入数据
data_train = pd.read_csv("train.csv")
data_store = pd.read_csv("store.csv")
data_test = pd.read_csv("test.csv")

# 显示第一条记录
display(data_train.head())

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [3]:
#交易数据可视化
#data_train.profile_report(style={'full_width':True})

In [4]:
#商店数据可视化
#data_store.profile_report(style={'full_width':True})

In [5]:
#Null值处理
data_store.fillna(0,inplace=True)
data_test.fillna(1, inplace=True)

In [6]:
#确定Null值处理完毕
display(data_train.isnull().sum(),data_train.isnull().sum(),data_store.isnull().sum())

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

Store                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
dtype: int64

In [7]:
#查看字段类型，确定字段类型能够导入模型
display(data_train.dtypes,data_test.dtypes)

Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday     object
SchoolHoliday     int64
dtype: object

Id                 int64
Store              int64
DayOfWeek          int64
Date              object
Open             float64
Promo              int64
StateHoliday      object
SchoolHoliday      int64
dtype: object

In [8]:
#日期格式转换
data_train['Date'] = pd.to_datetime(data_train['Date'])
data_test['Date'] = pd.to_datetime(data_test['Date'])

In [9]:
#交易数据集与商店数据集合并
train = pd.merge(data_train, data_store, on='Store')
test = pd.merge(data_test, data_store, on='Store')
train = train.sort_values(['Date'],ascending = False)

In [10]:
#由于本项目是预测未来6周销售额，因此在拆分验证集时，也按未来6周进行拆分
#交易数据集中共有1115家商店，按照日期排序后，1115家未来6周的销售数据应为最后1115*7*6行
#训练集与测试集切片
#参考https://www.kaggle.com/c/rossmann-store-sales/discussion/18024

tai_test = train[:6*7*1115]
tai_train = train[6*7*1115:]

In [11]:
#未营业的商店不产生销量，属于噪音数据，应剔除
tai_test = tai_test[(tai_test["Open"] != 0)&(tai_test["Sales"] > 0)]
tai_train = tai_train[(tai_train["Open"] != 0)&(tai_train["Sales"] > 0)]

In [12]:
#创建特征处理函数

def features_handle(data):
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.weekofyear
    
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)
    
    #替换完成后进行类型转化
    data['StateHoliday'] = data['StateHoliday'].astype('int')
    data['Assortment'] = data['Assortment'].astype('int')
    data['StoreType'] = data['StoreType'].astype('int')
    
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + (data.Month - data.CompetitionOpenSinceMonth)
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['CompetitionOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)        
    data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
  
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['month_str'] = data.Month.map(month2str)
    def check(row):
        if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
            return 1
        else:
            return 0
        
    data['IsPromoMonth'] =  data.apply(lambda row: check(row),axis=1)

    return data

In [13]:
#特征批量处理
features_handle(tai_train)
features_handle(tai_test)
features_handle(test)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,...,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,month_str,IsPromoMonth
0,1,1,3,2015-09-17,1.0,1,0,0,3,1,...,0.0,0,2015,9,17,38,84.0,24189.50,Sept,0
1,857,1,2,2015-09-16,1.0,1,0,0,3,1,...,0.0,0,2015,9,16,38,84.0,24189.50,Sept,0
2,1713,1,1,2015-09-15,1.0,1,0,0,3,1,...,0.0,0,2015,9,15,38,84.0,24189.50,Sept,0
3,2569,1,0,2015-09-14,1.0,1,0,0,3,1,...,0.0,0,2015,9,14,38,84.0,24189.50,Sept,0
4,3425,1,6,2015-09-13,0.0,0,0,0,3,1,...,0.0,0,2015,9,13,37,84.0,24189.25,Sept,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41083,37664,1115,2,2015-08-05,1.0,1,0,1,4,3,...,2012.0,"Mar,Jun,Sept,Dec",2015,8,5,32,24188.0,38.50,Aug,0
41084,38520,1115,1,2015-08-04,1.0,1,0,1,4,3,...,2012.0,"Mar,Jun,Sept,Dec",2015,8,4,32,24188.0,38.50,Aug,0
41085,39376,1115,0,2015-08-03,1.0,1,0,1,4,3,...,2012.0,"Mar,Jun,Sept,Dec",2015,8,3,32,24188.0,38.50,Aug,0
41086,40232,1115,6,2015-08-02,0.0,0,0,1,4,3,...,2012.0,"Mar,Jun,Sept,Dec",2015,8,2,31,24188.0,38.25,Aug,0


In [14]:
#丢弃不需要特征
drop_list = ['Date','Customers','Open','PromoInterval','month_str']
tai_train.drop(drop_list,axis=1,inplace =True)
tai_test.drop(drop_list,axis=1,inplace =True)

In [15]:
#生成XGboost训练集与验证集,将销量转化为正态分布
tai_xtrain = tai_train.drop(['Sales'],axis=1 )
tai_ytrain = np.log1p(tai_train.Sales)
tai_xtest = tai_test.drop(['Sales'],axis=1 )
tai_ytest = np.log1p(tai_test.Sales)

In [16]:
#测试集丢弃无用信息
xtest =test.drop(['Date','Id','Open','PromoInterval','month_str'],axis = 1)

In [17]:
#定义评估函数
def rmspe(y_true, y_pred):
    loss = np.sqrt(np.mean(np.square(((y_true - y_pred) / y_true)), axis=0))
    return loss

#XGBoost评估函数输入参数反向
def rmspe_xg(y_pred, y_true):
    return "rmspe", rmspe(np.expm1(y_true.get_label()),np.expm1(y_pred))

In [18]:
import xgboost as xgb
#XGboost参数设置
params = {"objective": "reg:squarederror",
          'learning_rate':0.1,
          "booster" : "gbtree",
          "max_depth": 10,
          "min_child_weight":1,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "gamma":0.8,
          "silent": 1,
          "seed": 10,
          "tree_method":"gpu_hist",
          "gpu_id":0
          }
num_boost_round = 10000

In [19]:
#XGboost训练集与验证集
dtrain = xgb.DMatrix(tai_xtrain, tai_ytrain)
dvalid = xgb.DMatrix(tai_xtest, tai_ytest)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [20]:
#得到序列
from itertools import combinations

def grid_dict(param_test):
    combine_list = []
    key_nums = 0
    for i in param_test:
        key_nums += 1
        for j in param_test[i]:
            combine_list.append({i:j})
  
    result = []
    for i in combinations(combine_list, key_nums):
        tmp_dict = {}
        for j in list(i):
            for key in j:
                tmp_dict[key] = j[key]
        if len(tmp_dict) == key_nums:
            result.append(tmp_dict)
    
    return result

def train_model(params,flag = False):
    print("Start Train...")
    start = time()
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=flag)
    end = time()
    print('time is {:2f} s.'.format(end-start))
    
    print("testing...")
    tai_xtest.sort_index(inplace=True) 
    tai_ytest.sort_index(inplace=True) 
    yhat = gbm.predict(xgb.DMatrix(tai_xtest))
    error = rmspe(np.expm1(tai_ytest), np.expm1(yhat))

    print('RMSPE: {:.6f}'.format(error))
    return error,gbm

def grid_search_xg(params, params_grid):
    iter_num = 1
    grid_list = grid_dict(params_grid)
    gbm = None
    print('Iter nums is {:1f}.'.format(len(grid_list)))
    min_error = 100
    result_params = {}
    for i in grid_list:
        print('Iter No is {:1f}.'.format(iter_num))
        for key in i:
            params[key] = i[key]
        print(params)
        model_result = train_model(params)
        error = model_result[0]
        if error<min_error:
            result_params = i
            min_error = error
            gbm = model_result[1]
        iter_num += 1
    
    print('best_param:')
    print(result_params)
    
    for key in result_params:
        params[key] = result_params[key]
    return gbm     

In [21]:
#基础模型
train_model(params, True)

Start Train...
[0]	train-rmse:7.44334	eval-rmse:7.44788	train-rmspe:0.999525	eval-rmspe:0.999529
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:6.70066	eval-rmse:6.70606	train-rmspe:0.99882	eval-rmspe:0.998832
[2]	train-rmse:6.03251	eval-rmse:6.03779	train-rmspe:0.997544	eval-rmspe:0.997569
[3]	train-rmse:5.4311	eval-rmse:5.43759	train-rmspe:0.995402	eval-rmspe:0.995452
[4]	train-rmse:4.88999	eval-rmse:4.89789	train-rmspe:0.992011	eval-rmspe:0.99211
[5]	train-rmse:4.4032	eval-rmse:4.41172	train-rmspe:0.986936	eval-rmspe:0.987105
[6]	train-rmse:3.96519	eval-rmse:3.97338	train-rmspe:0.979725	eval-rmspe:0.979978
[7]	train-rmse:3.57122	eval-rmse:3.57979	train-rmspe:0.969934	eval-rmspe:0.970314
[8]	train-rmse:3.21687	eval-rmse:3.22455	train-rmspe:0.957165	eval-rmspe:0.957653
[9]	train-rmse:2.8982	eval-rmse:2.90641	train-rmspe:0.941148	eval-rmspe:0.941834
[10]	train-rmse:2.61151	

[96]	train-rmse:0.136019	eval-rmse:0.157701	train-rmspe:0.164903	eval-rmspe:0.17275
[97]	train-rmse:0.135451	eval-rmse:0.157137	train-rmspe:0.164374	eval-rmspe:0.172029
[98]	train-rmse:0.135092	eval-rmse:0.156858	train-rmspe:0.164079	eval-rmspe:0.171693
[99]	train-rmse:0.134272	eval-rmse:0.15624	train-rmspe:0.16326	eval-rmspe:0.170951
[100]	train-rmse:0.133787	eval-rmse:0.155877	train-rmspe:0.162553	eval-rmspe:0.170417
[101]	train-rmse:0.133336	eval-rmse:0.155542	train-rmspe:0.161185	eval-rmspe:0.17006
[102]	train-rmse:0.131522	eval-rmse:0.154082	train-rmspe:0.158837	eval-rmspe:0.168379
[103]	train-rmse:0.130872	eval-rmse:0.153574	train-rmspe:0.15822	eval-rmspe:0.167819
[104]	train-rmse:0.130631	eval-rmse:0.153481	train-rmspe:0.157816	eval-rmspe:0.167656
[105]	train-rmse:0.130148	eval-rmse:0.153097	train-rmspe:0.157288	eval-rmspe:0.167202
[106]	train-rmse:0.128565	eval-rmse:0.151844	train-rmspe:0.155724	eval-rmspe:0.165769
[107]	train-rmse:0.128353	eval-rmse:0.151379	train-rmspe:0.1554

[192]	train-rmse:0.101043	eval-rmse:0.13193	train-rmspe:0.119709	eval-rmspe:0.14249
[193]	train-rmse:0.100849	eval-rmse:0.131782	train-rmspe:0.119522	eval-rmspe:0.142305
[194]	train-rmse:0.100777	eval-rmse:0.131654	train-rmspe:0.119404	eval-rmspe:0.142135
[195]	train-rmse:0.100525	eval-rmse:0.131507	train-rmspe:0.118336	eval-rmspe:0.14197
[196]	train-rmse:0.100365	eval-rmse:0.131386	train-rmspe:0.118179	eval-rmspe:0.141833
[197]	train-rmse:0.100184	eval-rmse:0.131413	train-rmspe:0.117944	eval-rmspe:0.141896
[198]	train-rmse:0.100076	eval-rmse:0.131352	train-rmspe:0.117788	eval-rmspe:0.141819
[199]	train-rmse:0.099709	eval-rmse:0.131074	train-rmspe:0.117425	eval-rmspe:0.141515
[200]	train-rmse:0.099531	eval-rmse:0.130928	train-rmspe:0.117195	eval-rmspe:0.141349
[201]	train-rmse:0.099369	eval-rmse:0.130839	train-rmspe:0.116992	eval-rmspe:0.141257
[202]	train-rmse:0.099247	eval-rmse:0.130804	train-rmspe:0.116869	eval-rmspe:0.141221
[203]	train-rmse:0.099099	eval-rmse:0.130687	train-rmspe:

[288]	train-rmse:0.089654	eval-rmse:0.126685	train-rmspe:0.102911	eval-rmspe:0.136595
[289]	train-rmse:0.089586	eval-rmse:0.126672	train-rmspe:0.102751	eval-rmspe:0.136581
[290]	train-rmse:0.089478	eval-rmse:0.12665	train-rmspe:0.102608	eval-rmspe:0.136541
[291]	train-rmse:0.089426	eval-rmse:0.126704	train-rmspe:0.102558	eval-rmspe:0.136603
[292]	train-rmse:0.089361	eval-rmse:0.126685	train-rmspe:0.102464	eval-rmspe:0.136569
[293]	train-rmse:0.089208	eval-rmse:0.12659	train-rmspe:0.102311	eval-rmspe:0.136466
[294]	train-rmse:0.089107	eval-rmse:0.126529	train-rmspe:0.102207	eval-rmspe:0.136393
[295]	train-rmse:0.089033	eval-rmse:0.126539	train-rmspe:0.102131	eval-rmspe:0.136416
[296]	train-rmse:0.088907	eval-rmse:0.126481	train-rmspe:0.101998	eval-rmspe:0.136351
[297]	train-rmse:0.088864	eval-rmse:0.126455	train-rmspe:0.101954	eval-rmspe:0.136328
[298]	train-rmse:0.088762	eval-rmse:0.126407	train-rmspe:0.101828	eval-rmspe:0.136283
[299]	train-rmse:0.088692	eval-rmse:0.126432	train-rmspe

[384]	train-rmse:0.083262	eval-rmse:0.124951	train-rmspe:0.0938	eval-rmspe:0.134681
[385]	train-rmse:0.083202	eval-rmse:0.124932	train-rmspe:0.093741	eval-rmspe:0.134673
[386]	train-rmse:0.083138	eval-rmse:0.124941	train-rmspe:0.093634	eval-rmspe:0.134699
[387]	train-rmse:0.0831	eval-rmse:0.124936	train-rmspe:0.093596	eval-rmspe:0.134698
[388]	train-rmse:0.082995	eval-rmse:0.124906	train-rmspe:0.093487	eval-rmspe:0.134663
[389]	train-rmse:0.082947	eval-rmse:0.12491	train-rmspe:0.093429	eval-rmspe:0.134669
[390]	train-rmse:0.082915	eval-rmse:0.124906	train-rmspe:0.093401	eval-rmspe:0.13467
[391]	train-rmse:0.082886	eval-rmse:0.124912	train-rmspe:0.093373	eval-rmspe:0.134679
[392]	train-rmse:0.082835	eval-rmse:0.124905	train-rmspe:0.093324	eval-rmspe:0.134674
[393]	train-rmse:0.082794	eval-rmse:0.124925	train-rmspe:0.093286	eval-rmspe:0.134708
[394]	train-rmse:0.082749	eval-rmse:0.124916	train-rmspe:0.09323	eval-rmspe:0.134698
[395]	train-rmse:0.082704	eval-rmse:0.124899	train-rmspe:0.09

[480]	train-rmse:0.078731	eval-rmse:0.124015	train-rmspe:0.08745	eval-rmspe:0.133834
[481]	train-rmse:0.078696	eval-rmse:0.124007	train-rmspe:0.087388	eval-rmspe:0.133824
[482]	train-rmse:0.078655	eval-rmse:0.124	train-rmspe:0.087318	eval-rmspe:0.133818
[483]	train-rmse:0.078611	eval-rmse:0.124027	train-rmspe:0.087271	eval-rmspe:0.133831
[484]	train-rmse:0.078575	eval-rmse:0.124018	train-rmspe:0.087232	eval-rmspe:0.133817
[485]	train-rmse:0.078536	eval-rmse:0.124011	train-rmspe:0.087193	eval-rmspe:0.133816
[486]	train-rmse:0.078498	eval-rmse:0.124003	train-rmspe:0.087143	eval-rmspe:0.133811
[487]	train-rmse:0.078468	eval-rmse:0.124005	train-rmspe:0.087108	eval-rmspe:0.133811
[488]	train-rmse:0.078433	eval-rmse:0.12401	train-rmspe:0.086933	eval-rmspe:0.133823
[489]	train-rmse:0.078382	eval-rmse:0.123977	train-rmspe:0.086884	eval-rmspe:0.133774
[490]	train-rmse:0.078324	eval-rmse:0.123948	train-rmspe:0.086826	eval-rmspe:0.133743
[491]	train-rmse:0.078272	eval-rmse:0.123943	train-rmspe:0.

[576]	train-rmse:0.075279	eval-rmse:0.123534	train-rmspe:0.082203	eval-rmspe:0.133273
[577]	train-rmse:0.075241	eval-rmse:0.12353	train-rmspe:0.082156	eval-rmspe:0.133267
[578]	train-rmse:0.07522	eval-rmse:0.12353	train-rmspe:0.082133	eval-rmspe:0.133271
[579]	train-rmse:0.075167	eval-rmse:0.123512	train-rmspe:0.082071	eval-rmspe:0.133255
[580]	train-rmse:0.075141	eval-rmse:0.123497	train-rmspe:0.082042	eval-rmspe:0.133236
[581]	train-rmse:0.075127	eval-rmse:0.123497	train-rmspe:0.082023	eval-rmspe:0.133234
[582]	train-rmse:0.075086	eval-rmse:0.123479	train-rmspe:0.081976	eval-rmspe:0.133214
[583]	train-rmse:0.075056	eval-rmse:0.123475	train-rmspe:0.081941	eval-rmspe:0.133208
[584]	train-rmse:0.07503	eval-rmse:0.123472	train-rmspe:0.081917	eval-rmspe:0.133203
[585]	train-rmse:0.075	eval-rmse:0.123473	train-rmspe:0.081886	eval-rmspe:0.133206
[586]	train-rmse:0.074956	eval-rmse:0.123485	train-rmspe:0.08181	eval-rmspe:0.133224
[587]	train-rmse:0.074911	eval-rmse:0.123482	train-rmspe:0.081

[672]	train-rmse:0.072436	eval-rmse:0.123051	train-rmspe:0.078445	eval-rmspe:0.132728
[673]	train-rmse:0.072399	eval-rmse:0.123066	train-rmspe:0.078405	eval-rmspe:0.132749
[674]	train-rmse:0.072375	eval-rmse:0.123062	train-rmspe:0.07836	eval-rmspe:0.132744
[675]	train-rmse:0.07235	eval-rmse:0.123054	train-rmspe:0.078328	eval-rmspe:0.132735
[676]	train-rmse:0.072328	eval-rmse:0.123046	train-rmspe:0.078297	eval-rmspe:0.132727
[677]	train-rmse:0.072294	eval-rmse:0.123044	train-rmspe:0.078251	eval-rmspe:0.132725
[678]	train-rmse:0.072262	eval-rmse:0.123038	train-rmspe:0.078216	eval-rmspe:0.132719
[679]	train-rmse:0.072234	eval-rmse:0.12304	train-rmspe:0.078189	eval-rmspe:0.132725
[680]	train-rmse:0.072203	eval-rmse:0.123027	train-rmspe:0.07815	eval-rmspe:0.132714
[681]	train-rmse:0.072172	eval-rmse:0.123015	train-rmspe:0.078116	eval-rmspe:0.132703
[682]	train-rmse:0.072153	eval-rmse:0.122998	train-rmspe:0.078091	eval-rmspe:0.132691
[683]	train-rmse:0.072128	eval-rmse:0.122996	train-rmspe:0

[768]	train-rmse:0.069986	eval-rmse:0.122737	train-rmspe:0.07509	eval-rmspe:0.132444
[769]	train-rmse:0.069952	eval-rmse:0.12273	train-rmspe:0.075054	eval-rmspe:0.13244
[770]	train-rmse:0.069927	eval-rmse:0.12273	train-rmspe:0.075022	eval-rmspe:0.132438
[771]	train-rmse:0.069896	eval-rmse:0.122726	train-rmspe:0.074979	eval-rmspe:0.132435
[772]	train-rmse:0.069867	eval-rmse:0.122725	train-rmspe:0.074947	eval-rmspe:0.132436
[773]	train-rmse:0.069839	eval-rmse:0.122728	train-rmspe:0.074912	eval-rmspe:0.132438
[774]	train-rmse:0.069815	eval-rmse:0.122725	train-rmspe:0.074885	eval-rmspe:0.132435
[775]	train-rmse:0.069796	eval-rmse:0.122714	train-rmspe:0.074853	eval-rmspe:0.132423
[776]	train-rmse:0.069787	eval-rmse:0.122715	train-rmspe:0.074844	eval-rmspe:0.132429
[777]	train-rmse:0.069754	eval-rmse:0.12271	train-rmspe:0.074793	eval-rmspe:0.132421
[778]	train-rmse:0.06972	eval-rmse:0.122717	train-rmspe:0.074733	eval-rmspe:0.13243
[779]	train-rmse:0.069692	eval-rmse:0.122706	train-rmspe:0.07

(0.13270790074451297, <xgboost.core.Booster at 0x2eab27ca388>)

In [22]:
#调试max_depth和min_child_weight
param_grid = {'max_depth':range(9,13,1), 'min_child_weight':range(1,5,1)}

In [23]:
grid_search_xg(params, param_grid)

Iter nums is 16.000000.
Iter No is 1.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.8, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 138.543119 s.
testing...
RMSPE: 0.134324
Iter No is 2.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.8, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 93.616510 s.
testing...
RMSPE: 0.136606
Iter No is 3.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.8, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 111.664026 s.
testing...
RMSPE: 0.135904
Iter No 

<xgboost.core.Booster at 0x2eab27d6f48>

In [24]:
#调试Gamma
param_grid = {'gamma':[i/10.0 for i in range(0,10,2)]}

In [25]:
grid_search_xg(params, param_grid)

Iter nums is 5.000000.
Iter No is 1.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.0, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 173.579716 s.
testing...
RMSPE: 0.133720
Iter No is 2.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 203.449251 s.
testing...
RMSPE: 0.132894
Iter No is 3.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.4, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 233.509041 s.
testing...
RMSPE: 0.138107
Iter 

<xgboost.core.Booster at 0x2eab27cb848>

In [26]:
#调试subsample与colsample_bytree
param_grid = {'subsample':[i/100.0 for i in range(75,96,5)],'colsample_bytree':[i/100.0 for i in range(75,96,5)]}

In [27]:
grid_search_xg(params, param_grid)

Iter nums is 25.000000.
Iter No is 1.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.75, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 198.438040 s.
testing...
RMSPE: 0.130951
Iter No is 2.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.75, 'colsample_bytree': 0.8, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 169.980858 s.
testing...
RMSPE: 0.135903
Iter No is 3.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.75, 'colsample_bytree': 0.85, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0}
Start Train...
time is 214.469355 s.
testing...
RMSPE: 0.141155

<xgboost.core.Booster at 0x2eab27d0408>

In [28]:
#调试reg_alpha
param_grid = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}

In [29]:
grid_search_xg(params, param_grid)

Iter nums is 5.000000.
Iter No is 1.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 1e-05}
Start Train...
time is 193.041475 s.
testing...
RMSPE: 0.129778
Iter No is 2.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 0.01}
Start Train...
time is 216.621211 s.
testing...
RMSPE: 0.130957
Iter No is 3.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 0.1}
Start 

<xgboost.core.Booster at 0x2eae4ba79c8>

In [30]:
#调试reg_lambda
param_grid = {'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]}

In [31]:
grid_search_xg(params, param_grid)

Iter nums is 5.000000.
Iter No is 1.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 1e-05, 'reg_lambda': 1e-05}
Start Train...
time is 165.912074 s.
testing...
RMSPE: 0.133304
Iter No is 2.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 1e-05, 'reg_lambda': 0.01}
Start Train...
time is 146.330686 s.
testing...
RMSPE: 0.132492
Iter No is 3.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.1, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hi

<xgboost.core.Booster at 0x2eab27c5a48>

In [32]:
#调试learning_rate
param_grid = {'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.2]}

In [33]:
#最终模型
gbm = grid_search_xg(params, param_grid)

Iter nums is 6.000000.
Iter No is 1.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.01, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 1e-05, 'reg_lambda': 100}
Start Train...
time is 2053.560050 s.
testing...
RMSPE: 0.125774
Iter No is 2.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.03, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 1e-05, 'reg_lambda': 100}
Start Train...
time is 1069.953328 s.
testing...
RMSPE: 0.122620
Iter No is 3.000000.
{'objective': 'reg:squarederror', 'learning_rate': 0.05, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_

In [34]:
#最终参数
print(params)

{'objective': 'reg:squarederror', 'learning_rate': 0.03, 'booster': 'gbtree', 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.95, 'colsample_bytree': 0.75, 'gamma': 0.2, 'silent': 1, 'seed': 10, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'reg_alpha': 1e-05, 'reg_lambda': 100}


In [None]:
#第一次预测
print("First predictions on the test set")
dtest = xgb.DMatrix(xtest)
test_probs = gbm.predict(dtest)

In [None]:
#导出第一次预测结果
result = pd.DataFrame({"Id": test['Id'], 'Sales': np.expm1(test_probs)})
result.to_csv("Rossmann_submission_1.csv", index=False)

In [None]:
#采用保留数据集进行检测
print("validating")
tai_xtest.sort_index(inplace=True) 
tai_ytest.sort_index(inplace=True) 
yhat = gbm.predict(xgb.DMatrix(tai_xtest))
error = rmspe(np.expm1(tai_ytest), np.expm1(yhat))

print('RMSPE: {:.6f}'.format(error))

In [None]:
# analysis by hold-out set
res = pd.DataFrame(data = ho_ytest)
res['Prediction']=yhat
res = pd.merge(ho_xtest,res, left_index= True, right_index=True)
res['Ratio'] = res.Prediction/res.Sales
res['Error'] =abs(res.Ratio-1)
res['Weight'] = res.Sales/res.Prediction
res.head()

In [None]:
col_1 = ['Sales','Prediction']
col_2 = ['Ratio']
L=np.random.randint( low=1,high = 1115, size = 3 ) 
print('Mean Ratio of predition and real sales data is {}: store all'.format(res.Ratio.mean()))
for i in L:
    s1 = pd.DataFrame(res[res['Store']==i],columns = col_1)
    s2 = pd.DataFrame(res[res['Store']==i],columns = col_2)
    s1.plot(title = 'Comparation of predition and real sales data: store {}'.format(i),figsize=(12,4))
    s2.plot(title = 'Ratio of predition and real sales data: store {}'.format(i),figsize=(12,4))
    print('Mean Ratio of predition and real sales data is {}: store {}'.format(s2.Ratio.mean(),i))

In [None]:
res.sort_values(['Error'],ascending=False,inplace= True)
res[:10]

In [None]:
# whole correction
print("weight correction")
W=[(0.990+(i/1000)) for i in range(20)]
S =[]
for w in W:
    error = rmspe(np.expm1(ho_ytest), np.expm1(yhat*w))
    print('RMSPE for {:.3f}:{:.6f}'.format(w,error))
    S.append(error)
Score = pd.Series(S,index=W)
Score.plot()
BS = Score[Score.values == Score.values.min()]
print ('Best weight for Score:{}'.format(BS))

In [None]:
# correction by store
L=range(1115)
W_ho=[]
W_test=[]
for i in L:
    s1 = pd.DataFrame(res[res['Store']==i+1],columns = col_1)
    s2 = pd.DataFrame(xtest[xtest['Store']==i+1])
    W1=[(0.990+(i/1000)) for i in range(20)]
    S =[]
    for w in W1:
        error = rmspe(np.expm1(s1.Sales), np.expm1(s1.Prediction*w))
        S.append(error)
    Score = pd.Series(S,index=W1)
    BS = Score[Score.values == Score.values.min()]
    a=np.array(BS.index.values)
    b_ho=a.repeat(len(s1))
    b_test=a.repeat(len(s2))
    W_ho.extend(b_ho.tolist())
    W_test.extend(b_test.tolist())

In [None]:
yhat_new = yhat*W_ho
error = rmspe(np.expm1(ho_ytest), np.expm1(yhat_new))
print ('RMSPE for weight corretion {:6f}'.format(error))

In [None]:
print("Make predictions on the test set")
dtest = xgb.DMatrix(xtest)
test_probs = gbm.predict(dtest)

# model1  kaggle private score 0.12647
result = pd.DataFrame({"Id": test['Id'], 'Sales': np.expm1(test_probs)})
result.to_csv("Rossmann_submission_1.csv", index=False)

# model2 kaggle private score 0.11756
result = pd.DataFrame({"Id": test['Id'], 'Sales': np.expm1(test_probs*0.995)})
result.to_csv("Rossmann_submission_2.csv", index=False)

# model3 kaggle private score 0.11292
result = pd.DataFrame({"Id": test['Id'], 'Sales': np.expm1(test_probs*W_test)})
result.to_csv("Rossmann_submission_3.csv", index=False)

In [None]:
# model2 kaggle private score 0.11756
result = pd.DataFrame({"Id": test['Id'], 'Sales': np.expm1(test_probs*0.995)})
result.to_csv("Rossmann_submission_2.csv", index=False)

In [None]:
# model3 kaggle private score 0.11292
result = pd.DataFrame({"Id": test['Id'], 'Sales': np.expm1(test_probs*W_test)})
result.to_csv("Rossmann_submission_3.csv", index=False)