In [55]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 50)

In [28]:
train = pd.read_csv('E:\\DL_datasets\\RossmanData\\train.csv', parse_dates=True)
test = pd.read_csv('E:\\DL_datasets\\RossmanData\\test.csv', parse_dates=True)
store = pd.read_csv('E:\\DL_datasets\\RossmanData\\store.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [30]:
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


### 缺失值处理

In [31]:
# train训练数据无缺失值
# test测试集Open列存在11个缺失值
store.isnull().sum()

Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

In [32]:
train = train[train.Open != 0]
train = train[train.Sales > 0]
print('shape of train:', train.shape)

shape of train: (844338, 9)


In [33]:
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

In [38]:
train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')
test['Date'] = pd.to_datetime(test['Date'], format='%Y-%m-%d')
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


In [44]:
train.StoreType.unique()

array([3, 1, 4, 2], dtype=int64)

In [51]:
def build_features(features, data):
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])
    
    # Label Encoder some featuers
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    # data.StoreType.replace(mappings, inplace=True)
    # data.Assortment.replace(mappings, inplace=True)
    # data.SchoolHoliday.replace(mappings, inplace=True)
    
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['Year'] = data.Date.dt.year
    data['WeekOfYear'] = data.Date.dt.weekofyear
    
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
            (data.Month - data.CompetitionOpenSinceMonth)
    
    features.append('PromoOpen')
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
            (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0
    
    features.append('IsPromoMonth')
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

    return data

In [52]:
features = []
build_features(features, train)
build_features([], test)
print(features)

['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']


In [70]:
def StateHoliday_le(x):
    if x == 'a':
        return  1
    elif x == 'b':
        return 2
    elif x == 'c':
        return 3
    return 0

train['StateHoliday'] = train.StateHoliday.apply(lambda x: StateHoliday_le(x))
test['StateHoliday'] = test.StateHoliday.apply(lambda x: StateHoliday_le(x))

In [71]:
train.StateHoliday.unique()

array([0, 1, 2, 3], dtype=int64)

In [57]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,1,4,2015-09-17,1.0,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,,9,17,2015,38,84.0,0.0,Sept,0
1,857,1,3,2015-09-16,1.0,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,,9,16,2015,38,84.0,0.0,Sept,0
2,1713,1,2,2015-09-15,1.0,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,,9,15,2015,38,84.0,0.0,Sept,0
3,2569,1,1,2015-09-14,1.0,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,,9,14,2015,38,84.0,0.0,Sept,0
4,3425,1,7,2015-09-13,0.0,0,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,,9,13,2015,37,84.0,0.0,Sept,0


In [72]:
X_train, X_valid, y_train, y_valid = train_test_split(train[features], train.Sales, test_size=0.2, random_state=925)

In [78]:
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)

gbm = xgb.XGBRegressor(booster='gbtree', eta=0.3, max_depth=10, subsample=0.9,
                       colsample_bytree=0.7, silent=1, seed=925)
gbm.fit(train[features], train.Sales)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=925, silent=1,
             subsample=0.9, verbosity=1)

In [79]:
result = gbm.predict(test[features])

In [82]:
result_df = pd.DataFrame({'Id':test.Id, 'Sales':result})
result_df.to_csv('submission01.csv', index=False)

In [85]:
test

0       2015-09-17
1       2015-09-16
2       2015-09-15
3       2015-09-14
4       2015-09-13
5       2015-09-12
6       2015-09-11
7       2015-09-10
8       2015-09-09
9       2015-09-08
10      2015-09-07
11      2015-09-06
12      2015-09-05
13      2015-09-04
14      2015-09-03
15      2015-09-02
16      2015-09-01
17      2015-08-31
18      2015-08-30
19      2015-08-29
20      2015-08-28
21      2015-08-27
22      2015-08-26
23      2015-08-25
24      2015-08-24
25      2015-08-23
26      2015-08-22
27      2015-08-21
28      2015-08-20
29      2015-08-19
           ...    
41058   2015-08-30
41059   2015-08-29
41060   2015-08-28
41061   2015-08-27
41062   2015-08-26
41063   2015-08-25
41064   2015-08-24
41065   2015-08-23
41066   2015-08-22
41067   2015-08-21
41068   2015-08-20
41069   2015-08-19
41070   2015-08-18
41071   2015-08-17
41072   2015-08-16
41073   2015-08-15
41074   2015-08-14
41075   2015-08-13
41076   2015-08-12
41077   2015-08-11
41078   2015-08-10
41079   2015