In [1]:
## authored by wuyi

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
import datetime
dt = datetime.datetime.now()

In [2]:
def rmspe(y, yhat):
    # y = true data
    # yhat = predication
    return np.sqrt(np.mean(((y - yhat) / y)**2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

In [3]:
monthdic = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

def promo2_indicator(row):
    if row['PromoInterval'] is np.NaN:
        return 0
    try:
        if monthdic[row['Month']-1] == row['PromoInterval']:
            return 1
    except:
        if monthdic[row['Month']-1] in row['PromoInterval']:
            return 1
    return 0

In [4]:
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2SinceWeek',
                     'Promo2SinceYear', 'SchoolHoliday', 'StoreType', 'Assortment',
                    'StateHoliday'])
    
    mapping = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mapping, inplace=True)
    data.Assortment.replace(mapping, inplace=True)
    data.StateHoliday.replace(mapping, inplace=True)
    
    features.extend(['Month', 'Day', 'Year', 'Quarter'])
    data['Month'] = data.Date.apply(lambda x: x.month)
    data['Day'] = data.Date.apply(lambda x: x.day)
    data['Year'] = data.Date.apply(lambda x: x.year)
    data['Quarter'] = data.Date.apply(lambda x: x.quarter)
    
    features.extend(['Promo2Indicator', 'CompetitionTime'])
    data['Promo2Indicator'] = data.apply(promo2_indicator, axis = 1)
    data['CompetitionTime'] = data.Month - data.CompetitionOpenSinceMonth + (data.Year - data.CompetitionOpenSinceYear) * 12

In [5]:
# main script

print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(int),
         'PromoInterval': np.dtype(str),
         'CompetitionDistance':np.dtype(float),
         'StoreType':np.dtype(str),
         'Assortment':np.dtype(str)}
store = pd.read_csv("../data/store.csv")
test = pd.read_csv("../data/test.csv", parse_dates=[3], dtype = types)
train = pd.read_csv("../data/train.csv", parse_dates=[2], dtype = types)
test = pd.merge(test, store, on = "Store", how = "left")
train = pd.merge(train, store, on = "Store", how = "left")
train = train[train.Sales > 0]

features = []
print("build train features")
build_features(features, train)
print("build test features")
build_features([], test)
print(features)



Load the training, test and store data using pandas
build train features
build test features
['Store', 'CompetitionDistance', 'Promo', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'Month', 'Day', 'Year', 'Quarter', 'Promo2Indicator', 'CompetitionTime']


In [8]:
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.05,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 1000

# start training
print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

#X_train = train
#y_train = np.log1p(X_train.Sales)
#dtrain = xgb.DMatrix(X_train[features], y_train)


Train a XGBoost model


In [9]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
#watchlist = [(dtrain, 'train')]

gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

# predict
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)

# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("../sub/submission%d%d%d.csv"%(dt.day, dt.hour, dt.minute),index=False)

Will train until eval error hasn't decreased in 100 rounds.
[0]	train-rmspe:0.999994	eval-rmspe:0.999746
[1]	train-rmspe:0.999981	eval-rmspe:0.999536
[2]	train-rmspe:0.999952	eval-rmspe:0.999259
[3]	train-rmspe:0.999890	eval-rmspe:0.998872
[4]	train-rmspe:0.999769	eval-rmspe:0.998357
[5]	train-rmspe:0.999541	eval-rmspe:0.997682
[6]	train-rmspe:0.999137	eval-rmspe:0.996804
[7]	train-rmspe:0.998434	eval-rmspe:0.995691
[8]	train-rmspe:0.997328	eval-rmspe:0.994296
[9]	train-rmspe:0.995503	eval-rmspe:0.992562
[10]	train-rmspe:0.992966	eval-rmspe:0.990440
[11]	train-rmspe:0.989734	eval-rmspe:0.987883
[12]	train-rmspe:0.985563	eval-rmspe:0.984836
[13]	train-rmspe:0.980688	eval-rmspe:0.981242
[14]	train-rmspe:0.975791	eval-rmspe:0.977053
[15]	train-rmspe:0.971102	eval-rmspe:0.972225
[16]	train-rmspe:0.966241	eval-rmspe:0.966714
[17]	train-rmspe:0.960473	eval-rmspe:0.960459
[18]	train-rmspe:0.953559	eval-rmspe:0.953456
[19]	train-rmspe:0.945611	eval-rmspe:0.945658
[20]	train-rmspe:0.936885	eval

Make predictions on the test set


[999]	train-rmspe:0.143216	eval-rmspe:0.144956
