In [41]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
import datetime
dt = datetime.datetime.now()

In [42]:
def rmspe(y, yhat):
    # y = true data
    # yhat = predication
    return np.sqrt(np.mean(((y - yhat) / y)**2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

In [43]:
monthdic = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

def promo2_indicator(row):
    if row['PromoInterval'] is np.NaN:
        return 0
    try:
        if monthdic[row['Month']-1] == row['PromoInterval']:
            return 1
    except:
        if monthdic[row['Month']-1] in row['PromoInterval']:
            return 1
    return 0

In [44]:
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2SinceWeek',
                     'Promo2SinceYear', 'SchoolHoliday', 'StoreType', 'Assortment',
                    'StateHoliday'])
    
    mapping = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mapping, inplace=True)
    data.Assortment.replace(mapping, inplace=True)
    data.StateHoliday.replace(mapping, inplace=True)
    
    features.extend(['Month', 'Day', 'Year', 'Quarter'])
    data['Month'] = data.Date.apply(lambda x: x.month)
    data['Day'] = data.Date.apply(lambda x: x.day)
    data['Year'] = data.Date.apply(lambda x: x.year)
    data['Quarter'] = data.Date.apply(lambda x: x.quarter)
    
    features.extend(['Promo2Indicator', 'CompetitionTime'])
    data['Promo2Indicator'] = data.apply(promo2_indicator, axis = 1)
    data['CompetitionTime'] = data.Month - data.CompetitionOpenSinceMonth + (data.Year - data.CompetitionOpenSinceYear) * 12

In [45]:
# main script

print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(int),
         'PromoInterval': np.dtype(str),
         'CompetitionDistance':np.dtype(float),
         'StoreType':np.dtype(str),
         'Assortment':np.dtype(str)}
store = pd.read_csv("../data/store.csv")
test = pd.read_csv("../data/test.csv", parse_dates=[3], dtype = types)
train = pd.read_csv("../data/train.csv", parse_dates=[2], dtype = types)
test = pd.merge(test, store, on = "Store", how = "left")
train = pd.merge(train, store, on = "Store", how = "left")

features = []
print("build train features")
build_features(features, train)
print("build test features")
build_features([], test)
print(features)



Load the training, test and store data using pandas
build train features
build test features
['Store', 'CompetitionDistance', 'Promo', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'Month', 'Day', 'Year', 'Quarter', 'Promo2Indicator', 'CompetitionTime']


In [46]:
test.isnull().sum()

Id                           0
Store                        0
DayOfWeek                    0
Date                         0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Month                        0
Day                          0
Year                         0
Quarter                      0
Promo2Indicator              0
CompetitionTime              0
dtype: int64

In [47]:
test.iloc[2,:]

Id                                          1713
Store                                          1
DayOfWeek                                      2
Date                         2015-09-15 00:00:00
Open                                           1
Promo                                          1
StateHoliday                                   0
SchoolHoliday                                  0
StoreType                                      3
Assortment                                     1
CompetitionDistance                         1270
CompetitionOpenSinceMonth                      9
CompetitionOpenSinceYear                    2008
Promo2                                         0
Promo2SinceWeek                                0
Promo2SinceYear                                0
PromoInterval                                  0
Month                                          9
Day                                           15
Year                                        2015
Quarter             

In [48]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Month,Day,Unnamed: 21
0,1,5,2015-07-31,5263,555,1,1,0,1,3,1,1270,9,2008,0,0,0,0,7,31,...
1,1,4,2015-07-30,5020,546,1,1,0,1,3,1,1270,9,2008,0,0,0,0,7,30,...
2,1,3,2015-07-29,4782,523,1,1,0,1,3,1,1270,9,2008,0,0,0,0,7,29,...
3,1,2,2015-07-28,5011,560,1,1,0,1,3,1,1270,9,2008,0,0,0,0,7,28,...
4,1,1,2015-07-27,6102,612,1,1,0,1,3,1,1270,9,2008,0,0,0,0,7,27,...


In [49]:
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.05,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 7000

# start training
print("Train a XGBoost model")
#X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
#y_train = np.log1p(X_train.Sales)
#y_valid = np.log1p(X_valid.Sales)

X_train = train
y_train = np.log1p(X_train.Sales)

dtrain = xgb.DMatrix(X_train[features], y_train)
#dvalid = xgb.DMatrix(X_valid[features], y_valid)




Train a XGBoost model


In [50]:
y_train.isnull().sum()

0

In [None]:
[a.c1 == 8].index.tolist()

In [40]:
dtrain.get_label()

array([ 8.56864643,  8.52138424,  8.47282314, ...,  8.36590481,
        8.21554756,  0.        ], dtype=float32)

In [None]:
#watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
watchlist = [(dtrain, 'train')]

gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

# predict
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)

# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("../sub/submission%d%d%d.csv"%(dt.day, dt.hour, dt.minute),index=False)