In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt



In [2]:
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)



In [3]:
# Gather some features
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.weekofyear

    # CompetionOpen en PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Calculate time competition open time in months
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)
    # Promo open time in months
    features.append('PromoOpen')
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

    return data


In [4]:
print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
print(types)

Load the training, test and store data using pandas
{'Promo2SinceWeek': dtype('int32'), 'CompetitionOpenSinceMonth': dtype('int32'), 'CompetitionOpenSinceYear': dtype('int32'), 'StateHoliday': dtype('<U'), 'SchoolHoliday': dtype('float64'), 'PromoInterval': dtype('<U')}


In [5]:
train = pd.read_csv("train.csv", parse_dates=[2], dtype=types)
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1.0
1,2,5,2015-07-31,6064,625,1,1,0,1.0
2,3,5,2015-07-31,8314,821,1,1,0,1.0
3,4,5,2015-07-31,13995,1498,1,1,0,1.0
4,5,5,2015-07-31,4822,559,1,1,0,1.0


In [6]:
test = pd.read_csv("test.csv" , parse_dates=[3] ,dtype = types)

In [7]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0.0
1,2,3,4,2015-09-17,1.0,1,0,0.0
2,3,7,4,2015-09-17,1.0,1,0,0.0
3,4,8,4,2015-09-17,1.0,1,0,0.0
4,5,9,4,2015-09-17,1.0,1,0,0.0


In [8]:
store = pd.read_csv("store.csv")
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [9]:
print("Assume store open ,if not provided")
train.fillna(1 ,inplace = True)   #缺失值用1填充
train.head()

Assume store open ,if not provided


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1.0
1,2,5,2015-07-31,6064,625,1,1,0,1.0
2,3,5,2015-07-31,8314,821,1,1,0,1.0
3,4,5,2015-07-31,13995,1498,1,1,0,1.0
4,5,5,2015-07-31,4822,559,1,1,0,1.0


In [10]:
test.fillna(1, inplace=True)

In [11]:
print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train[train["Sales"] > 0]

Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe


In [12]:
print("Join with store")
train = pd.merge(train, store, on='Store')   #按store合并两张表
test = pd.merge(test, store, on='Store')
features = []
train.head()

Join with store


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1.0,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,2015-07-30,5020,546,1,1,0,1.0,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,2015-07-29,4782,523,1,1,0,1.0,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,2015-07-28,5011,560,1,1,0,1.0,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,2015-07-27,6102,612,1,1,0,1.0,c,a,1270.0,9.0,2008.0,0,,,


In [13]:
print("augment features")
build_features(features, train)
build_features([], test)
print(features)


augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']


In [14]:
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

Train a XGBoost model


In [15]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

[0]	train-rmse:5.79368	eval-rmse:5.79362	train-rmspe:0.99684	eval-rmspe:0.996838
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.06274	eval-rmse:4.06359	train-rmspe:0.981525	eval-rmspe:0.981525
[2]	train-rmse:2.85318	eval-rmse:2.85522	train-rmspe:0.938101	eval-rmspe:0.938162
[3]	train-rmse:2.00968	eval-rmse:2.01188	train-rmspe:0.856666	eval-rmspe:0.85665
[4]	train-rmse:1.42233	eval-rmse:1.42521	train-rmspe:0.744155	eval-rmspe:0.743794
[5]	train-rmse:1.01687	eval-rmse:1.02015	train-rmspe:0.619489	eval-rmspe:0.617979
[6]	train-rmse:0.741152	eval-rmse:0.744603	train-rmspe:0.504218	eval-rmspe:0.50011
[7]	train-rmse:0.552896	eval-rmse:0.556435	train-rmspe:0.411833	eval-rmspe:0.403695
[8]	train-rmse:0.430882	eval-rmse:0.434411	train-rmspe:0.349148	eval-rmspe:0.335541
[9]	train-rmse:0.351865	eval-rmse:0.355058	train-rmspe:0.311269	eval-rmspe:0.292434
[10]	train-rmse:0.307537	eval

[96]	train-rmse:0.103053	eval-rmse:0.108074	train-rmspe:0.119095	eval-rmspe:0.114015
[97]	train-rmse:0.102707	eval-rmse:0.107794	train-rmspe:0.118762	eval-rmspe:0.113714
[98]	train-rmse:0.102403	eval-rmse:0.107497	train-rmspe:0.118483	eval-rmspe:0.113429
[99]	train-rmse:0.102182	eval-rmse:0.107375	train-rmspe:0.118246	eval-rmspe:0.113316
[100]	train-rmse:0.101985	eval-rmse:0.107166	train-rmspe:0.118018	eval-rmspe:0.113035
[101]	train-rmse:0.101662	eval-rmse:0.107018	train-rmspe:0.117568	eval-rmspe:0.112885
[102]	train-rmse:0.101111	eval-rmse:0.106511	train-rmspe:0.117028	eval-rmspe:0.11236
[103]	train-rmse:0.100904	eval-rmse:0.106321	train-rmspe:0.116814	eval-rmspe:0.11217
[104]	train-rmse:0.100463	eval-rmse:0.105916	train-rmspe:0.116377	eval-rmspe:0.111704
[105]	train-rmse:0.100278	eval-rmse:0.105832	train-rmspe:0.116143	eval-rmspe:0.111634
[106]	train-rmse:0.09987	eval-rmse:0.10551	train-rmspe:0.115378	eval-rmspe:0.111375
[107]	train-rmse:0.099467	eval-rmse:0.105347	train-rmspe:0.113

[192]	train-rmse:0.084782	eval-rmse:0.095474	train-rmspe:0.094818	eval-rmspe:0.100191
[193]	train-rmse:0.084737	eval-rmse:0.095447	train-rmspe:0.094778	eval-rmspe:0.100165
[194]	train-rmse:0.084689	eval-rmse:0.095428	train-rmspe:0.094726	eval-rmspe:0.100146
[195]	train-rmse:0.084643	eval-rmse:0.09542	train-rmspe:0.094679	eval-rmspe:0.100116
[196]	train-rmse:0.084531	eval-rmse:0.095386	train-rmspe:0.094535	eval-rmspe:0.100075
[197]	train-rmse:0.084395	eval-rmse:0.095348	train-rmspe:0.094387	eval-rmspe:0.100039
[198]	train-rmse:0.08429	eval-rmse:0.095295	train-rmspe:0.094268	eval-rmspe:0.099955
[199]	train-rmse:0.08421	eval-rmse:0.095323	train-rmspe:0.094142	eval-rmspe:0.099986
[200]	train-rmse:0.084117	eval-rmse:0.09529	train-rmspe:0.094041	eval-rmspe:0.099964
[201]	train-rmse:0.083963	eval-rmse:0.09525	train-rmspe:0.093802	eval-rmspe:0.099922
[202]	train-rmse:0.083864	eval-rmse:0.095232	train-rmspe:0.093657	eval-rmspe:0.099876
[203]	train-rmse:0.083812	eval-rmse:0.095216	train-rmspe:0.

[288]	train-rmse:0.075449	eval-rmse:0.091434	train-rmspe:0.082957	eval-rmspe:0.095936
[289]	train-rmse:0.075336	eval-rmse:0.091398	train-rmspe:0.08192	eval-rmspe:0.095919
[290]	train-rmse:0.075294	eval-rmse:0.091373	train-rmspe:0.08188	eval-rmspe:0.095898
[291]	train-rmse:0.075249	eval-rmse:0.091344	train-rmspe:0.081835	eval-rmspe:0.09586
[292]	train-rmse:0.075129	eval-rmse:0.091277	train-rmspe:0.081704	eval-rmspe:0.095749
[293]	train-rmse:0.075073	eval-rmse:0.091248	train-rmspe:0.081631	eval-rmspe:0.095678
[294]	train-rmse:0.075009	eval-rmse:0.091212	train-rmspe:0.081561	eval-rmspe:0.095645
[295]	train-rmse:0.074955	eval-rmse:0.091202	train-rmspe:0.081499	eval-rmspe:0.095621
[296]	train-rmse:0.074908	eval-rmse:0.091199	train-rmspe:0.081415	eval-rmspe:0.095623
[297]	train-rmse:0.074851	eval-rmse:0.091173	train-rmspe:0.081351	eval-rmspe:0.095597
[298]	train-rmse:0.074835	eval-rmse:0.091175	train-rmspe:0.081337	eval-rmspe:0.095602
[299]	train-rmse:0.074778	eval-rmse:0.091165	train-rmspe:

In [16]:
print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values , np.expm1(yhat))
print('rmspe:{:.6f}'.format(error))


Validating
rmspe:0.095627


In [17]:
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)

Make predictions on the test set


In [18]:
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("xgboost_10_submission.csv", index=False)

In [20]:
create_feature_map(features)
importance = gbm.get_fscore(fmap = 'xgb.fmap')
importance = sorted(importance.items() , key = operator.itemgetter(1))

df = pd.DataFrame(importance , columns = ['feature' , 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum

featp = df.plot(kind = 'barh' ,x = 'feature' ,y = 'fscore' , legend = False ,figsize=(6,10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches = 'tight' , pad_inches = 1)


NameError: name 'gdm' is not defined