In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import timeit
import xgboost as xgb

In [2]:
train1  = pd.read_csv('train_2.csv')
missing = train1.isnull()
train1 = train1.fillna(0)
train1.iloc[:,1:] = train1.iloc[:,1:].astype(np.float32)
n = train1.shape[0]

In [3]:
a = train1.Page.str.split('mediawiki.org|wikimedia.org|wikipedia.org',expand=True)
b = np.where(a.loc[:,2].isnull(),a.loc[:,1],a.loc[:,2])
_,index_array = np.unique(b,return_inverse=True) # access & agent
website_array = [train1.Page.str.contains(web).values.astype(np.int8) \
                 for web in ['mediawiki.org','wikimedia.org','wikipedia.org']] # website
lang_array = train1.Page.str.extract('([a-z]{2}).wikipedia').fillna('nan').values.flatten()
_,lang_array = np.unique(lang_array,return_inverse=True) # language

In [4]:
Page_X = \
np.concatenate([OneHotEncoder(dtype=np.float32,sparse=False).fit_transform(lang_array[:,np.newaxis]),\
                OneHotEncoder(dtype=np.float32,sparse=False).fit_transform(index_array[:,np.newaxis]),\
                np.stack(website_array,1)],1)

In [18]:
date_index=pd.to_datetime(train1.columns[1:]).to_series().reset_index(drop=True)
def extract_date_info(timestamp,mss):
    return np.array([mss, timestamp.day, \
                     timestamp.dayofyear,timestamp.year]+\
                    [1 if i==timestamp.weekday() else 0 for i in range(7)]+\
                    [1 if i==timestamp.month else 0 for i in range(12)])

In [19]:
d = extract_date_info(date_index[3],3).shape[0]

In [21]:
max_ = lambda df,start,interval: np.max(df.iloc[:,start-interval:start].values,1)
min_ = lambda df,start,interval: np.min(df.iloc[:,start-interval:start].values,1)
std_ = lambda df,start,interval: np.std(df.iloc[:,start-interval:start].values,1)
mean_ = lambda df,start,interval: np.mean(df.iloc[:,start-interval:start].values,1)
growth_ = lambda df,start,interval: np.mean(df.iloc[:,start-interval:start],1) -\
                                    np.mean(df.iloc[:,start-2*interval:start-interval],1)

fun_list = [max_,min_,std_,mean_,growth_]
interval_list = [14,30,60]
T = train1.shape[1]

In [None]:
#margain_ = np.median(train1.iloc[:,1:].values,1)
#margain_ = np.where(margain_==0,-20,np.log(margain_))

In [22]:
downSampleRate = 0.5
j = T - 64
Lookback_X = np.stack([fun(train1,j,interval) for fun in fun_list for interval in interval_list],1)
testData = []
testMargain = []
for i in range(j+2,j+64):
    temp = np.concatenate([train1.iloc[:,i].values[:,np.newaxis],\
                           Page_X,\
                           np.broadcast_to(extract_date_info(date_index[i-1],i-j),(n,d)),\
                           Lookback_X,\
                           train1.iloc[:,j-1:j].values,\
                           train1.iloc[:,i-90:i-89].values,\
                           train1.iloc[:,i-120:i-119].values,\
                           train1.iloc[:,i-180:i-179].values,\
                           train1.iloc[:,i-365:i-364].values],1).astype(np.float32)
    nonmissing_index = np.logical_not(missing.iloc[:,i].values)
    random_index = np.random.rand(nonmissing_index.sum())<downSampleRate
    testData.append(temp[nonmissing_index][random_index])
    #testMargain.append(margain_[nonmissing_index][random_index])
    testMargain.append(train1.iloc[:,j-1].values[nonmissing_index][random_index])
testData = np.concatenate(testData,0)
testMargain = np.concatenate(testMargain,0)    

In [23]:
downSampleRate = 0.1
trainData = []
trainMargain = []
for j in range(T-65*2,350,-70):
    Lookback_X = np.stack([fun(train1,j,interval) for fun in fun_list for interval in interval_list],1)
    for i in range(j+2,j+64):
        temp = np.concatenate([train1.iloc[:,i].values[:,np.newaxis],\
                               Page_X,\
                               np.broadcast_to(extract_date_info(date_index[i-1],i-j),(n,d)),\
                               Lookback_X,\
                               train1.iloc[:,j-1:j].values,\
                               train1.iloc[:,i-90:i-89].values,\
                               train1.iloc[:,i-120:i-119].values,\
                               train1.iloc[:,i-180:i-179].values,\
                               train1.iloc[:,i-365:i-364].values],1).astype(np.float32)
        nonmissing_index = np.logical_not(missing.iloc[:,i].values)
        random_index = np.random.rand(nonmissing_index.sum())<downSampleRate
        trainData.append(temp[nonmissing_index][random_index])
        #trainMargain.append(margain_[nonmissing_index][random_index])
        trainMargain.append(train1.iloc[:,j-1].values[nonmissing_index][random_index])
trainData = np.concatenate(trainData,0)
trainMargain = np.concatenate(trainMargain,0)

In [24]:
# Data corruption when creating a a DMatrix with label being a non-contiguous ndarray #2554
dtrain = xgb.DMatrix(np.ascontiguousarray(trainData[:,1:]),label=np.ascontiguousarray(trainData[:,0]))
dtest = xgb.DMatrix(np.ascontiguousarray(testData[:,1:]),label=np.ascontiguousarray(testData[:,0]))

In [25]:
dtrain.set_base_margin(trainMargain)
dtest.set_base_margin(testMargain)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

after hyper-parameter tuning, find the best num_round

In [26]:
eps = 0.1
floor = 0.1
neg_grad = -1.0

In [27]:
def SMAPE_train(preds, dtrain):
    y = dtrain.get_label()
    temp = (2*y+eps)/(preds+y+eps)**2
    grad = np.where(preds>y,temp,np.where(preds<0,neg_grad,-1*temp))
    #index_ = np.isfinite(grad)
    #non_index_ = np.logical_not(index_)
    #print(preds[non_index_],y[non_index_])
    hess = np.ones_like(grad)
    return grad, hess

def SMAPE_eval(preds, dtrain):
    y = dtrain.get_label()
    summ = np.abs(y) + np.abs(preds)
    return 'SMAPE', 200*np.mean(np.where(summ==0, 0, np.abs(y - preds) / summ))

In [None]:
start_time = timeit.default_timer()
model_gbm = xgb.train(params, dtrain, num_round, watchlist, \
                      verbose_eval=100,\
                      obj=SMAPE_train, feval=SMAPE_eval,xgb_model=model_gbm)
print("--- %s seconds ---" % (timeit.default_timer() - start_time))

Fit final model with both train and test data

In [49]:
# Data corruption when creating a a DMatrix with label being a non-contiguous ndarray #2554
dtrain = xgb.DMatrix(np.ascontiguousarray(np.concatenate([trainData[:,1:],testData[:,1:]],0)),\
                     np.ascontiguousarray(np.concatenate([trainData[:,0],testData[:,0]],0)))

In [50]:
dtrain.set_base_margin(np.ascontiguousarray(np.concatenate([trainMargain,testMargain],0)))
watchlist = [(dtrain, 'train')]

In [33]:
num_round = 1000
params =   {'eta': 1e-1,
            'subsample': 0.05,
            'booster':'gblinear'}

In [34]:
start_time = timeit.default_timer()
model_linear = xgb.train(params, dtrain, num_round, watchlist, \
                      verbose_eval=100,\
                      obj=SMAPE_train, feval=SMAPE_eval)
print("--- %s seconds ---" % (timeit.default_timer() - start_time))

[0]	train-rmse:53776.8	train-SMAPE:50.2912
[100]	train-rmse:53776.8	train-SMAPE:48.7205
[200]	train-rmse:53776.8	train-SMAPE:48.2559
[300]	train-rmse:53776.8	train-SMAPE:48.1895
[400]	train-rmse:53776.8	train-SMAPE:48.1708
[500]	train-rmse:53776.8	train-SMAPE:48.1599
[600]	train-rmse:53776.8	train-SMAPE:48.1524
[700]	train-rmse:53776.8	train-SMAPE:48.1468
[800]	train-rmse:53776.8	train-SMAPE:48.1422
[900]	train-rmse:53776.8	train-SMAPE:48.1392
[999]	train-rmse:53776.8	train-SMAPE:48.1364
--- 887.6806653509993 seconds ---


In [52]:
margin_linear = model_linear.predict(dtrain)

In [53]:
# Data corruption when creating a a DMatrix with label being a non-contiguous ndarray #2554
dtrain = xgb.DMatrix(np.ascontiguousarray(np.concatenate([trainData[:,1:],testData[:,1:]],0)),\
                     np.ascontiguousarray(np.concatenate([trainData[:,0],testData[:,0]],0)))
dtrain.set_base_margin(np.ascontiguousarray(margin_linear))
watchlist = [(dtrain, 'train')]

In [54]:
num_round = 5000
params =   {'eta': 1e-1,
            'max_depth': 8, 
            'max_bin': 32,
            'subsample': 0.01,
            'colsample_bylevel': 0.75, 
            'min_child_weight':100,
            'tree_method':'gpu_hist',
            'gamma': 0}

In [56]:
start_time = timeit.default_timer()
model_gbm = xgb.train(params, dtrain, num_round, watchlist, \
                      verbose_eval=500,\
                      obj=SMAPE_train, feval=SMAPE_eval)
print("--- %s seconds ---" % (timeit.default_timer() - start_time))

[0]	train-rmse:53776.8	train-SMAPE:48.1303
[500]	train-rmse:53776.8	train-SMAPE:47.141
[1000]	train-rmse:53776.8	train-SMAPE:46.7705
[1500]	train-rmse:53776.8	train-SMAPE:46.5474
[2000]	train-rmse:53776.8	train-SMAPE:46.3921
[2500]	train-rmse:53776.8	train-SMAPE:46.2803
[3000]	train-rmse:53776.8	train-SMAPE:46.1924
[3500]	train-rmse:53776.8	train-SMAPE:46.1477
[4000]	train-rmse:53776.8	train-SMAPE:46.2271
[4500]	train-rmse:53776.8	train-SMAPE:46.1765
[4999]	train-rmse:53776.8	train-SMAPE:46.1405
--- 1983.0460360879988 seconds ---


In [None]:
# save model
model_gbm.save_model('GBMs/gbm1')

In [None]:
# load model
model_gbm = xgb.Booster()
model_gbm.load_model('GBMs/gbm1')

Make Submission

In [None]:
date_index = date_index.append(pd.Series(pd.date_range('2017-09-11','2017-11-13')))

In [None]:
date_index = date_index.reset_index(drop=True)

In [None]:
j = T
Lookback_X = np.stack([fun(train1,j,interval) for fun in fun_list for interval in interval_list],1)
testData = []
testMargain = []
mapping_index = []
for i in range(j+2,j+64):
    temp = np.concatenate([Page_X,\
                           np.broadcast_to(extract_date_info(date_index[i-1],i-j),(n,d)),\
                           Lookback_X,\
                           train1.iloc[:,j-1:j].values,\
                           train1.iloc[:,i-90:i-89].values,\
                           train1.iloc[:,i-120:i-119].values,\
                           train1.iloc[:,i-180:i-179].values,\
                           train1.iloc[:,i-365:i-364].values],1).astype(np.float32)

    testData.append(temp)
    testMargain.append(train1.iloc[:,j-1].values)
    mapping_index.append(train1.iloc[:,0].str.cat([str(date_index[i-1])[:10]]*n,'_'))
    
testData = np.concatenate(testData,0)
testMargain = np.concatenate(testMargain,0)
mapping_index = np.concatenate(mapping_index,0)

In [None]:
# Data corruption when creating a a DMatrix with label being a non-contiguous ndarray #2554
dtest = xgb.DMatrix(np.ascontiguousarray(testData))
dtest.set_base_margin(np.ascontiguousarray(testMargain))

In [None]:
yhat = np.maximum(model_gbm.predict(dtest,output_margin=True),0)

In [None]:
yhat = pd.DataFrame({'Page':mapping_index,'Visits':yhat})

In [None]:
key = pd.read_csv('key_2.csv')

In [None]:
key = key.merge(yhat,on='Page')

In [None]:
key.drop('Page',axis=1,inplace=True)

In [None]:
key.to_csv('Submissions/gbm1.csv',index=False)