In [None]:
import pandas as pd
import numpy as np
import cdb
from sklearn.preprocessing import OneHotEncoder
import timeit
import xgboost as xgb

In [None]:
# Read the raw training table from the 'mrg' store into a DataFrame.
# NOTE(review): `cdb` is an environment-specific module and the path is hard-coded.
train_path = "/e737253/train_2.csv"
store = cdb.use('mrg')
with store.open(train_path, 'rb') as csv_file:
    train1 = pd.read_csv(csv_file)

In [None]:
# Row count; filling values below does not change it.
n = len(train1)
# Boolean mask of the cells that are empty in the table as loaded. It must be
# taken before fillna: re-running this cell after train1 has been filled
# produces an all-False mask.
missing = train1.isnull()
train1 = train1.fillna(0)
# Column 0 is skipped here; every other column is cast to float32.
# NOTE(review): depending on the pandas version, assigning through .iloc may
# keep the existing column dtype, so the cast may not take effect - confirm.
train1.iloc[:, 1:] = train1.iloc[:, 1:].astype(np.float32)

In [None]:
# Split each page name around the site domain; the last non-null piece is
# integer-coded below (the original author labels it "access & agent").
a = train1.Page.str.split('mediawiki.org|wikimedia.org|wikipedia.org', expand=True)
b = np.where(a[2].isnull(), a[1], a[2])
index_array = np.unique(b, return_inverse=True)[1]  # access & agent

# One 0/1 indicator per site name found in the page string (website).
website_array = []
for web in ['mediawiki.org', 'wikimedia.org', 'wikipedia.org']:
    website_array.append(train1.Page.str.contains(web).values.astype(np.int8))

# Two-letter code captured just before "wikipedia"; pages without a match get
# the placeholder 'nan'. The codes are then integer-coded (language).
lang_codes = train1.Page.str.extract('([a-z]{2}).wikipedia').fillna('nan').values.flatten()
lang_array = np.unique(lang_codes, return_inverse=True)[1]

In [None]:
# Static per-page feature matrix: one-hot language, one-hot access/agent code,
# and the three website indicator columns, concatenated column-wise.
lang_onehot = OneHotEncoder(dtype=np.float32, sparse=False).fit_transform(lang_array[:, np.newaxis])
index_onehot = OneHotEncoder(dtype=np.float32, sparse=False).fit_transform(index_array[:, np.newaxis])
website_flags = np.stack(website_array, 1)
Page_X = np.concatenate([lang_onehot, index_onehot, website_flags], 1)

In [None]:
# Parse every column label after the first as a date and store them in a
# 0-based Series, so date_index[k] is the date of column k+1 of train1.
parsed_dates = pd.to_datetime(train1.columns[1:])
date_index = parsed_dates.to_series().reset_index(drop=True)
def extract_date_info(timestamp,mss):
    """Return the calendar features of one date as a 1-D array.

    The result is ``[mss, weekday, month, day, dayofyear, year]``: ``mss`` is
    passed through unchanged and the rest are read from ``timestamp``, which
    must provide ``weekday()`` and the ``month``/``day``/``dayofyear``/``year``
    attributes (e.g. a ``pd.Timestamp``).
    """
    features = [mss, timestamp.weekday()]
    for field in ('month', 'day', 'dayofyear', 'year'):
        features.append(getattr(timestamp, field))
    return np.array(features)

In [None]:
# Number of date features per row, measured on one sample call.
d = len(extract_date_info(date_index[3], 3))

In [None]:
def _window(df, start, interval):
    """Columns ``start-interval`` .. ``start-1`` of ``df`` (positional slice)."""
    return df.iloc[:, start - interval:start]


def max_(df, start, interval):
    """Row-wise maximum over the ``interval`` columns ending just before ``start``."""
    return np.max(_window(df, start, interval).values, 1)


def min_(df, start, interval):
    """Row-wise minimum over the ``interval`` columns ending just before ``start``."""
    return np.min(_window(df, start, interval).values, 1)


def std_(df, start, interval):
    """Row-wise standard deviation over the ``interval`` columns ending just before ``start``."""
    return np.std(_window(df, start, interval).values, 1)


def mean_(df, start, interval):
    """Row-wise mean over the ``interval`` columns ending just before ``start``."""
    return np.mean(_window(df, start, interval).values, 1)


def growth_(df, start, interval):
    """Row-wise mean of the latest window minus the mean of the window before it.

    Unlike the other statistics this averages the DataFrame slices directly
    (not their ``.values``), so the result is a pandas object rather than an
    ndarray.
    """
    recent_mean = np.mean(_window(df, start, interval), 1)
    previous_mean = np.mean(_window(df, start - interval, interval), 1)
    return recent_mean - previous_mean

# Every statistic is evaluated for every window length when building the
# look-back features (len(fun_list) * len(interval_list) columns).
interval_list = [14, 30, 60, 120]
fun_list = [max_, min_, std_, mean_, growth_]
# Total number of columns in train1, including the leading non-numeric column.
T = len(train1.columns)

In [None]:
#margain_ = np.median(train1.iloc[:,1:].values,1)
#margain_ = np.where(margain_==0,-20,np.log(margain_))

In [None]:
# Validation set. Look-back features use columns before j; the targets are
# columns j+2 .. j+63, which with j = T - 64 are the last 62 columns of train1.
downSampleRate = 0.5
j = T - 64
# One column per (statistic, window length) pair, each window ending at column j-1.
Lookback_X = np.stack(
    [fun(train1, j, interval) for fun in fun_list for interval in interval_list],
    1)
testData = []
testMargain = []
for i in range(j + 2, j + 64):
    # Single-column slices located 90/120/180/365 columns before the target column.
    lagged_cols = [train1.iloc[:, i - lag:i - lag + 1].values
                   for lag in (90, 120, 180, 365)]
    row_blocks = [train1.iloc[:, i].values[:, np.newaxis],  # target, kept as column 0
                  Page_X,
                  np.broadcast_to(extract_date_info(date_index[i - 1], i - j), (n, d)),
                  Lookback_X,
                  train1.iloc[:, j - 1:j].values]            # last column before j
    features = np.concatenate(row_blocks + lagged_cols, 1).astype(np.float32)
    # Keep only rows whose target was not missing, then keep a random subset
    # of those rows with probability downSampleRate.
    observed = np.logical_not(missing.iloc[:, i].values)
    sampled = np.random.rand(observed.sum()) < downSampleRate
    testData.append(features[observed][sampled])
    # The value in column j-1 is collected alongside each kept row.
    testMargain.append(train1.iloc[:, j - 1].values[observed][sampled])
testData = np.concatenate(testData, 0)
testMargain = np.concatenate(testMargain, 0)

In [None]:
# Training set: the same row layout as the validation set, built for several
# earlier cut-off columns j (stepping back 70 columns at a time).
# NOTE(review): the 365-column lag needs i - 365 >= 1 for every window; a
# smaller index would select column 0 or wrap around from the end. Confirm the
# lower bound 350 guarantees this for the actual T.
downSampleRate = 0.1
trainData = []
trainMargain = []
for j in range(T - 65*2, 350, -70):
    # One column per (statistic, window length) pair, each window ending at column j-1.
    Lookback_X = np.stack(
        [fun(train1, j, interval) for fun in fun_list for interval in interval_list],
        1)
    cutoff_values = train1.iloc[:, j - 1].values
    for i in range(j + 2, j + 64):
        # Single-column slices located 90/120/180/365 columns before the target column.
        lagged_cols = [train1.iloc[:, i - lag:i - lag + 1].values
                       for lag in (90, 120, 180, 365)]
        row_blocks = [train1.iloc[:, i].values[:, np.newaxis],  # target, kept as column 0
                      Page_X,
                      np.broadcast_to(extract_date_info(date_index[i - 1], i - j), (n, d)),
                      Lookback_X,
                      train1.iloc[:, j - 1:j].values]            # last column before j
        features = np.concatenate(row_blocks + lagged_cols, 1).astype(np.float32)
        # Keep only rows whose target was not missing, then keep a random
        # subset of those rows with probability downSampleRate.
        observed = np.logical_not(missing.iloc[:, i].values)
        sampled = np.random.rand(observed.sum()) < downSampleRate
        trainData.append(features[observed][sampled])
        # The value in column j-1 is collected alongside each kept row.
        trainMargain.append(cutoff_values[observed][sampled])
trainData = np.concatenate(trainData, 0)
trainMargain = np.concatenate(trainMargain, 0)

In [None]:
# xgboost issue #2554: data corruption when creating a DMatrix with a
# non-contiguous ndarray label, so every slice is made contiguous first.
# Column 0 of each array is the target; the remaining columns are features.
train_features = np.ascontiguousarray(trainData[:, 1:])
train_labels = np.ascontiguousarray(trainData[:, 0])
dtrain = xgb.DMatrix(train_features, label=train_labels)

test_features = np.ascontiguousarray(testData[:, 1:])
test_labels = np.ascontiguousarray(testData[:, 0])
dtest = xgb.DMatrix(test_features, label=test_labels)

In [None]:
# Attach the collected column-(j-1) values as each matrix's base margin.
for matrix, margin in ((dtrain, trainMargain), (dtest, testMargain)):
    matrix.set_base_margin(margin)
# Evaluation sets reported during training, in this order.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

Hyper-parameter tuning with random search

In [None]:
# Search space: each entry maps a parameter name to [sampler, candidates];
# a value is drawn by calling sampler(candidates).
_choice = np.random.choice
para_gen = {
    'eps': [_choice, [0.1, 1, 5, 10, 50]],
    'neg_grad': [_choice, [-0.1, -1, -5, -10, -50]],
    'max_depth': [_choice, [4, 8, 12, 16, 24, 32]],
    'subsample': [_choice, [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05]],
    'colsample_bylevel': [_choice, [0.5, 0.75, 1]],
    'gamma': [_choice, [0.1, 1, 10, 100, 1000]],
}

In [None]:
def generate_(para_gen):
    """Draw one value per parameter.

    ``para_gen`` maps each name to a sequence whose first element is a callable
    and whose second element is the argument passed to it. Returns a dict with
    the same keys and the call results as values.
    """
    drawn = {}
    for key, item in para_gen.items():
        sampler, candidates = item[0], item[1]
        drawn[key] = sampler(candidates)
    return drawn

In [None]:
def RandomSearch(fun,para_gen,dtrain,dtest,iterations):
    """Run ``iterations`` random trials and print each one.

    For every trial a parameter dict is drawn with ``generate_(para_gen)``;
    ``fun`` is called with those parameters plus ``dtrain`` and ``dtest`` as
    keyword arguments, and its return value is stored under ``'score'`` in the
    printed dict. Nothing is returned.
    """
    for _ in range(iterations):
        paras = generate_(para_gen)
        # The data arguments are added to a copy so they are not printed.
        call_kwargs = dict(paras, dtrain=dtrain, dtest=dtest)
        paras['score'] = fun(**call_kwargs)
        print(paras)

In [None]:
def fun_xgb(dtrain,dtest,eps,neg_grad,max_depth,subsample,colsample_bylevel,gamma):
    """Train a 500-round booster with a custom SMAPE-style objective and score it.

    Parameters
    ----------
    dtrain, dtest
        Training and evaluation matrices; both must expose ``get_label()``
        (``xgb.DMatrix`` in this notebook).
    eps
        Smoothing constant added to the numerator and denominator of the gradient.
    neg_grad
        Constant gradient used where the prediction is negative and not above
        the label.
    max_depth, subsample, colsample_bylevel, gamma
        Forwarded unchanged to xgboost.

    Returns
    -------
    The SMAPE (scaled by 200) of the model's predictions on ``dtest``.
    """

    def SMAPE_train(preds, dtrain):
        # Custom objective: returns (gradient, hessian); the hessian is fixed at 1.
        y = dtrain.get_label()
        temp = (2*y+eps)/(preds+y+eps)**2
        grad = np.where(preds>y,temp,np.where(preds<0,neg_grad,-1*temp))
        hess = np.ones_like(grad)
        return grad, hess

    def _smape_score(preds, y):
        # Computed locally so this function does not depend on a SMAPE_eval
        # that is only defined in a later cell (NameError on Restart & Run All).
        # Rows where both prediction and label are zero contribute 0.
        summ = np.abs(y) + np.abs(preds)
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = np.where(summ==0, 0, np.abs(y - preds) / summ)
        return 200*np.mean(ratio)

    num_round = 500
    params =   {'eta': 1e-1,
                'max_bin': 32,
                'nthread':8,
                'tree_method':'hist',
                'max_depth': max_depth,
                'subsample': subsample,
                'colsample_bylevel': colsample_bylevel,
                'gamma': gamma
                }

    model_gbm = xgb.train(params, dtrain, num_round, obj=SMAPE_train,verbose_eval=False)
    return _smape_score(model_gbm.predict(dtest), dtest.get_label())

In [None]:
# 100 random trials; each trial prints its sampled parameters and score.
RandomSearch(fun=fun_xgb, para_gen=para_gen, dtrain=dtrain, dtest=dtest, iterations=100)

After hyper-parameter tuning, find the best num_round

In [None]:
# Module-level constants read by SMAPE_train.
eps = 0.1        # smoothing term in the gradient
neg_grad = -1.0  # gradient used for negative predictions
# NOTE(review): `floor` is not referenced anywhere in the visible notebook.
floor = 0.1

In [None]:
def SMAPE_train(preds, dtrain):
    """Custom xgboost objective: return ``(grad, hess)`` for a SMAPE-style loss.

    Reads the module-level ``eps`` and ``neg_grad``. The gradient is ``+m``
    where the prediction exceeds the label, ``neg_grad`` where the prediction
    is negative (and not above the label), and ``-m`` otherwise, with
    ``m = (2*y + eps) / (preds + y + eps)**2``. The hessian is all ones.
    """
    y = dtrain.get_label()
    magnitude = (2 * y + eps) / (preds + y + eps) ** 2
    # Applied from lowest to highest priority, so later rules win.
    grad = -1 * magnitude
    grad = np.where(preds < 0, neg_grad, grad)
    grad = np.where(preds > y, magnitude, grad)
    hess = np.ones_like(grad)
    return grad, hess

def SMAPE_eval(preds, dtrain):
    """Evaluation metric: return ``('SMAPE', value)``.

    ``value`` is 200 times the mean of ``|y - preds| / (|y| + |preds|)``, where
    ``y`` comes from ``dtrain.get_label()``; rows where the denominator is zero
    count as 0.
    """
    y = dtrain.get_label()
    denominator = np.abs(y) + np.abs(preds)
    per_row = np.where(denominator == 0, 0, np.abs(y - preds) / denominator)
    return 'SMAPE', 200 * np.mean(per_row)

In [None]:
# Settings for the validation run.
num_round = 5000
params = dict(
    eta=1e-1,
    max_depth=8,
    max_bin=32,
    subsample=0.025,
    colsample_bylevel=0.75,
    min_child_weight=100,
    tree_method='hist',
    gamma=0,
    nthread=8,
)

In [None]:
# Train with the custom objective, reporting the custom SMAPE metric on every
# watchlist entry each 100 rounds, and print the wall-clock time.
start_time = timeit.default_timer()
model_gbm = xgb.train(params, dtrain, num_round, watchlist,
                      obj=SMAPE_train, feval=SMAPE_eval,
                      verbose_eval=100)
elapsed_seconds = timeit.default_timer() - start_time
print("--- %s seconds ---" % elapsed_seconds)

Fit final model with both train and test data

In [None]:
# xgboost issue #2554: data corruption when creating a DMatrix with a
# non-contiguous ndarray label, so both arrays are made contiguous first.
# NOTE: this rebinds `dtrain` to the train + validation rows; earlier cells
# that expect the train-only matrix must not be re-run after this one.
all_features = np.concatenate([trainData[:, 1:], testData[:, 1:]], 0)
all_labels = np.concatenate([trainData[:, 0], testData[:, 0]], 0)
dtrain = xgb.DMatrix(np.ascontiguousarray(all_features),
                     np.ascontiguousarray(all_labels))

In [None]:
# Base margins in the same row order as the combined matrix (train rows first).
all_margins = np.concatenate([trainMargain, testMargain], 0)
dtrain.set_base_margin(np.ascontiguousarray(all_margins))
# Only the training matrix is left to report on.
watchlist = [(dtrain, 'train')]

In [None]:
# Settings for the final fit; they differ from the validation run only in
# `subsample` (0.01 instead of 0.025).
num_round = 5000
params = dict(
    eta=1e-1,
    max_depth=8,
    max_bin=32,
    subsample=0.01,
    colsample_bylevel=0.75,
    min_child_weight=100,
    tree_method='hist',
    gamma=0,
    nthread=8,
)

In [None]:
# Fit the final model, reporting the custom SMAPE metric every 500 rounds,
# and print the wall-clock time.
start_time = timeit.default_timer()
model_gbm = xgb.train(params, dtrain, num_round, watchlist,
                      obj=SMAPE_train, feval=SMAPE_eval,
                      verbose_eval=500)
elapsed_seconds = timeit.default_timer() - start_time
print("--- %s seconds ---" % elapsed_seconds)

Make Submission

In [None]:
# Extend the calendar with the dates 2017-09-11 .. 2017-11-13.
# The original bare `date_index.append(...)` discarded its result, so
# date_index was never extended (Series.append returns a new object and was
# removed in pandas 2.0). The Series is rebuilt from the training columns
# rather than from the current date_index so re-running this cell does not
# append the range twice; ignore_index=True keeps the positions contiguous,
# which the positional lookups date_index[i-1] rely on.
date_index = pd.concat(
    [pd.to_datetime(train1.columns[1:]).to_series(),
     pd.Series(pd.date_range('2017-09-11', '2017-11-13'))],
    ignore_index=True)
date_index.tail()

In [None]:
# Feature rows for prediction: the same column layout as the training rows but
# without the leading target column, and with no row filtering or sampling.
j = T - 64 # TODO: change this
# One column per (statistic, window length) pair, each window ending at column j-1.
Lookback_X = np.stack(
    [fun(train1, j, interval) for fun in fun_list for interval in interval_list],
    1)
testData = []
testMargain = []
mapping_index = []
for i in range(j + 2, j + 64):
    target_date = date_index[i - 1]
    # Single-column slices located 90/120/180/365 columns before column i.
    lagged_cols = [train1.iloc[:, i - lag:i - lag + 1].values
                   for lag in (90, 120, 180, 365)]
    row_blocks = [Page_X,
                  np.broadcast_to(extract_date_info(target_date, i - j), (n, d)),
                  Lookback_X,
                  train1.iloc[:, j - 1:j].values]   # last column before j
    temp = np.concatenate(row_blocks + lagged_cols, 1).astype(np.float32)

    testData.append(temp)
    testMargain.append(train1.iloc[:, j - 1].values)
    # Row key: "<column-0 value>_<YYYY-MM-DD>" for every page.
    date_label = str(target_date)[:10]
    mapping_index.append(train1.iloc[:, 0].str.cat([date_label] * n, '_'))

testData = np.concatenate(testData, 0)
testMargain = np.concatenate(testMargain, 0)
mapping_index = np.concatenate(mapping_index, 0)

In [None]:
# xgboost issue #2554: data corruption when creating a DMatrix with a
# non-contiguous ndarray label; arrays are made contiguous before use.
# No label is attached: this matrix is only used for prediction.
submission_features = np.ascontiguousarray(testData)
submission_margin = np.ascontiguousarray(testMargain)
dtest = xgb.DMatrix(submission_features)
dtest.set_base_margin(submission_margin)

In [None]:
# Raw margin output of the model for every submission row.
yhat = model_gbm.predict(dtest, output_margin=True)

In [231]:
# Pair each row key with its prediction; the frame is the cell's displayed output.
# NOTE(review): the prediction column has an empty-string name - confirm the intended header.
submission_columns = {'mapping_index': mapping_index, '': yhat}
pd.DataFrame(submission_columns)