In [1]:
import pandas as pd
import numpy as np
import timeit
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

In [2]:
# Load the raw training table; the first column is the page identifier and
# every remaining column is one date.
train1 = pd.read_csv(filepath_or_buffer='train_2.csv')

In [3]:
# Remember which cells were missing before they are filled with zeros, so that
# rows with an unknown target can be dropped later.
missing = train1.isna()

In [4]:
# Replace every missing value with 0 (the original mask lives in `missing`).
train1 = train1.fillna(value=0)

In [5]:
# Down-cast every column after the first (the page-name column) to float32.
# Assigning through ``.iloc[:, 1:] = ...`` writes the values into the existing
# columns, which can leave their original dtype untouched, so the frame is
# rebuilt with an explicit per-column dtype map instead.
train1 = train1.astype({col: np.float32 for col in train1.columns[1:]})

In [6]:
# Number of rows (pages) in the training frame.
n = len(train1)

Check the projection period and whether all pages to be predicted are contained in the training set

In [None]:
# Take the part of each key entry that precedes its `_....-..-..` suffix and
# compare the resulting set of page names with the pages in the training frame.
key = pd.read_csv('key_2.csv')
extract_page = np.unique(key['Page'].str.extract('(.+)_....-..-..'))
set(extract_page) == set(train1['Page'])
# True

In [10]:
# Count the distinct values formed by the last 10 characters of each key entry
# (presumably the date stamp, i.e. the number of days to forecast).
key = pd.read_csv('key_2.csv')
len(set(key['Page'].str[-10:]))

62

Extract Page information. Since xgboost cannot take categorical features directly, we need to "encode" them as numerical features

In [7]:
# Split each page string around the site name; the pieces land in separate
# columns of `a`.
a = train1['Page'].str.split(pat='mediawiki.org|wikimedia.org|wikipedia.org', expand=True)

In [8]:
# Use the third split piece when it exists, otherwise fall back to the second.
_third_piece_missing = a[2].isnull()
b = np.where(_third_piece_missing, a[1], a[2])

In [9]:
# Integer code for each distinct suffix in `b` (access & agent); only the
# inverse index returned by np.unique is kept.
index_array = np.unique(b, return_inverse=True)[1]

In [10]:
# One 0/1 int8 indicator vector per website, in the order listed below.
website_array = []
for web in ['mediawiki.org', 'wikimedia.org', 'wikipedia.org']:
    _contains_web = train1['Page'].str.contains(web)
    website_array.append(_contains_web.values.astype(np.int8))

In [11]:
# Two-letter code that precedes ".wikipedia" in the page string; pages without
# such a match get the placeholder string 'nan'.
_lang_codes = train1['Page'].str.extract('([a-z]{2}).wikipedia')
lang_array = _lang_codes.fillna('nan').values.flatten()

In [12]:
# Replace each language string by its integer code (the inverse index from
# np.unique).
lang_array = np.unique(lang_array, return_inverse=True)[1]

In [32]:
# Static per-page feature matrix: one-hot language, one-hot access/agent
# suffix, and the three website indicator columns, concatenated column-wise.
#
# The encoders are left on their default sparse output and densified with
# `.toarray()`. This avoids the `sparse=` keyword, which newer scikit-learn
# releases renamed to `sparse_output=` and later removed, so the cell runs on
# both old and current versions.
Page_X = np.concatenate(
    [OneHotEncoder(dtype=np.float32).fit_transform(lang_array[:, np.newaxis]).toarray(),
     OneHotEncoder(dtype=np.float32).fit_transform(index_array[:, np.newaxis]).toarray(),
     np.stack(website_array, 1)],
    1)

Extract date information

In [43]:
# Parse the date column labels (everything after the first column) into a
# Series of timestamps with a plain 0..k-1 index, so date_index[c - 1] is the
# date of column c of train1.
date_index = pd.Series(pd.to_datetime(train1.columns[1:]))

In [48]:
def extract_date_info(timestamp,mss):
    """Build the date feature vector for one timestamp.

    Parameters
    ----------
    timestamp : object exposing ``weekday()``, ``month``, ``day``,
        ``dayofyear`` and ``year`` (e.g. a pandas Timestamp).
    mss : value passed through unchanged as the first feature; the dataset
        loops in this notebook pass the column offset ``i - j``.

    Returns
    -------
    numpy.ndarray with six entries:
    ``[mss, weekday, month, day, dayofyear, year]``.
    """
    features = (
        mss,
        timestamp.weekday(),
        timestamp.month,
        timestamp.day,
        timestamp.dayofyear,
        timestamp.year,
    )
    return np.array(list(features))

In [49]:
# Number of date features produced by extract_date_info, measured on a sample
# call rather than hard-coded.
d = len(extract_date_info(date_index[0], 1))

In [53]:
# Expected column count of the assembled dataset: page features + date
# features + 20 + 5 + 1.
# NOTE(review): 20 presumably is 5 aggregate functions x 4 intervals, 5 the
# raw look-back columns and 1 the target column - confirm against the cells
# that define them below.
sum([Page_X.shape[1], d, 20, 5, 1])

47

Create training and test data for GBM

In [27]:
def _window(df, start, interval):
    """Return columns ``[start - interval, start)`` of ``df`` as a 2-D ndarray."""
    return df.iloc[:, start - interval:start].values


def max_(df, start, interval):
    """Row-wise maximum over the ``interval`` columns ending just before ``start``."""
    return np.max(_window(df, start, interval), 1)


def min_(df, start, interval):
    """Row-wise minimum over the ``interval`` columns ending just before ``start``."""
    return np.min(_window(df, start, interval), 1)


def std_(df, start, interval):
    """Row-wise (population) standard deviation over the same window."""
    return np.std(_window(df, start, interval), 1)


def mean_(df, start, interval):
    """Row-wise mean over the ``interval`` columns ending just before ``start``."""
    return np.mean(_window(df, start, interval), 1)


def growth_(df, start, interval):
    """Row-wise mean of the latest window minus the mean of the window before it.

    Both windows are ``interval`` columns wide: the latest one covers
    ``[start - interval, start)`` and the previous one
    ``[start - 2*interval, start - interval)``.

    Like the other aggregations this works on the raw ndarray and returns an
    ndarray; the earlier version averaged the DataFrame slice itself, which
    returned a pandas Series and went through pandas' NaN-skipping mean.
    """
    latest = _window(df, start, interval)
    previous = _window(df, start - interval, interval)
    return np.mean(latest, 1) - np.mean(previous, 1)

In [12]:
# Aggregations applied to every look-back window; the order here fixes the
# order of the resulting feature columns.
fun_list = [
    max_,
    min_,
    std_,
    mean_,
    growth_,
]

In [13]:
# Window widths, in columns (one column per date), over which each aggregation
# in fun_list is evaluated.
interval_list = [14,30,60,120]

In [38]:
# Total number of columns in train1 (the page column plus one per date).
T = len(train1.columns)

In [None]:
# Number of raw columns immediately before the cut-off column j that are
# appended to every sample as features.
lookbackperiod = 5

Test dataset

In [230]:
# Cut-off column for the held-out set: features are computed from columns
# before j, and the targets are columns j+2 .. j+63 (see the loop below).
j = T - 64

In [231]:
# Evaluate every aggregation on every window width ending at column j and
# stack the results as feature columns (function-major, interval-minor order).
_lookback_columns = []
for fun in fun_list:
    for interval in interval_list:
        _lookback_columns.append(fun(train1, j, interval))
Lookback_X = np.stack(_lookback_columns, axis=1)

In [232]:
# Assemble the held-out dataset: one block of rows per target column i, laid
# out as [target | page features | date features | look-back aggregates |
# last `lookbackperiod` raw columns before j].
# The raw look-back slice does not depend on i, so it is taken once.
_recent_X = train1.iloc[:, j - lookbackperiod:j].values
testData = []
for i in range(j+2,j+64):
    # date_index is offset by one relative to train1's columns (no page column).
    date_feats = extract_date_info(date_index[i-1], i-j)
    temp = np.concatenate([train1.iloc[:,i].values[:,np.newaxis],
                           Page_X,
                           # Broadcast to the real feature count; a hard-coded
                           # width of 4 no longer matches the six values that
                           # extract_date_info returns and raises ValueError.
                           np.broadcast_to(date_feats, (n, date_feats.shape[0])),
                           Lookback_X,
                           _recent_X],1).astype(np.float32)
    # Keep only rows whose target was actually observed.
    nonmissing_index = np.logical_not(missing.iloc[:,i].values)
    testData.append(temp[nonmissing_index])
testData = np.concatenate(testData,0)

Training dataset

In [235]:
# Each training row with an observed target is kept when a uniform random
# draw falls below this rate.
downSampleRate = 0.1

In [242]:
# Assemble the training dataset from several earlier cut-off columns j, using
# the same row layout as the held-out set:
# [target | page features | date features | look-back aggregates |
#  last `lookbackperiod` raw columns before j].
# NOTE: this loop rebinds the notebook-level names `j` and `Lookback_X`.
# A fixed seed makes the random down-sampling reproducible across runs.
_sampler = np.random.RandomState(0)
trainData = []
for j in range(T-65*2,250,-80):
    Lookback_X = np.stack([fun(train1,j,interval) for fun in fun_list for interval in interval_list],1)
    # The raw look-back slice only depends on j, so take it once per cut-off.
    _recent_X = train1.iloc[:, j - lookbackperiod:j].values
    for i in range(j+2,j+64):
        # date_index is offset by one relative to train1's columns.
        date_feats = extract_date_info(date_index[i-1], i-j)
        temp = np.concatenate([train1.iloc[:,i].values[:,np.newaxis],
                               Page_X,
                               # Broadcast to the real feature count; a
                               # hard-coded width of 4 no longer matches the
                               # six values extract_date_info returns.
                               np.broadcast_to(date_feats, (n, date_feats.shape[0])),
                               Lookback_X,
                               _recent_X],1).astype(np.float32)
        # Drop rows whose target was missing, then keep a random subset.
        nonmissing_index = np.logical_not(missing.iloc[:,i].values)
        keep = _sampler.rand(nonmissing_index.sum()) < downSampleRate
        trainData.append(temp[nonmissing_index][keep])
trainData = np.concatenate(trainData,0)

In [None]:
# Constant added in both the numerator and denominator of the SMAPE_train
# gradient.
eps = 0.5
# Predictions (after exponentiation) below this value are set to 0 in
# SMAPE_eval.
floor = 0.1

In [None]:
def SMAPE_train(preds, dtrain):
    """Custom objective passed to xgboost as ``obj``.

    ``preds`` are exponentiated before being compared with the labels from
    ``dtrain.get_label()``. The gradient magnitude is
    ``(2*y + eps) * yhat / (yhat + y + eps)**2``; it is positive where the
    exponentiated prediction exceeds the label and negative otherwise. The
    Hessian is a constant array of ones.

    Reads the notebook-level constant ``eps``.

    Returns
    -------
    (grad, hess) : two arrays of the same shape.
    """
    target = dtrain.get_label()
    pred_level = np.exp(preds)
    denominator = (pred_level + target + eps) ** 2
    magnitude = (2 * target + eps) * pred_level / denominator
    over_prediction = pred_level > target
    gradient = np.where(over_prediction, magnitude, -1 * magnitude)
    hessian = np.ones_like(gradient)
    return gradient, hessian

def SMAPE_eval(preds, dtrain):
    """Custom evaluation metric passed to xgboost as ``feval``.

    ``preds`` are exponentiated, values below the notebook-level constant
    ``floor`` are set to 0, and the metric is
    ``200 * mean(|y - yhat| / (|y| + |yhat|))`` with the ratio defined as 0
    wherever the denominator is 0.

    Returns
    -------
    ('SMAPE', value) : metric name and its value.
    """
    y = dtrain.get_label()
    yhat = np.exp(preds)
    yhat = np.where(yhat<floor,0,yhat)
    abs_err = np.abs(y - yhat)
    summ = np.abs(y) + np.abs(yhat)
    # Divide only where the denominator is non-zero. Dividing everywhere and
    # masking afterwards evaluates 0/0 and emits a RuntimeWarning on every
    # evaluation round.
    ratio = np.zeros_like(summ)
    np.divide(abs_err, summ, out=ratio, where=summ != 0)
    return 'SMAPE', 200*np.mean(ratio)

In [None]:
# Split the assembled arrays into target and features. Column 0 of
# trainData/testData is the target value; every remaining column is a feature.
# These names were previously used without being defined anywhere in the
# notebook, so the cell failed on a fresh kernel.
X_train, y_train = trainData[:, 1:], trainData[:, 0]
X_test, y_test = testData[:, 1:], testData[:, 0]
dtrain = xgb.DMatrix(X_train,y_train)
dtest = xgb.DMatrix(X_test,y_test)
# xgboost bases early stopping on the LAST entry of the evaluation list, so
# the held-out set must come last; otherwise training is stopped according to
# the training-set metric.
watchlist = [(dtrain, 'train'), (dtest, 'eval')]

In [None]:
# Number of boosting rounds handed to xgb.train (training may end earlier
# because early stopping is enabled there).
num_round = 1000

In [None]:
# Booster configuration handed to xgb.train.
params = dict(
    eta=1e-2,
    max_leaves=48,
    max_bin=32,
    subsample=0.01,
    colsample_bylevel=0.5,
    min_child_weight=100,
    tree_method='hist',
)

In [None]:
# Train the booster with the custom SMAPE objective and metric, report
# progress every 100 rounds, stop after 10 rounds without improvement, and
# print the wall-clock duration.
start_time = timeit.default_timer()
model_gbm = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    obj=SMAPE_train,
    feval=SMAPE_eval,
    early_stopping_rounds=10,
    verbose_eval=100,
)
_elapsed = timeit.default_timer() - start_time
print("--- %s seconds ---" % _elapsed)