In [1]:
import pandas as pd
import numpy as np
import gc

from dateutil.relativedelta import relativedelta

In [2]:
# Training series: "date" is parsed to datetime, "Visits" is read as float32.
# Rows are restricted to 2016-03-01 .. 2016-08-31 (both ends inclusive) and
# remaining missing values are replaced with 0.
train = pd.read_csv("../data/new_train.csv", parse_dates=["date"], dtype={"Page": str, "Visits": 'float32'})
train = train.loc[(train["date"] >= '2016-03-01') & (train["date"] <= '2016-08-31')].fillna(0)

# Test series: same parsing and zero-fill, no date restriction.
test = pd.read_csv("../data/new_test.csv", parse_dates=["date"], dtype={"Page": str, "Visits": 'float32'}).fillna(0)

# Work in log1p space from here on; the result is widened to float64.
for visits_frame in (train, test):
    visits_frame["Visits"] = np.log1p(visits_frame["Visits"]).astype("float64")

In [3]:
def create_features(df, month, target=None):
    """Attach lagged-visit and calendar features to a set of target rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format history with at least "Page", "date" (datetime) and
        "Visits" columns. The lag features are always computed from this frame.
    month : str or datetime-like
        Anchor date (anything ``pd.to_datetime`` accepts).
    target : pandas.DataFrame, optional
        Rows to build features for; must contain "Page" and "date". When it
        is not a DataFrame (e.g. ``None``), the rows of ``df`` dated from the
        anchor through anchor + 2 months (both ends inclusive) are used.

    Returns
    -------
    pandas.DataFrame
        ``target`` left-joined on "Page" with one ``lag_k`` column per
        distinct date in the history window (``lag_1`` is the oldest date),
        plus "Month" (month number minus the smallest month number present
        in the result), "Day" and "DayOfWeek" derived from "date".
    """
    anchor = pd.to_datetime(month)

    # isinstance also accepts DataFrame subclasses, unlike an exact type() comparison.
    if not isinstance(target, pd.DataFrame):
        in_target = (df["date"] >= anchor) & (df["date"] <= anchor + pd.DateOffset(months=2))
        target = df.loc[in_target]

    # History window: [anchor - 40 days, anchor - 10 days).
    # Both sides of the mask come from `df` itself, so the function no longer
    # depends on (or misaligns against) a notebook-level `train` frame.
    in_history = (df["date"] >= anchor - pd.DateOffset(days=40)) & (df["date"] < anchor - pd.DateOffset(days=10))
    history = df.loc[in_history]

    # One row per Page, one column per date (ascending); duplicates are
    # averaged by pivot_table's default aggregation.
    lags = pd.pivot_table(history, index="Page", values="Visits", columns="date")
    lags.columns = ["lag_{}".format(k) for k in range(1, lags.shape[1] + 1)]
    lags = lags.reset_index()

    # Left join keeps every target row; pages without history get NaN lags.
    features = target.merge(lags, on="Page", how="left")

    month_number = features["date"].dt.month
    features["Month"] = month_number - month_number.min()
    features["Day"] = features["date"].dt.day
    features["DayOfWeek"] = features["date"].dt.dayofweek
    return features
    

In [4]:
# Anchor dates for the two feature frames. Both are cut from `train`; each
# target window runs from the anchor through anchor + 2 months inclusive, so
# the boundary day 2016-06-10 appears in both frames.
TRAIN_ANCHOR = '2016-04-10'
VAL_ANCHOR = '2016-06-10'

new_train = create_features(df=train, month=TRAIN_ANCHOR)
new_val = create_features(df=train, month=VAL_ANCHOR)

In [5]:
# Preview the first rows of the training feature frame (target columns, lag_* columns, calendar columns).
new_train.head()

Unnamed: 0,Page,date,Visits,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,...,lag_24,lag_25,lag_26,lag_27,lag_28,lag_29,lag_30,Month,Day,DayOfWeek
0,2NE1_zh.wikipedia.org_all-access_spider,2016-04-10,4.276666,2.944439,3.7612,2.772589,1.791759,3.091043,4.043051,2.302585,...,2.197225,2.772589,2.484907,3.044523,4.094345,2.484907,2.944439,0,10,6
1,2PM_zh.wikipedia.org_all-access_spider,2016-04-10,2.995732,2.890372,3.135494,3.295837,2.833213,2.833213,3.367296,2.995732,...,2.70805,2.772589,2.70805,2.772589,3.367296,3.610918,3.178054,0,10,6
2,3C_zh.wikipedia.org_all-access_spider,2016-04-10,2.197225,1.791759,1.098612,0.693147,1.609438,2.079442,1.098612,1.098612,...,2.302585,1.791759,0.693147,1.94591,2.079442,1.609438,1.94591,0,10,6
3,4minute_zh.wikipedia.org_all-access_spider,2016-04-10,2.70805,2.397895,2.564949,2.197225,2.833213,2.639057,2.197225,2.890372,...,2.397895,2.397895,2.833213,2.564949,2.564949,2.639057,3.258096,0,10,6
4,5566_zh.wikipedia.org_all-access_spider,2016-04-10,2.890372,2.833213,2.397895,2.484907,2.944439,3.988984,5.365976,3.044523,...,2.484907,2.484907,2.197225,2.484907,2.70805,2.890372,2.639057,0,10,6


In [6]:
# Model inputs: the 30 lag columns (lag_1 .. lag_30) followed by the calendar features.
train_cols = ['lag_{}'.format(k) for k in range(1, 31)]
train_cols += ['Month', 'Day', 'DayOfWeek']

In [7]:
# Report the first and last target date of each split (YYYY-MM-DD, i.e. the
# first 10 characters of the timestamp's string form).
for template, frame in (("Train:      {} - {}", new_train), ("Validation: {} - {}", new_val)):
    first_day = str(frame["date"].min())[:10]
    last_day = str(frame["date"].max())[:10]
    print(template.format(first_day, last_day))

Train:      2016-04-10 - 2016-06-10
Validation: 2016-06-10 - 2016-08-10


In [8]:
import lightgbm as lgb
# LightGBM training parameters, passed unchanged to lgb.train below.
param = {
    'application': 'regression_l2',
    'learning_rate': 0.1,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'max_depth': 5,
    'num_threads': 4,
    'verbose': 0,
}

from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    """Return 200/n * sum(|a - b| / (a + b)) over paired elements.

    Predictions below 1 are treated as 0 before scoring. Pairs whose sum is
    exactly 0 add nothing to the sum but still count towards n, the length
    of ``y_true``. Written as an explicit index loop for numba's ``jit``.
    """
    n = y_true.shape[0]
    total = 0
    for idx in range(n):
        actual = y_true[idx]
        forecast = y_pred[idx]
        if forecast < 1:
            forecast = 0
        denom = actual + forecast
        if denom != 0:
            total += math.fabs(actual - forecast) / denom
    total *= (200.0 / n)
    return total

def lgb_smape(preds, df):
    """Custom evaluation function handed to ``lgb.train`` via ``feval``.

    Labels (read with ``df.get_label()``) and predictions are both mapped
    back from log1p space with ``expm1`` and scored with ``smape_fast``.

    Returns the tuple ``('smape', score, False)``; the trailing ``False`` is
    presumably LightGBM's "higher is better" flag -- confirm against the
    installed LightGBM version.
    """
    actual = np.expm1(np.array(df.get_label()))
    forecast = np.expm1(np.array(preds))
    score = smape_fast(actual, forecast)
    return 'smape', score, False

In [9]:
# Fit on the training window and monitor SMAPE on both the training and the
# validation window; up to 1000 rounds, with early stopping set to 10 rounds.
lgb_train = lgb.Dataset(
    data=new_train[train_cols],
    label=new_train["Visits"],
    free_raw_data=False,
)
lgb_val = lgb.Dataset(
    data=new_val[train_cols],
    label=new_val["Visits"],
    free_raw_data=False,
    reference=lgb_train,
)

model = lgb.train(
    params=param,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    feval=lgb_smape,
    early_stopping_rounds=10,
)

[1]	training's smape: 121.955	valid_1's smape: 119.773
Training until validation scores don't improve for 10 rounds.
[2]	training's smape: 115.947	valid_1's smape: 113.565
[3]	training's smape: 109.993	valid_1's smape: 107.459
[4]	training's smape: 104.182	valid_1's smape: 101.527
[5]	training's smape: 98.5578	valid_1's smape: 95.7759
[6]	training's smape: 93.205	valid_1's smape: 90.3907
[7]	training's smape: 88.1416	valid_1's smape: 85.3332
[8]	training's smape: 83.4334	valid_1's smape: 80.7206
[9]	training's smape: 79.0633	valid_1's smape: 76.5166
[10]	training's smape: 75.0614	valid_1's smape: 72.6865
[11]	training's smape: 71.4268	valid_1's smape: 69.3088
[12]	training's smape: 68.1542	valid_1's smape: 66.3267
[13]	training's smape: 65.2317	valid_1's smape: 63.6469
[14]	training's smape: 62.6581	valid_1's smape: 61.3667
[15]	training's smape: 60.3924	valid_1's smape: 59.4091
[16]	training's smape: 58.4272	valid_1's smape: 57.7517
[17]	training's smape: 56.7371	valid_1's smape: 56.3

In [10]:
# Force a garbage-collection pass; the integer shown as cell output is gc.collect()'s return value.
gc.collect()

4515

In [11]:
# Feature frame for the test rows: lag features are taken from `train`
# relative to the 2016-09-10 anchor and joined onto the rows of `test`.
new_test = create_features(df=train, month='2016-09-10', target=test)

In [12]:
# Drop the raw frames, the first split's datasets and the first model;
# only new_val and new_test are used by the final fit below.
del train, new_train, test, lgb_train, lgb_val, model

In [13]:
# Final fit: train on the validation-window features and monitor SMAPE on the
# test features. The round count is fixed at 50 (the optimal number of
# iterations found on the author's server), so no early stopping is used here.
lgb_train = lgb.Dataset(
    data=new_val[train_cols],
    label=new_val["Visits"],
    free_raw_data=False,
)
lgb_val = lgb.Dataset(
    data=new_test[train_cols],
    label=new_test["Visits"],
    free_raw_data=False,
    reference=lgb_train,
)

model = lgb.train(
    params=param,
    train_set=lgb_train,
    num_boost_round=50,
    valid_sets=[lgb_train, lgb_val],
    feval=lgb_smape,
)

[1]	training's smape: 120.338	valid_1's smape: 121.412
[2]	training's smape: 114.384	valid_1's smape: 115.749
[3]	training's smape: 108.527	valid_1's smape: 110.188
[4]	training's smape: 102.807	valid_1's smape: 104.81
[5]	training's smape: 97.3307	valid_1's smape: 99.5025
[6]	training's smape: 92.135	valid_1's smape: 94.5814
[7]	training's smape: 87.2702	valid_1's smape: 89.9196
[8]	training's smape: 82.7642	valid_1's smape: 85.5888
[9]	training's smape: 78.5866	valid_1's smape: 81.5416
[10]	training's smape: 74.8049	valid_1's smape: 77.8673
[11]	training's smape: 71.3691	valid_1's smape: 74.5653
[12]	training's smape: 68.29	valid_1's smape: 71.5307
[13]	training's smape: 65.5704	valid_1's smape: 68.8594
[14]	training's smape: 63.1586	valid_1's smape: 66.4572
[15]	training's smape: 61.0607	valid_1's smape: 64.3423
[16]	training's smape: 59.2333	valid_1's smape: 62.4975
[17]	training's smape: 57.6566	valid_1's smape: 60.8716
[18]	training's smape: 56.304	valid_1's smape: 59.4732
[19]	t