In [1]:
import pandas as pd
import numpy as np
import gc
import re

from dateutil.relativedelta import relativedelta

In [2]:
# Load the training visits, keep only 2016-03-01 .. 2016-08-31 (inclusive),
# replace missing values with 0 and renumber the rows from 0.
train = (
    pd.read_csv("../data/new_train.csv", parse_dates=["date"], dtype={"Page": str, "Visits": 'float32'})
    .loc[lambda frame: (frame["date"] >= '2016-03-01') & (frame["date"] <= '2016-08-31')]
    .fillna(0)
    .reset_index(drop=True)
)

# The test file gets the same treatment, minus the date filter.
test = (
    pd.read_csv("../data/new_test.csv", parse_dates=["date"], dtype={"Page": str, "Visits": 'float32'})
    .fillna(0)
    .reset_index(drop=True)
)

# Work with log(1 + visits) from here on, stored as float64.
train["Visits"] = np.log1p(train["Visits"]).astype("float64")
test["Visits"] = np.log1p(test["Visits"]).astype("float64")

In [3]:
def get_language(page):
    """Return the two-letter language code embedded in a page name.

    The code is read from the first ``xx.wikipedia.org`` occurrence in
    ``page``, where ``xx`` is two lowercase ASCII letters.

    Parameters
    ----------
    page : str
        Page identifier, e.g. ``"Title_en.wikipedia.org_all-access_spider"``.

    Returns
    -------
    str
        The two-letter code, or ``'na'`` when ``page`` contains no
        ``xx.wikipedia.org`` host.
    """
    # The dots are escaped so they match a literal '.' only; unescaped they
    # match any character and would accept e.g. "enXwikipediaYorg".
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res.group(0)[0:2]
    return 'na'

# Derive the language code and the Project / Access / Agent fields from the
# page name, for the train frame first and then the test frame.
for frame in (train, test):
    frame['lang'] = frame["Page"].map(get_language)

    # The last three "_"-separated tokens of a page name are used, in order,
    # as Project, Access and Agent.
    components = pd.DataFrame([name.split("_")[-3:] for name in frame["Page"]])
    components.columns = ['Project', 'Access', 'Agent']
    frame[['Project', 'Access', 'Agent']] = components[['Project', 'Access', 'Agent']]

# Drop the temporaries; `frame` would otherwise keep a reference to `test`.
del components, frame

In [4]:
def create_features(df, month, target=None):
    if type(target) != type(pd.DataFrame()):
        target = df.loc[(df["date"] >= pd.to_datetime(month)) & (df["date"] <= pd.to_datetime(month) + relativedelta(months=2))]
    temp = df.loc[(train["date"] < pd.to_datetime(month) + relativedelta(days=-10)) & (df["date"] >= pd.to_datetime(month) + relativedelta(days=-40))].copy()
    temp = pd.pivot_table(temp, index=["Page"], values=["Visits"], columns=["date"]).reset_index()
    temp.columns = ["lag_{}".format(i)  if j[1] != "" else j[0] for i, j in enumerate(temp.columns)]

    temp = target.merge(temp, on="Page", how='left')

    temp['Month']     = temp["date"].dt.month
    temp['Month']     = temp['Month'] - temp['Month'].min()
    temp['Day']       = temp["date"].dt.day
    temp['DayOfWeek'] = temp["date"].dt.dayofweek
    return temp
    

In [5]:
# Build the training and validation frames from the same history, anchored
# at 2016-04-10 and 2016-06-10 respectively.
new_train, new_val = (
    create_features(train, anchor) for anchor in ('2016-04-10', '2016-06-10')
)

In [6]:
# Model inputs: the 30 lag columns, the calendar fields and the four
# page attributes.
train_cols = (
    ['lag_{}'.format(n) for n in range(1, 31)]
    + ['Month', 'Day', 'DayOfWeek']
    + ["lang", "Project", "Access", "Agent"]
)

In [7]:
# Show the first and last target date (YYYY-MM-DD) covered by each split.
train_span = [str(stamp)[:10] for stamp in (new_train["date"].min(), new_train["date"].max())]
print("Train:      {} - {}".format(*train_span))
val_span = [str(stamp)[:10] for stamp in (new_val["date"].min(), new_val["date"].max())]
print("Validation: {} - {}".format(*val_span))

Train:      2016-04-10 - 2016-06-10
Validation: 2016-06-10 - 2016-08-10


In [8]:
# Integer-encode the categorical columns. Each distinct value in `train`
# gets the position of its first appearance; the mapping is kept in
# `label_encodings` so it can be reapplied to other frames.
label_encodings = {}
for feature in ["lang", "Project", "Access", "Agent"]:
    label_encodings[feature] = {
        value: code for code, value in enumerate(train[feature].unique())
    }
    new_train[feature] = new_train[feature].map(label_encodings[feature])
    new_val[feature]   = new_val[feature].map(label_encodings[feature])

In [9]:
import lightgbm as lgb
# LightGBM training parameters, passed unchanged to lgb.train.
param = {
    'application': 'regression_l2',
    'learning_rate': 0.1,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'max_depth': 5,
    'num_threads': 4,
    "verbose": 0,
}

from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    """Return the SMAPE (0-200 scale) between two equally long 1-D arrays.

    Predictions below 1 are treated as 0. A pair whose actual value and
    (clipped) prediction sum to 0 adds nothing to the total, but still counts
    in the number of observations the total is averaged over.
    """
    n_obs = y_true.shape[0]
    total = 0
    for idx in range(n_obs):
        actual = y_true[idx]
        forecast = y_pred[idx]
        if forecast < 1:
            forecast = 0
        denom = actual + forecast
        # Skip pairs with a zero denominator instead of dividing by zero.
        if denom != 0:
            total += math.fabs(actual - forecast) / denom
    return total * (200.0 / n_obs)

def lgb_smape(preds, df):
    """Custom LightGBM evaluation function reporting SMAPE.

    Labels (read via ``df.get_label()``) and ``preds`` are both passed
    through ``expm1`` before scoring with ``smape_fast``.

    Returns the tuple ``('smape', value, False)``; the trailing ``False`` is
    the flag LightGBM reads as "higher is better".
    """
    actual = np.expm1(np.array(df.get_label()))
    predicted = np.expm1(np.array(preds))
    return 'smape', smape_fast(actual, predicted), False

In [10]:
# Wrap the training and validation frames for LightGBM; the validation set
# is tied to the training set through `reference`.
lgb_train = lgb.Dataset(
    data=new_train[train_cols],
    label=new_train["Visits"],
    free_raw_data=False,
)
lgb_val = lgb.Dataset(
    data=new_val[train_cols],
    label=new_val["Visits"],
    reference=lgb_train,
    free_raw_data=False,
)

# Up to 1000 boosting rounds, scored with SMAPE on both sets, stopping once
# the validation score has not improved for 10 rounds.
model = lgb.train(
    param,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    feval=lgb_smape,
    early_stopping_rounds=10,
)

[1]	training's smape: 121.955	valid_1's smape: 119.773
Training until validation scores don't improve for 10 rounds.
[2]	training's smape: 115.939	valid_1's smape: 113.628
[3]	training's smape: 109.997	valid_1's smape: 107.41
[4]	training's smape: 104.202	valid_1's smape: 101.563
[5]	training's smape: 98.5703	valid_1's smape: 95.8874
[6]	training's smape: 93.1954	valid_1's smape: 90.5326
[7]	training's smape: 88.1198	valid_1's smape: 85.4805
[8]	training's smape: 83.3916	valid_1's smape: 80.8659
[9]	training's smape: 79.0332	valid_1's smape: 76.6281
[10]	training's smape: 75.0285	valid_1's smape: 72.8782
[11]	training's smape: 71.3858	valid_1's smape: 69.4357
[12]	training's smape: 68.1204	valid_1's smape: 66.4626
[13]	training's smape: 65.1969	valid_1's smape: 63.7938
[14]	training's smape: 62.6089	valid_1's smape: 61.4658
[15]	training's smape: 60.3334	valid_1's smape: 59.5009
[16]	training's smape: 58.3473	valid_1's smape: 57.8185
[17]	training's smape: 56.6385	valid_1's smape: 56.4

In [11]:
# Force a garbage-collection pass; the value returned by gc.collect() is
# what the cell displays.
gc.collect()

4475

In [12]:
# Build the test-period features: the lags come from `train`, the target
# rows are the `test` frame itself.
new_test = create_features(train, '2016-09-10', target=test)

# Reapply the encodings stored in `label_encodings`; values that have no
# entry there become NaN.
for feature in ("lang", "Project", "Access", "Agent"):
    new_test[feature] = new_test[feature].map(label_encodings[feature])

In [13]:
# Drop the names that are no longer used below; new_val and new_test are
# kept because the next cell builds its datasets from them.
del train, new_train, test, lgb_train, lgb_val, model

In [14]:
# Retrain on the most recent labelled window (the former validation frame)
# and monitor SMAPE on the test frame.
lgb_train = lgb.Dataset(
    data=new_val[train_cols],
    label=new_val["Visits"],
    free_raw_data=False,
)
lgb_val = lgb.Dataset(
    data=new_test[train_cols],
    label=new_test["Visits"],
    free_raw_data=False,
)

# Fixed budget of 38 rounds, no early stopping.
# NOTE(review): 38 is presumably the best iteration of the earlier
# early-stopped run; confirm against its full log.
model = lgb.train(
    param,
    lgb_train,
    num_boost_round=38,
    valid_sets=[lgb_train, lgb_val],
    feval=lgb_smape,
)

[1]	training's smape: 120.338	valid_1's smape: 121.412
[2]	training's smape: 114.399	valid_1's smape: 115.726
[3]	training's smape: 108.532	valid_1's smape: 110.142
[4]	training's smape: 102.838	valid_1's smape: 104.787
[5]	training's smape: 97.3527	valid_1's smape: 99.6209
[6]	training's smape: 92.1668	valid_1's smape: 94.6195
[7]	training's smape: 87.3049	valid_1's smape: 89.9616
[8]	training's smape: 82.7832	valid_1's smape: 85.6487
[9]	training's smape: 78.6107	valid_1's smape: 81.5776
[10]	training's smape: 74.8064	valid_1's smape: 77.8106
[11]	training's smape: 71.3679	valid_1's smape: 74.4555
[12]	training's smape: 68.2903	valid_1's smape: 71.4413
[13]	training's smape: 65.5484	valid_1's smape: 68.7282
[14]	training's smape: 63.1435	valid_1's smape: 66.3399
[15]	training's smape: 61.042	valid_1's smape: 64.2198
[16]	training's smape: 59.2098	valid_1's smape: 62.3592
[17]	training's smape: 57.6238	valid_1's smape: 60.7284
[18]	training's smape: 56.2559	valid_1's smape: 59.3224
[1