https://www.kaggle.com/cast42/xgboost-in-python-with-rmspe-v2

https://www.kaggle.com/cast42/xgboost-extra-features


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gc
from dateutil.relativedelta import relativedelta
import random
import lightgbm as lgb
gc.enable()
from sklearn.linear_model import Ridge

In [None]:
%pylab inline

In [2]:
param = {}
param['application'] = 'regression_l2'
param['learning_rate'] = 0.1
param['feature_fraction'] = 0.5
param['bagging_fraction'] = 0.5
param['bagging_freq'] = 1
param['max_depth'] = 5
param['num_threads'] = 4
param['verbose'] = 0

In [None]:

df = pd.read_csv("data/train_1.csv")
df = pd.melt(df, id_vars='Page', var_name='date', value_name='Visits')
df['date'] = df['date'].astype('datetime64[ns]')
df['Visits'] = df['Visits'].astype('float32')

temp = df.loc[(df["Visits"] > 0) & (df["date"] < '2016-03-01')].groupby(["Page"]).size()
pages = list(temp.index)
df = df.loc[(df["date"] > '2015-03-01') & (df["Page"].isin(pages))]

le = LabelEncoder()
df.loc[:,"Page"] = le.fit_transform(df["Page"])

random.seed(2)
random_pages = random.sample(sorted(df["Page"].unique()), 50000)
df = df.loc[df["Page"].isin(random_pages)]
'

In [3]:
#to speed up experiments
#df.to_csv("data/my_df.csv", index=False, encoding = 'UTF-8')
#df = pd.read_csv("data/my_df.csv", parse_dates=["date"], dtype={"Page":'int64', "Visits": 'float32'})

In [4]:
from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    out = 0
    for i in range(y_true.shape[0]):
        a = y_true[i]
        b = y_pred[i]
        if b < 1:
            b = 0
        c = a+b
        if c == 0:
            continue
        out += math.fabs(a - b) / c
    out *= (200.0 / y_true.shape[0])
    return out

def lgb_smape(preds, df):
    labels = df.get_label()
    labels, preds = np.expm1(np.array(labels)), np.expm1(np.array(preds))
    return 'smape', smape_fast(labels, preds), False

In [5]:
  def create_time_features(data):
    print("Creating counters....")
    data['weekday'] = data.date.dt.weekday
    data['is_weekend'] = ((data.date.dt.dayofweek) // 5 == 1).astype('int32')
    data['year'] = data.date.dt.year 
    data['month'] = data.date.dt.month
    data['day'] = data.date.dt.day
    data['dayCount'] = data['date'].apply(lambda x: x.toordinal())
    data['weekOfYear'] = data.date.dt.weekofyear
    return data

In [6]:
#little engeneering
df.Visits.fillna(0, inplace=True)
df = create_time_features(df)
df["Visits_log"] = np.log1p(df["Visits"])
df['yearminusone'] = df['year'] - 1

#add to every datapoint visits from one year ago
df = pd.merge(df, df[['Page','year','month','day','Visits_log']], left_on =['Page','yearminusone','month','day'],\
                     right_on = ['Page','year','month','day'], how = 'left', suffixes=('', '_2015'), sort = False)

df.drop(['yearminusone','year_2015','year'], axis = 1, inplace = True)

df = df.loc[df.date >= '2015-11-11']

gc.collect()

Creating counters....


154

In [None]:
#df.to_pickle("data/my_df.pcl")
#df = pd.read_pickle("data/my_df.pcl")

In [29]:
def prepareData(data, train_start_date, train_end_date, test_start_date, test_end_date):
    data.sort_values(by = "dayCount", inplace = True)
    data = data.reset_index(drop = True)
    
    train_start_index = data.loc[data.date == train_start_date].index[0]
    train_end_index = data.loc[data.date == train_end_date].index[-1]
    test_start_index = data.loc[data.date == test_start_date].index[0]
    test_end_index = data.loc[data.date == test_end_date].index[-1]
   
    last60days_index = data.loc[data.date == (pd.to_datetime(train_end_date) - relativedelta(days = 60))].index[0]
    last30days_index = data.loc[data.date == (pd.to_datetime(train_end_date) - relativedelta(days = 30))].index[0]
    
    
    #dirty hack: делаем в тестовом сете из остатка 11-го месяца - 10-й 
    data.loc[((data.month == 11) & (data.index >=test_start_index) & (data.index <= test_end_index)),"month"] = 10 
    
    # just for simplify
    data.rename(columns={"Visits":"y", "Visits_log":"y_log"}, inplace=True)
    
    print("Calculate averages....")
    # считаем средние только по тренировочной части, чтобы избежать лика
    # 
    temp = pd.DataFrame(data.loc[train_start_index:train_end_index].groupby(['Page','weekday'])['y_log'].median())
    temp.columns = ['weekday_average']
    data = data.join(temp, on =['Page','weekday'], how = 'left', sort = False)
             
    temp = pd.DataFrame(data.loc[train_start_index:train_end_index].groupby(['Page'])['y_log'].median())
    temp.columns = ['Page_average']
    data = data.join(temp, on =['Page'], how = 'left', sort = False)
           
    temp = pd.DataFrame(data.loc[train_start_index:train_end_index].groupby(['Page'])['y_log'].quantile(0.95))
    temp.columns = ['quant_95']
    data = data.join(temp, on =['Page'], how = 'left', sort = False)  
    
    temp = pd.DataFrame(data.groupby(['Page','weekOfYear'])['y_log'].median())
    temp.columns = ['week_0']
    temp['week_10'] = temp.week_0.shift(10)
    temp['week_11'] = temp.week_0.shift(11)
    temp['week_12'] = temp.week_0.shift(12)    
    data = data.join(temp, on =['Page','weekOfYear'], how = 'left', sort = False)
      
    
    #temp = pd.DataFrame(data.loc[last60days_index:train_end_index].groupby(['Page'])['y_log'].std())
    #temp.columns = ['std_dev']
    #data = data.join(temp, on =['Page'], how = 'left', sort = False)  
    
    
      
    #make feature "month number to predict": first of second
    data['test_month_no'] = ((data.month % 2 == 0) + 1).astype('int32')
    
    # фича - для первого месяца предикта подцепляем медиану минус один месяц назад, для второго - минус 2 месяца назад
    temp = pd.DataFrame(data.groupby(['Page','month','test_month_no'])['y_log'].median())
    temp.columns = ['month_0']
    temp['month_1'] = temp.month_0.shift(1)
    temp['month_2'] = temp.month_0.shift(2)
    temp['month_3'] = temp.month_0.shift(3)
    
    temp = temp.reset_index()
    temp['month_1'] = temp.month_0.shift(1)
    temp['month_2'] = temp.month_0.shift(2)
    temp["last_month"] = 0   
    temp.loc[temp.test_month_no == 1,"last_month"] = temp.month_1
    temp.loc[temp.test_month_no == 2,"last_month"] = temp.month_2
    temp.set_index(['Page', 'month', 'test_month_no'], inplace=True)    
    data = data.join(temp, on =['Page','month','test_month_no'], how = 'left', sort = False)
    
    data.sort_values(by = "dayCount", inplace = True)
    data = data.reset_index(drop = True)
    
    features = ['Page','Page_average','date','is_weekend', 'dayCount','weekday_average',\
                'week_10','last_month','month','Visits_log_2015', 'quant_95','test_month_no', 'y_log']
    return data[features]

In [28]:
def train_test_split(data, train_start_date,train_end_date, test_start_date, test_end_date):
    data.sort_values(by = "dayCount", inplace = True)
    data = data.reset_index(drop = True)
    train_start_index = data.loc[data.date == train_start_date].index[0]
    train_end_index = data.loc[data.date == train_end_date].index[-1]
    test_start_index = data.loc[data.date == test_start_date].index[0]
    test_end_index = data.loc[data.date == test_end_date].index[-1]

    print("Splitting to train - test....")
    X_train = data.loc[train_start_index:train_end_index,:].copy()
    
    #remove outliers
    X_train.loc[(X_train.y_log > X_train.quant_95),"y_log"] = X_train.quant_95
       
    y_train = X_train.loc[:,"y_log"]
    X_train = X_train.drop(["y_log"], axis=1)
       
    X_test = data.loc[test_start_index:test_end_index,:].copy()
    y_test = X_test.loc[:,"y_log"]
    X_test = X_test.drop(["y_log"], axis=1)
   
    print("Splitting done")
    return X_train, y_train, X_test, y_test

In [30]:
%%time
#train_start_date = '2016-01-01'
train_start_date = '2016-01-01'
train_end_date  = '2016-08-31'
test_start_date  = '2016-09-10'
test_end_date  = '2016-11-10'
dates = [train_start_date, train_end_date,test_start_date, test_end_date]
dataForModel = prepareData(df.copy(), *dates)
gc.collect()


Calculate averages....
Wall time: 1min 11s


In [34]:
dataForModel.Visits_log_2015.fillna(dataForModel.last_month, inplace = True) 

In [38]:
X_train,y_train,X_test,y_test = train_test_split(dataForModel, *dates)

Splitting to train - test....
Splitting done


In [40]:
# features used to train model 
features = ['Page','is_weekend', 'dayCount','weekday_average',\
                'last_month','test_month_no','month','Visits_log_2015']

lgb_train = lgb.Dataset(X_train[features], label=y_train, free_raw_data=False)
lgb_val = lgb.Dataset(X_test[features], label=y_test, free_raw_data=False, reference=lgb_train)

model = lgb.train(param, lgb_train, 300, valid_sets=[lgb_train,lgb_val], feval=lgb_smape, early_stopping_rounds=50)

[1]	training's smape: 121.984	valid_1's smape: 121.276
Training until validation scores don't improve for 50 rounds.
[2]	training's smape: 115.856	valid_1's smape: 115.299
[3]	training's smape: 109.747	valid_1's smape: 109.381
[4]	training's smape: 109.746	valid_1's smape: 109.413
[5]	training's smape: 105.18	valid_1's smape: 103.849
[6]	training's smape: 100.762	valid_1's smape: 98.4604
[7]	training's smape: 96.533	valid_1's smape: 93.3888
[8]	training's smape: 90.9696	valid_1's smape: 88.269
[9]	training's smape: 85.6938	valid_1's smape: 83.2946
[10]	training's smape: 82.8651	valid_1's smape: 79.9919
[11]	training's smape: 79.6259	valid_1's smape: 76.1508
[12]	training's smape: 79.6227	valid_1's smape: 76.1695
[13]	training's smape: 79.2981	valid_1's smape: 75.8526
[14]	training's smape: 74.8982	valid_1's smape: 72.1568
[15]	training's smape: 70.8534	valid_1's smape: 68.836
[16]	training's smape: 67.1454	valid_1's smape: 65.8523
[17]	training's smape: 63.8359	valid_1's smape: 63.2573