In [6]:
import gc
import itertools
from importlib import reload

import lightgbm as lgb
import numpy as np
import pandas as pd

import date
reload(date)
from date import *
import model
reload(model)
from model import *
from metric import get_weights, NWRMSLE_log

In [2]:
# Load the training rows for the 2017-04-04 .. 2017-08-15 date range from the
# processed CSV. The last argument (110000000) is the position the reader
# starts from -- the captured log below begins with
# "Reading batch from position 110000000".
# NOTE(review): load_data_in_date_range is not defined in this notebook;
# presumably it comes from one of the star-imported local modules.
df = load_data_in_date_range('./data/train_processed.csv', '2017-04-04', '2017-08-15', 110000000)

Reading batch from position 110000000, batch size 10000000...
Filtering 10000000...
Filtered 8659998, mapping...
Mapped, reducing...
Batch done.
Reading batch from position 120000000, batch size 10000000...
Filtering 5497041...
Filtered 5497041, mapping...
Mapped, reducing...
Batch done.
End of dataset is found.


In [3]:
# Load the encoded item and store tables (items first, then stores); both are
# handed to extend_dataset later on.
print('Reading additional datasets')
items, stores = map(
    pd.read_csv,
    ('./data/items_encoded.csv', './data/stores_encoded.csv'),
)

In [4]:
# Prepare the modelling frame. Every step rebinds `df`, so the calls must run
# in exactly this order.
# NOTE(review): convert_unit_sales, fill_empty_sales and extend_dataset are
# not defined in this notebook; presumably they come from the star-imported
# local modules -- confirm their exact semantics there.
print('Converting data and joining additional data')
df = convert_unit_sales(df)
df = fill_empty_sales(df)
# extend_dataset receives the items/stores tables and returns the frame plus
# a list of column names; that list is part of the feature set built later.
df, cols_categories = extend_dataset(df, items, stores)
# Collect garbage left behind by the rebinds above before the next heavy step.
gc.collect()
# IPython shell escape: send a progress notification.
!telegram-send "Data is ready, starting lagged and mean extraction."

[0m

In [None]:
# Build the column groups used for mean target encoding: every pair of
# categorical columns, every single column, and two explicit store/item groups.
# NOTE(review): items_cols and stores_cols are not defined in this notebook;
# presumably they come from the star-imported local modules -- confirm.
cat_features = items_cols + stores_cols + ['store_nbr','item_nbr','onpromotion']

# The expression must be parenthesised: a line ending in a bare `+` is a
# SyntaxError in Python, so the original multi-line sum could not run.
# NOTE(review): ('store_nbr', 'item_nbr') is already produced by the pairwise
# combinations, so the explicit ['store_nbr','item_nbr'] entry repeats it;
# confirm how add_mean_encoding treats a repeated group before removing it.
combinations = (
    list(itertools.combinations(cat_features, 2))
    + list(itertools.combinations(cat_features, 1))
    + [['store_nbr','item_nbr','onpromotion'], ['store_nbr','item_nbr']]
)
print('Adding mean target encoding, categories: {}'.format(combinations))

# add_mean_encoding returns the updated frame and the names of the new
# columns; cols_mean is part of the feature list used for validation.
df, cols_mean = add_mean_encoding(df, combinations)
gc.collect()
df.head()

In [None]:
print('Adding lagged features')
df_prev = df[['item_nbr', 'store_nbr', 'date', 'unit_sales']]
df, cols_lagged = fill_lagged(df, df_prev, 12, 18)
del df_prev
gc.collect()
!telegram-send "Lagged and mean extraction is ready, starting validation."

In [None]:
# Validation folds, each a tuple of date strings in the order
# (train_start, train_end, test_start, test_end) -- the order in which the
# validation loop unpacks them.
folds = [
    ('2017-05-01', '2017-06-15', '2017-06-16', '2017-06-30'),
    ('2017-05-16', '2017-06-30', '2017-07-01', '2017-07-15'),
    ('2017-06-01', '2017-07-15', '2017-07-16', '2017-07-31'),
    ('2017-06-16', '2017-07-31', '2017-08-01', '2017-08-15'),
]

# LightGBM parameters passed to lgb.train.
param = dict(
    num_leaves=30,
    objective='regression_l2',
    metric='l2_root',
    num_threads=4,
)

In [None]:
# Time-based validation: for every fold, train on the fold's training window,
# early-stop against its test window, and record (best_iteration, error).
results = []
x_cols = cols_categories + ['onpromotion'] + cols_lagged + cols_mean
print('X cols: {}'.format(x_cols))

for train_start, train_end, test_start, test_end in folds:
    print("Extracting fold...")
    train = extract_by_date(df, train_start, train_end)
    test = extract_by_date(df, test_start, test_end)

    print("Preparing train...")
    train_X = train[x_cols]
    train_y = train['unit_sales']
    # Row weights are derived from the item numbers by metric.get_weights.
    train_weights = get_weights(train['item_nbr'])
    train_dataset = lgb.Dataset(train_X, label=train_y, weight=train_weights)

    print("Preparing test...")
    test_X = test[x_cols]
    test_y = test['unit_sales']
    test_weights = get_weights(test['item_nbr'])
    # reference=train_dataset ties the validation set to the training Dataset.
    test_dataset = lgb.Dataset(test_X, label=test_y, weight=test_weights, reference=train_dataset)

    print("Training!")
    # Up to 200 boosting rounds, stopping after 10 rounds without improvement
    # on the validation set.
    # categorical_feature uses cols_categories (returned by extend_dataset for
    # `df`); the previous `ext_cols` is only bound by a later cell, so a
    # top-to-bottom run raised NameError here.
    bst = lgb.train(param,
                    train_dataset,
                    200,
                    valid_sets=[test_dataset],
                    early_stopping_rounds=10,
                    verbose_eval=True,
                    feature_name=x_cols,
                    categorical_feature=cols_categories)

    test_y_pred = bst.predict(test_X)
    error = NWRMSLE_log(test_y_pred, test_y, test_weights)
    print('Validation error: {}'.format(error))

    results.append((bst.best_iteration, error))

In [None]:
!telegram-send "Mean lagged xgb validation finished. Results: $results"

In [None]:
# Fit the final model on the most recent window with a fixed number of rounds.
# NOTE(review): LightGBM documents `task` as a CLI-only option; confirm this
# assignment has any effect when training through lgb.train.
param['task'] = 'prediction'
start = '2017-07-01'
end = '2017-08-15'
num_round = 100
train = extract_by_date(df, start, end)

# `df` was already passed through extend_dataset, so the slice carries the
# extended columns; reuse the column list returned back then instead of
# extending the same rows a second time.
# NOTE(review): assumes extract_by_date keeps all columns of `df` -- confirm.
# ext_cols stays bound because other cells refer to that name.
ext_cols = cols_categories

# Same feature set as the validation loop. The previous version referenced an
# undefined `lagged_cols` (the list is called cols_lagged) and left out the
# mean-encoded columns that the validation results were measured with.
x_cols = cols_categories + ['onpromotion'] + cols_lagged + cols_mean
train_X = train[x_cols]
train_y = train['unit_sales']
train_weights = get_weights(train['item_nbr'])
train_dataset = lgb.Dataset(train_X, label=train_y, weight=train_weights)
bst = lgb.train(param,
                train_dataset,
                num_round,
                feature_name=x_cols,
                categorical_feature=cols_categories)



In [None]:
# Build features for the test period and predict with the final model.
# NOTE(review): `types` is not defined in this notebook; presumably it comes
# from one of the star-imported local modules -- confirm.
test = pd.read_csv('./data/test_processed.csv', dtype=types)

# Recent training rows that supply history for the test-period features.
df_prev = extract_by_date(df, '2017-07-25', '2017-08-15')
# NOTE(review): fill_mean_encoding is not visible here; it is assumed to
# return only the updated frame -- confirm against its definition.
test = fill_mean_encoding(test, df_prev)
# fill_lagged returns (frame, lagged column names), as in the lagged-feature
# cell; the previous single-name assignment bound `test` to the whole tuple.
test, test_cols_lagged = fill_lagged(test, df_prev, 12, 18)

test, ext_cols = extend_dataset(test, items, stores)
# Predict on exactly the columns the model was trained on (x_cols from the
# training cell) rather than rebuilding the list here, which referenced the
# undefined name `lagged_cols`.
test_X = test[x_cols]
test['unit_sales'] = bst.predict(test_X)

In [None]:
test.sort_values(by='id', inplace=True)
test.ix[test.unit_sales < 0, 'unit_sales'] = 0
test[['id', 'unit_sales']].to_csv('./submissions/lgb_mean_encoded_lagged_0.54.csv.gz', float_format="%.4f", index=False, compression='gzip')
!telegram-send "Submission done."