In [1]:
import pandas as pd
import lightgbm as lgb
from date import *
from importlib import reload
import model
reload(model)
from model import *
from metric import get_weights, NWRMSLE_log
import numpy as np

In [2]:
df = load_data_in_date_range('./data/train_processed.csv', '2017-04-04', '2017-08-15', 110000000)

Reading batch from position 110000000, batch size 10000000...
Filtering 10000000...
Filtered 8659998, mapping...
Mapped, reducing...
Batch done.
Reading batch from position 120000000, batch size 10000000...
Filtering 5497041...
Filtered 5497041, mapping...
Mapped, reducing...
Batch done.
End of dataset is found.


In [3]:
df = convert_unit_sales(df)
df = fill_empty_sales(df)
df, lagged_cols = add_lagged_and_mean_encoding(df, 12, 18)
df.head()

Unnamed: 0,date,store_nbr,item_nbr,id,unit_sales,onpromotion,unit_sales_mean,unit_sales(t-12),unit_sales(t-13),unit_sales(t-14),unit_sales(t-15),unit_sales(t-16),unit_sales(t-17),unit_sales(t-18)
0,1576,46,1695836,113723318.0,3.983762,True,3.759474,0.0,2.519459,3.73618,2.562543,2.337137,4.329911,0.0
1,1576,46,1695837,113723319.0,2.596886,True,1.755228,1.568372,1.610999,1.583318,1.061135,2.051215,2.390052,2.177057
2,1576,46,1695840,113723320.0,1.098612,True,1.730986,2.197225,2.70805,1.791759,1.791759,1.94591,1.609438,1.94591
3,1576,46,1695845,113723321.0,3.044523,True,3.595048,3.610918,3.78419,3.713572,3.135494,3.178054,3.970292,3.688879
4,1576,46,1695846,113723322.0,1.583318,True,1.075955,1.059444,1.029898,1.362078,1.232469,0.733813,1.667117,2.067159


In [4]:
items = pd.read_csv('./data/items_encoded.csv')
stores = pd.read_csv('./data/stores_encoded.csv')

In [5]:
!telegram-send "Data is ready"

In [6]:
folds = [('2017-05-01', '2017-06-15', '2017-06-16', '2017-06-30'),
         ('2017-05-16', '2017-06-30', '2017-07-01', '2017-07-15'),
         ('2017-06-01', '2017-07-15', '2017-07-16', '2017-07-31'),
         ('2017-06-16', '2017-07-31', '2017-08-01', '2017-08-15')]

param = {
    'num_leaves':30,
    'objective':'regression_l2',
    'metric':'l2_root',
    'num_threads':4
        }

In [8]:
results = []

for train_start, train_end, test_start, test_end in folds:
    print("Extracting fold...")
    train = extract_by_date(df, train_start, train_end)
    test= extract_by_date(df, test_start, test_end)
    
    print("Preparing train...")
    train, ext_cols = extend_dataset(train, items, stores)
    x_cols = ext_cols + ['onpromotion'] + lagged_cols
    train_X = train[x_cols]
    train_y = train['unit_sales']
    train_weights = get_weights(train['item_nbr'])
    train_dataset = lgb.Dataset(train_X, label=train_y, weight=train_weights)
    #del train
    
    print("Preparing test...")
    test, _ = extend_dataset(test, items, stores)
    test_X = test[x_cols]
    test_y = test['unit_sales']
    test_weights = get_weights(test['item_nbr'])
    test_dataset = lgb.Dataset(test_X, label=test_y, weight=test_weights, reference=train_dataset)
    #del test
    
    print("Training!")
    bst = lgb.train(param, 
                    train_dataset, 
                    200,
                    valid_sets=[test_dataset], 
                    early_stopping_rounds=10, 
                    verbose_eval=True, 
                    feature_name=x_cols, 
                    categorical_feature=ext_cols)
    
    test_y_pred = bst.predict(test_X)
    error = NWRMSLE_log(test_y_pred, test_y, test_weights)
    print('Validation error: {}'.format(error))
    
    results.append((bst.best_iteration, error))

Extracting fold...
Preparing train...
Preparing test...
Training!




[1]	valid_0's rmse: 0.95709
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's rmse: 0.892796
[3]	valid_0's rmse: 0.837109
[4]	valid_0's rmse: 0.789127
[5]	valid_0's rmse: 0.748215
[6]	valid_0's rmse: 0.713331
[7]	valid_0's rmse: 0.683847
[8]	valid_0's rmse: 0.659181
[9]	valid_0's rmse: 0.638232
[10]	valid_0's rmse: 0.620906
[11]	valid_0's rmse: 0.606442
[12]	valid_0's rmse: 0.594603
[13]	valid_0's rmse: 0.584795
[14]	valid_0's rmse: 0.576736
[15]	valid_0's rmse: 0.570162
[16]	valid_0's rmse: 0.564794
[17]	valid_0's rmse: 0.560443
[18]	valid_0's rmse: 0.556923
[19]	valid_0's rmse: 0.554025
[20]	valid_0's rmse: 0.55167
[21]	valid_0's rmse: 0.549768
[22]	valid_0's rmse: 0.548235
[23]	valid_0's rmse: 0.547009
[24]	valid_0's rmse: 0.545991
[25]	valid_0's rmse: 0.545184
[26]	valid_0's rmse: 0.544562
[27]	valid_0's rmse: 0.543974
[28]	valid_0's rmse: 0.543547
[29]	valid_0's rmse: 0.54322
[30]	valid_0's rmse: 0.542888
[31]	valid_0's rmse: 0.542634
[32]	valid_0's rmse:

In [15]:
!telegram-send "Lagged xgb validation finished. Results: $results"

In [10]:
param['task'] = 'prediction'
start = '2017-07-01'
end = '2017-08-15'
num_round = 100
train = extract_by_date(df, start, end)
train, ext_cols = extend_dataset(train, items, stores)
x_cols = ext_cols + ['onpromotion'] + lagged_cols
train_X = train[x_cols]
train_y = train['unit_sales']
train_weights = get_weights(train['item_nbr'])
train_dataset = lgb.Dataset(train_X, label=train_y, weight=train_weights)
bst = lgb.train(param, 
                train_dataset,
                num_round,
                feature_name=x_cols, 
                categorical_feature=ext_cols)



In [11]:
test = pd.read_csv('./data/test_processed.csv', dtype=types)

df_prev = extract_by_date(df, '2017-07-25', '2017-08-15')
test = fill_mean_encoding(test, df_prev)
test = fill_lagged(test, df_prev, 12, 18)

test, ext_cols = extend_dataset(test, items, stores)
x_cols = ext_cols + ['onpromotion'] + lagged_cols
test_X = test[x_cols]
test['unit_sales'] = bst.predict(test_X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_prev[colname] = df_prev['unit_sales']


In [14]:
test.sort_values(by='id', inplace=True)
test.ix[test.unit_sales < 0, 'unit_sales'] = 0
test[['id', 'unit_sales']].to_csv('./submissions/lgb_lagged_0.54.csv.gz', float_format="%.4f", index=False, compression='gzip')
!telegram-send "Submission done."

In [2]:
submission = pd.read_csv('./submissions/lgb_lagged_0.54.csv.gz', compression='gzip')
submission.ix[submission.unit_sales < 0, 'unit_sales'] = 0
submission['unit_sales'] = np.expm1(submission['unit_sales'])
submission.to_csv('./submissions/lgb_lagged_0.54.csv.gz', float_format="%.4f", index=False, compression='gzip')
!telegram-send "Submission done."