In [1]:
import pandas as pd
import lightgbm as lgb
from date import *
from mapreduce import *
from scipy.sparse import csr_matrix
from metric import get_weights
import numpy as np

In [2]:
def load_data_in_date_range(csv, start, end, position):
    """Load the rows of ``csv`` whose ``date`` falls in the requested range.

    The file is read through ``map_reduce_df`` using a ``FilteringMapReduce``
    that keeps only rows whose ``date`` value is among the days returned by
    ``get_days_in_range(start, end)``.

    Args:
        csv: Path of the CSV file to read.
        start: Range start, forwarded to ``get_days_in_range``.
        end: Range end, forwarded to ``get_days_in_range``.
        position: Forwarded unchanged to ``map_reduce_df`` as ``position``
            (presumably the row offset where reading starts -- TODO confirm).

    Returns:
        The value returned by ``map_reduce_df`` for the filtered data.
    """
    wanted_days = get_days_in_range(start, end)

    # Compact dtypes for the columns that are read.
    column_dtypes = {
        'id': 'int32',
        'date': 'int16',
        'item_nbr': 'int32',
        'store_nbr': 'int16',
        'unit_sales': 'float32',
        'onpromotion': bool,
    }
    column_names = ['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion']

    # Each batch is reduced to the rows dated inside the requested range.
    date_filter = FilteringMapReduce(lambda batch: batch[batch.date.isin(wanted_days)])
    return map_reduce_df(
        csv,
        date_filter,
        types=column_dtypes,
        position=position,
        cols=column_names,
        verbose=True,
    )

In [3]:
# Load the 2017-05-01 .. 2017-08-15 slice of the processed training file,
# handing 110000000 to the loader as its starting position.
df = load_data_in_date_range(
    csv='./data/train_processed.csv',
    start='2017-05-01',
    end='2017-08-15',
    position=110000000,
)

Reading batch from position 110000000, batch size 10000000...
Filtering 10000000...
Filtered 5823749, mapping...
Mapped, reducing...
Batch done.
Reading batch from position 120000000, batch size 10000000...
Filtering 5497041...
Filtered 5497041, mapping...
Mapped, reducing...
Batch done.
End of dataset is found.


In [11]:
# Log-transform the target: clamp negative unit_sales to 0, then apply log1p.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
# NOTE: this cell mutates `df` in place and is not idempotent -- running it a
# second time applies log1p to already-transformed values.
df.loc[df.unit_sales < 0, 'unit_sales'] = 0
df['unit_sales'] = np.log1p(df['unit_sales'])

In [5]:
# Item and store lookup tables; they are merged onto the sales rows by
# `item_nbr` and `store_nbr` in extend_dataset.
ITEMS_CSV = './data/items_encoded.csv'
STORES_CSV = './data/stores_encoded.csv'

items = pd.read_csv(ITEMS_CSV)
stores = pd.read_csv(STORES_CSV)


In [6]:
def extend_dataset(df, items, stores):
    """Attach date, item and store features to the sales rows.

    Args:
        df: Sales frame; passed to ``add_date_features`` and then joined on
            ``item_nbr`` and ``store_nbr``.
        items: Item lookup frame containing an ``item_nbr`` column.
        stores: Store lookup frame containing a ``store_nbr`` column.

    Returns:
        A tuple ``(extended_frame, feature_columns)`` where ``feature_columns``
        is ``date_cols + items_cols + stores_cols``.
    """
    with_dates, date_cols = add_date_features(df)

    # Default (inner) merges: rows without a matching item or store are dropped.
    enriched = with_dates.merge(items, on='item_nbr').merge(stores, on='store_nbr')

    # NOTE(review): `items_cols` and `stores_cols` are not defined in this
    # notebook; presumably they come from one of the star imports -- confirm.
    feature_cols = date_cols + items_cols + stores_cols
    return enriched, feature_cols

def fill_empty_sales(df):
    """Expand ``df`` to every (date, store_nbr, item_nbr) combination.

    The result has one row for each element of the Cartesian product of the
    unique dates, stores and items present in ``df``. Combinations that were
    missing from the input get ``unit_sales = 0`` and ``onpromotion = False``;
    any other columns are left as NaN for those rows.

    The input frame is not modified. The previous implementation called
    ``set_index(..., inplace=True)`` on the caller's frame, and filled NaNs via
    ``df.loc[:, col].fillna(..., inplace=True)``, which operates on a temporary
    Series and is not guaranteed to write back into ``df``.

    Args:
        df: Sales frame with ``date``, ``store_nbr``, ``item_nbr``,
            ``unit_sales`` and ``onpromotion`` columns. The three key columns
            must identify each row uniquely (``reindex`` requires a unique index).

    Returns:
        A new DataFrame with the key columns restored as regular columns.
    """
    key_cols = ["date", "store_nbr", "item_nbr"]
    full_index = pd.MultiIndex.from_product(
        (df.date.unique(), df.store_nbr.unique(), df.item_nbr.unique()),
        names=key_cols,
    )

    # set_index without inplace returns a new frame, leaving the caller's intact.
    filled = df.set_index(key_cols).reindex(full_index)

    # Assign the filled columns back explicitly instead of relying on in-place
    # mutation of an intermediate Series.
    filled["unit_sales"] = filled["unit_sales"].fillna(0)
    # Reindexing turns a bool column into object dtype (because of NaN);
    # cast back to bool once the gaps are filled.
    filled["onpromotion"] = filled["onpromotion"].fillna(False).astype(bool)
    return filled.reset_index()

def extract_train_test(df, train_start, train_end, test_start, test_end):
    """Split ``df`` into a train and a test subset by ``date``.

    Args:
        df: Frame with a ``date`` column.
        train_start, train_end: Bounds of the training period, forwarded to
            ``get_days_in_range``.
        test_start, test_end: Bounds of the test period, forwarded to
            ``get_days_in_range``.

    Returns:
        A tuple ``(train, test)`` of the rows of ``df`` whose ``date`` is in the
        respective period.
    """
    def rows_between(first, last):
        # Boolean-mask selection of the rows dated inside [first, last].
        return df[df.date.isin(get_days_in_range(first, last))]

    return rows_between(train_start, train_end), rows_between(test_start, test_end)

In [7]:
# Shell escape: run the `telegram-send` command-line tool to signal that the
# cells above have finished.
!telegram-send "Data is ready"

In [8]:
# TODO: it may make sense to try the previous year's values directly,
# or to add the previous year's values as a lagged feature.

In [9]:
# Validation folds as (train_start, train_end, test_start, test_end).
# Each fold trains on about six weeks and tests on the following half month;
# successive folds are shifted forward by half a month.
folds = [
    ('2017-05-01', '2017-06-15', '2017-06-16', '2017-06-30'),
    ('2017-05-16', '2017-06-30', '2017-07-01', '2017-07-15'),
    ('2017-06-01', '2017-07-15', '2017-07-16', '2017-07-31'),
    ('2017-06-16', '2017-07-31', '2017-08-01', '2017-08-15'),
]

# LightGBM parameters shared by every fold.
param = dict(
    num_leaves=30,
    num_trees=100,
    metric='l2_root',
    is_sparse=False,
    is_training_metric=True,
)

# Collects one value per fold (the booster's best iteration).
results = list()

# Train one LightGBM model per fold and record its best iteration.
for fold in folds:
    train_start, train_end, test_start, test_end = fold

    print("Extracting fold...")
    train, test = extract_train_test(df, train_start, train_end, test_start, test_end)

    # ---- training part of the fold ----
    print("Preparing train...")
    train = fill_empty_sales(train)
    train, ext_cols = extend_dataset(train, items, stores)
    x_cols = ext_cols
    train_y = train['unit_sales']
    train_X = train[x_cols]
    train_weights = get_weights(train['item_nbr'])
    train_dataset = lgb.Dataset(train_X, weight=train_weights, label=train_y)

    # ---- validation part of the fold ----
    print("Preparing test...")
    test = fill_empty_sales(test)
    test, _ = extend_dataset(test, items, stores)
    test_y = test['unit_sales']
    test_X = test[x_cols]
    test_weights = get_weights(test['item_nbr'])
    # `reference=train_dataset` ties the validation set to the training Dataset.
    test_dataset = lgb.Dataset(
        test_X,
        weight=test_weights,
        label=test_y,
        reference=train_dataset,
    )

    print("Training!")
    # NOTE(review): every feature column is also declared categorical here
    # (categorical_feature=x_cols) -- confirm this is intended for all of them.
    bst = lgb.train(
        param,
        train_dataset,
        valid_sets=[test_dataset],
        feature_name=x_cols,
        categorical_feature=x_cols,
        early_stopping_rounds=10,
        verbose_eval=True,
    )
    results.append(bst.best_iteration)

Extracting fold...


KeyboardInterrupt: 

In [10]:
# Shell escape: send the collected per-fold results via `telegram-send`.
# IPython expands `$results` to the Python variable `results` before the
# command is run.
!telegram-send "Validation finished. Results: $results."