In [1]:
# Third-party and standard-library dependencies used throughout the notebook.
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
# Default size for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (10, 6)

# `date`, `model` and `metric` are not standard-library modules; presumably
# they are project files living next to this notebook (TODO confirm).
# `date` and `model` are reloaded before their star-imports so that the names
# are re-bound from the freshly executed module without restarting the kernel.
# NOTE(review): the star-imports hide where the helpers called in later cells
# are defined; explicit imports would make that visible.
from importlib import reload
import date
reload(date)
from date import *
import model
reload(model)
from model import *
from metric import get_weights, NWRMSLE_log

In [2]:
# Load the training rows for the period 2017-04-04 .. 2017-08-15.
# The helper's log reports "Reading batch from position 110000000, batch size
# 10000000", so the last argument is the position where reading starts.
# NOTE(review): presumably the offset exists to skip rows older than the
# requested date range; confirm against load_data_in_date_range.
df = load_data_in_date_range('./data/train_processed.csv', '2017-04-04', '2017-08-15', 110000000)

Reading batch from position 110000000, batch size 10000000...
Filtering 10000000...
Filtered 8659998, mapping...
Mapped, reducing...
Batch done.
Reading batch from position 120000000, batch size 10000000...
Filtering 5497041...
Filtered 5497041, mapping...
Mapped, reducing...
Batch done.
End of dataset is found.


In [3]:
# Item and store lookup tables; both are later passed to extend_dataset()
# to enrich the training and test frames.
print('Reading additional datasets')
items = pd.read_csv('./data/items_encoded.csv')
stores = pd.read_csv('./data/stores_encoded.csv')

Reading additional datasets


In [4]:
# Prepare the target and attach item/store attributes.
# NOTE(review): the submission cell applies np.expm1 to predictions, so
# convert_unit_sales presumably moves unit_sales to log1p space; confirm.
print('Converting and joining additional data')
df = convert_unit_sales(df)
df = fill_empty_sales(df)
# extend_dataset returns the enriched frame plus a list of column names
# (`cols_categories`) that is later used as part of the model's feature list
# and of its categorical-feature list.
df, cols_categories = extend_dataset(df, items, stores)

Converting and joining additional data


In [None]:
# Inspect the column dtypes after the item/store attributes were joined.
df.dtypes

date             int16
store_nbr        int16
item_nbr         int32
id             float64
unit_sales     float32
onpromotion       bool
weekday          int16
weekend           bool
salary            bool
family           int16
class            int16
perishable        bool
city             int16
state            int16
type             int16
cluster          int16
dtype: object

In [None]:
# Earlier experiment, kept for reference: encode every single categorical
# feature plus the two store/item combinations.
# cat_features = items_cols + stores_cols + ['store_nbr','item_nbr','onpromotion']
# combinations = list(itertools.combinations(cat_features, 1)) + \
# [['store_nbr','item_nbr'], ['store_nbr','item_nbr', 'onpromotion']]

# Column groups for which a mean-of-target feature is added. The same list is
# reused later when encoding the test set.
combinations = [['item_nbr'],
                ['store_nbr','item_nbr'], 
                ['store_nbr','item_nbr', 'onpromotion']]
print('Adding mean target encoding')

# First pass with the helper's default ranges; `cols_mean` receives the names
# of the added columns (e.g. 'mean_unit_sales_by_(item_nbr)').
df, cols_mean = add_mean_encoding(df, combinations)

print("="*80)

# Second pass over explicitly supplied ranges, with the '1week_' prefix passed
# to the helper.
# NOTE(review): presumably 16 one-week windows ending at 2017-08-15; confirm
# against get_one_week_ranges.
ranges2 = get_one_week_ranges(16, get_date_index_parse('2017-08-15'))
df, cols_mean2 = add_mean_encoding(df, combinations, ranges=ranges2, prefix='1week_')
# Keep a single list holding the mean-encoded column names of both passes.
cols_mean += cols_mean2
# Move the current index back into regular columns, then shrink dtypes.
df.reset_index(inplace=True)
df = optimize_df_types(df)
df.head()

Adding mean target encoding


In [None]:
# Inspect the column dtypes again after mean encoding and optimize_df_types().
df.dtypes

In [8]:
print('Adding lagged features')
# Source frame for the lagged sales values. fill_lagged assigns new columns on
# this frame (the recorded warning points at
# `df_prev[colname] = df_prev['unit_sales']`), so take an explicit copy: that
# makes it unambiguous that `df` itself is not modified through `df_prev` and
# avoids pandas' SettingWithCopyWarning.
df_prev = df[['item_nbr', 'store_nbr', 'date', 'unit_sales']].copy()
# `cols_lagged` receives the names of the added lag columns; it is part of the
# feature list built in the validation cell.
# NOTE(review): 12 and 14 look like the first and last lag to add (the log
# prints "Adding lag 12..."); confirm against fill_lagged. The recorded run
# raised MemoryError here, which left `cols_lagged` undefined for later cells.
df, cols_lagged = fill_lagged(df, df_prev, 12, 14, True)
# The helper frame is no longer needed; release it before training.
del df_prev
gc.collect()
df.head()

Adding lagged features


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_prev[colname] = df_prev['unit_sales']


Adding lag 12...


MemoryError: 

In [None]:
# Progress notification sent through the `telegram-send` command-line tool.
!telegram-send "Lagged and mean extraction is ready, starting validation."

In [None]:
# Keep only 2017-05-16 .. 2017-08-15, which is exactly the span covered by the
# validation folds defined in the next cell.
# NOTE(review): presumably the earlier rows were only needed as history for the
# lagged / mean-encoded features; confirm.
df = extract_by_date(df, '2017-05-16', '2017-08-15')
gc.collect()
df.head()

In [None]:
# Time-ordered validation folds. Each tuple is
# (train_start, train_end, test_start, test_end): about six weeks of training
# data followed by roughly two weeks of validation data, with the whole window
# shifted forward by about half a month from one fold to the next.
folds = [('2017-05-16', '2017-06-30', '2017-07-01', '2017-07-15'),
         ('2017-06-01', '2017-07-15', '2017-07-16', '2017-07-31'),
         ('2017-06-16', '2017-07-31', '2017-08-01', '2017-08-15')]

# LightGBM parameters shared by the validation runs and the final training.
# NOTE(review): 'regression_l2' and 'l2_root' appear to be LightGBM aliases for
# L2 regression and RMSE; confirm against the LightGBM parameter docs for the
# installed version.
param = {
    'num_leaves':30,
    'objective':'regression_l2',
    'metric':'l2_root',
    'num_threads':4
        }

In [None]:
# Candidate feature columns to exclude from the model. Only used through
# remove_cols(); the calls in the validation cell are currently commented out,
# so this set has no effect on the features as the notebook stands.
to_remove = {'perishable', 'type', 'city', 'family',
            'mean_unit_sales_by_(state)', 'mean_unit_sales_by_(city)',
            'mean_unit_sales_by_(cluster)', 'mean_unit_sales_by_(type)'}
def remove_cols(cols, remove):
    """Return the entries of ``cols`` that do not occur in ``remove``.

    Each surviving column appears once, in the position of its first
    occurrence in ``cols``. A plain set difference would return the columns in
    arbitrary order, which can differ between kernel sessions; a stable order
    keeps the resulting feature list reproducible.

    Args:
        cols: iterable of column names to filter.
        remove: iterable of column names to drop.

    Returns:
        A new list; neither argument is modified.
    """
    excluded = set(remove)
    seen = set()
    kept = []
    for col in cols:
        # Skip dropped columns and repeated names (the result stays duplicate-free).
        if col in excluded or col in seen:
            continue
        seen.add(col)
        kept.append(col)
    return kept

In [12]:
# Show the names of the mean-encoded feature columns collected so far.
cols_mean

['mean_unit_sales_by_(item_nbr)',
 'mean_unit_sales_by_(store_nbr+item_nbr)',
 'mean_unit_sales_by_(store_nbr+item_nbr+onpromotion)']

In [13]:
# Time-based cross-validation: train one LightGBM model per fold and collect
# the per-fold validation error and the trained booster.
# NOTE(review): in the recorded run this cell failed with
# "NameError: name 'cols_lagged' is not defined" because the lagged-feature
# cell had aborted with a MemoryError.
errors = []
bsts = []
# NOTE(review): `mean_c` is not referenced anywhere else in the visible
# notebook; confirm it is still needed.
mean_c = ['mean_unit_sales_by_(store_nbr+item_nbr+onpromotion)', 
          #'mean_unit_sales_by_(item_nbr)', 
          #'mean_unit_sales_by_(store_nbr+item_nbr)',
          #'mean_unit_sales_by_(store_nbr)'
         ]

# Model inputs: joined item/store attributes, the promotion flag, lagged sales
# and the mean-encoded columns.
x_cols = cols_categories + ['onpromotion'] + cols_lagged + cols_mean
#x_cols = remove_cols(x_cols, to_remove)

# Subset of the inputs that LightGBM is told to treat as categorical.
cat_cols = cols_categories + ['onpromotion']
#cat_cols = remove_cols(cat_cols, to_remove)

for train_start, train_end, test_start, test_end in folds:
    print("Extracting fold...")
    train = extract_by_date(df, train_start, train_end)
    test= extract_by_date(df, test_start, test_end)
    
    print("Preparing train...")
    # Missing feature values are replaced by 0.0 for both train and validation.
    train_X = train[x_cols].fillna(0.0)
    train_y = train['unit_sales']
    # Per-row weights derived from the item of each row.
    train_weights = get_weights(train['item_nbr'])
    # Drop this cell's references to large intermediates as soon as they are
    # no longer needed, and ask the garbage collector to run.
    del train
    gc.collect()
    train_dataset = lgb.Dataset(train_X, label=train_y, weight=train_weights)
    del train_X
    del train_y
    del train_weights
    gc.collect()
    
    print("Preparing test...")
    test_X = test[x_cols].fillna(0.0)
    test_y = test['unit_sales']
    test_weights = get_weights(test['item_nbr'])
    del test
    gc.collect()
    # The validation Dataset is created with the training Dataset as its
    # reference. test_X / test_y / test_weights are kept because they are
    # needed again for the explicit scoring below.
    test_dataset = lgb.Dataset(test_X, label=test_y, weight=test_weights, reference=train_dataset)
    gc.collect()

    print("="*80)
    # At most 150 boosting rounds; early_stopping_rounds=10 is evaluated
    # against the single validation set passed in valid_sets.
    bst = lgb.train(param, 
                    train_dataset, 
                    150,
                    valid_sets=[test_dataset], 
                    early_stopping_rounds=10, 
                    verbose_eval=True, 
                    feature_name=x_cols, 
                    categorical_feature=cat_cols)
    
    del test_dataset
    del train_dataset
    gc.collect()
    
    # Score the fold with the project's NWRMSLE_log metric.
    # NOTE(review): the argument order (prediction, truth, weights) is taken
    # from this call only; confirm against metric.NWRMSLE_log.
    test_y_pred = bst.predict(test_X)
    error = NWRMSLE_log(test_y_pred, test_y, test_weights)
    print('Validation error: {}'.format(error))
    print("="*80)
    
    errors.append(error)
    bsts.append(bst)

NameError: name 'cols_lagged' is not defined

In [None]:
mean_error = np.mean(errors)
!telegram-send "Mean lagged xgb validation finished. Results: $mean_error"

In [None]:
# (importance, feature name) pairs for the first fold's booster, ordered from
# the least to the most important feature.
# NOTE(review): assumes feature_importance() returns values in the same order
# as x_cols; confirm.
prev = sorted(zip(bsts[0].feature_importance(), x_cols),
              key=lambda pair: pair[0])

In [None]:
# Plot the feature importances of the first fold's booster.
lgb.plot_importance(bsts[0])

In [37]:
# Fit the final model on 2017-07-01 .. 2017-08-15 for a fixed number of rounds
# (no validation set is passed, so there is no early stopping).
# NOTE(review): this mutates the shared `param` dict, so re-running the
# validation cell afterwards would also see 'task'. LightGBM's parameter docs
# appear to describe `task` as a CLI option; confirm it has any effect on
# lgb.train.
param['task'] = 'prediction'
start = '2017-07-01'
end = '2017-08-15'
num_round = 150
train = extract_by_date(df, start, end)
# NOTE(review): unlike the validation folds, the features are not passed
# through .fillna(0.0) here; confirm the difference is intended.
train_X = train[x_cols]
train_y = train['unit_sales']
train_weights = get_weights(train['item_nbr'])
train_dataset = lgb.Dataset(train_X, label=train_y, weight=train_weights)
bst = lgb.train(param, 
                train_dataset,
                num_round,
                feature_name=x_cols, 
                categorical_feature=cat_cols)



In [38]:
# Build the test-set features and predict with the final model.
# NOTE(review): `types` is not defined in this notebook; presumably it comes
# from one of the star-imports. Confirm.
test = pd.read_csv('./data/test_processed.csv', dtype=types)
test, _ = extend_dataset(test, items, stores)
# Recent history from which lagged and mean-encoded features for the test rows
# are derived. fill_lagged assigns new columns on this frame (the recorded
# warning points at `df_prev[colname] = df_prev['unit_sales']`), so take an
# explicit copy: `df` is then guaranteed to stay untouched and pandas no
# longer emits SettingWithCopyWarning.
df_prev = extract_by_date(df, '2017-07-25', '2017-08-15').copy()

# NOTE(review): training used fill_lagged(..., 12, 14, True) while this call
# uses (12, 18), and only the un-prefixed mean encodings are filled here;
# confirm that every column in x_cols exists on `test`.
test, _ = fill_lagged(test, df_prev, 12, 18)
test, _ = fill_mean_encoding(test, df_prev, combinations)
# Cast to bool, the dtype this column has in the training frame.
test.perishable = test.perishable.astype('bool')

test_X = test[x_cols]
# Predictions are stored under the target's column name for the submission.
test['unit_sales'] = bst.predict(test_X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_prev[colname] = df_prev['unit_sales']


In [39]:
test.sort_values(by='id', inplace=True)
test.ix[test.unit_sales < 0, 'unit_sales'] = 0
test['unit_sales'] = np.expm1(test['unit_sales'])
test[['id', 'unit_sales']].to_csv('./submissions/lgb_mean_encoded_lagged_0.53.csv.gz', float_format="%.4f", index=False, compression='gzip')
!telegram-send "Submission is done."

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


[0m