In [2]:
import pandas as pd
import numpy as np
import os
import gc
from itertools import product
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline 

In [4]:
# Objective:
# predict total sales for every product and store in the next month based on lagged features

# NOTE:
# to use tqdm_notebook with python 3 (https://github.com/tqdm/tqdm/issues/187)

### Load data

In [5]:
transactions = pd.read_csv('sales_train.csv.gz')
items = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops = pd.read_csv('shops.csv')

print(transactions.shape)
print(items.shape)
print(item_categories.shape)
print(shops.shape)
transactions.head(3)

(2935849, 6)
(22170, 3)
(84, 2)
(60, 2)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0


### Create Feature Matrix

1. Downcast types to save memory (method taken from Advanced ML HSE)

In [6]:
def downcast_dtypes(df):
    ''' Changes column types in the dataframe: 
        `float64` type to `float32`
        `int64`   type to `int32`
    Args:
        df - pandas data frame
    Returns:
        df - downcasted data frame
    '''
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]

    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    return df

def create_grid(sales, index_cols = ['shop_id', 'item_id', 'date_block_num']):
    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in sales['date_block_num'].unique():
        cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
        cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into pandas dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Get aggregated values for (shop_id, item_id, month)
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    
    # Join aggregated data to the grid
    all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
    
    # shop-month aggregates
    gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

    # item-month aggregates
    gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)
    
    # Sort the data
    all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
    all_data = downcast_dtypes(all_data)
    return all_data

In [7]:
path = "transactions_all_data_2-1.csv"
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

if os.path.isfile(path):
    data = pd.read_csv(path)
else:
    data = create_grid(transactions, index_cols)
    data.to_csv(path, index=False)
    
gc.collect();

In [8]:
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item
0,0,19,0,0.0,5578.0,1.0
1,0,27,0,0.0,5578.0,7.0
2,0,28,0,0.0,5578.0,8.0
3,0,29,0,0.0,5578.0,4.0
4,0,32,0,6.0,5578.0,299.0


### Aggregates as a form of mean encoding

In [9]:
# Check if aggregates make sense
# Total sales for item_id 32 for date block 0 is 299
assert(data[(data.item_id == 32) & (data.date_block_num==0)].target.sum()==299)
# Total sales for shop_id 0 for date block 0 is 5578
assert(data[(data.shop_id == 0) & (data.date_block_num==0)].target.sum()==5578)

Note that assigning aggregate values as a feature is a form of mean encoding.

### Get Lagged features

Using 1,3,12 for ranges. (Note: Adding more lagged features resulted in memry issues. Need to address)

In [10]:

# List of columns that we will use to create lags
cols_to_rename = list(data.columns.difference(index_cols)) 
print(cols_to_rename)

# Lags to use
shift_range = [1,3,12]

for month_shift in tqdm_notebook(shift_range):
    
    train_shift = data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    data = pd.merge(data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
data = data[data['date_block_num'] >= 12] 

# List of all lagged features
fit_cols = [col for col in data.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

data = pd.merge(data, item_category_mapping, how='left', on='item_id')
data = downcast_dtypes(data)
gc.collect();

['target', 'target_item', 'target_shop']





In [11]:
path = "2-1-lagged-features.csv"

if os.path.isfile(path):
    data = pd.read_csv(path)
else:
    data.to_csv(path, index=False)

In [12]:
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,2,27,12,0.0,890.0,1.0,0.0,4.0,1322.0,0.0,6.0,795.0,1.0,7.0,1146.0,19
1,2,30,12,0.0,890.0,58.0,0.0,47.0,1322.0,0.0,24.0,795.0,0.0,0.0,0.0,40
2,2,31,12,0.0,890.0,15.0,0.0,25.0,1322.0,0.0,25.0,795.0,0.0,0.0,0.0,37
3,2,32,12,1.0,890.0,84.0,0.0,89.0,1322.0,0.0,58.0,795.0,0.0,299.0,1146.0,40
4,2,33,12,1.0,890.0,42.0,1.0,42.0,1322.0,0.0,33.0,795.0,1.0,61.0,1146.0,37


### Train/Valid Split

In [13]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = data['date_block_num']

last_block = dates.max()
print('Valid `date_block_num` is %d' % last_block)

Valid `date_block_num` is 33


In [14]:
['target_item', 'target', 'target_shop', 'date_block_num']

['target_item', 'target', 'target_shop', 'date_block_num']

Split the data into train and validation.

Note: need to drop 'target_item', 'target', 'target_shop', 'date_block_num' as leaving them will introduce look ahead bias.

In [15]:
dates_train = dates[dates <  last_block]
dates_valid  = dates[dates == last_block]

# Drop columns that would introduce look ahead bias
X_train = data.loc[dates <  last_block].drop(to_drop_cols, axis=1)
X_valid =  data.loc[dates == last_block].drop(to_drop_cols, axis=1)

y_train = data.loc[dates <  last_block, 'target'].values
y_valid =  data.loc[dates == last_block, 'target'].values

In [16]:
X_train.head()

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,item_category_id
0,2,27,0.0,4.0,1322.0,0.0,6.0,795.0,19
1,2,30,0.0,47.0,1322.0,0.0,24.0,795.0,40
2,2,31,0.0,25.0,1322.0,0.0,25.0,795.0,37
3,2,32,0.0,89.0,1322.0,0.0,58.0,795.0,40
4,2,33,1.0,42.0,1322.0,0.0,33.0,795.0,37


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

lr = LinearRegression()
lr.fit(X_train.values, y_train)
pred_lr = lr.predict(X_valid.values)

print('Test R-squared for linreg is %f' % r2_score(y_valid, pred_lr))

Test R-squared for linreg is 0.221436


In [18]:
import lightgbm as lgb

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_valid)

print('Test R-squared for LightGBM is %f' % r2_score(y_valid, pred_lgb))

Test R-squared for LightGBM is 0.317940


In [20]:
# TODO: 
# 1. Concat train and test, create features, and split into train and test
# 2. Split train into train and validation
# 3. Find model to predict, and then apply model to test.
# 4. Submit for evaluation
