In [1]:
import pandas as pd
import numpy as np
import os
import gc
from itertools import product
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline 

In [64]:
# Objective:
# predict total sales for every product and store in the next month based on lagged features and mean encodings

NOTE:
1. adding mean encodings to lagged features (2-0-prediction_lagged_features) from 3-0-mean_encodings notebook
2. result is ~1.02...first pass of mean encoding results in worse performance. I need to check procedure behind mean encoding to make sure its implemented correctly

### Load data

In [65]:
transactions = pd.read_csv('sales_train.csv.gz')
print(transactions.shape)
transactions.head(3)

(2935849, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0


### Create Feature Matrix

1. Downcast types to save memory (method taken from Advanced ML HSE)

In [4]:
def downcast_dtypes(df):
    ''' Changes column types in the dataframe: 
        `float64` type to `float32`
        `int64`   type to `int32`
    Args:
        df - pandas data frame
    Returns:
        df - downcasted data frame
    '''
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]

    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    return df

def create_grid(sales, index_cols = ['shop_id', 'item_id', 'date_block_num']):
    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in sales['date_block_num'].unique():
        cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
        cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into pandas dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Get aggregated values for (shop_id, item_id, month)
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    
    # Join aggregated data to the grid
    all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
    
    # shop-month aggregates
    gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

    # item-month aggregates
    gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)
    
    # Sort the data
    all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
    all_data = downcast_dtypes(all_data)
    return all_data

In [5]:
path = "transactions_all_data_2-1.csv"
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

if os.path.isfile(path):
    data = pd.read_csv(path)
else:
    data = create_grid(transactions, index_cols)
    data.to_csv(path, index=False)
    
gc.collect();

In [6]:
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item
0,0,19,0,0.0,5578.0,1.0
1,0,27,0,0.0,5578.0,7.0
2,0,28,0,0.0,5578.0,8.0
3,0,29,0,0.0,5578.0,4.0
4,0,32,0,6.0,5578.0,299.0


In [7]:
data.shape

(10913850, 6)

### Aggregates as a form of mean encoding
Note that assigning aggregate values as a feature is a form of mean encoding.

In [8]:
# Check if aggregates make sense
# Total sales for item_id 32 for date block 0 is 299
assert(data[(data.item_id == 32) & (data.date_block_num==0)].target.sum()==299)
# Total sales for shop_id 0 for date block 0 is 5578
assert(data[(data.shop_id == 0) & (data.date_block_num==0)].target.sum()==5578)


#### Change test set dimensions to match the rest of data

In [9]:
test = pd.read_csv('test.csv')
test.head(3)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233


In [10]:
test_orig = test.copy()
test['date_block_num'] = np.ones(len(test.index))* 34
test['target'] = np.zeros(len(test.index))
test['target_shop'] = np.zeros(len(test.index))
test['target_item'] = np.zeros(len(test.index))
cols = test.columns.tolist()
print(cols)
test = test[cols]
ID = test['ID']
del test['ID']

test.head()

['ID', 'shop_id', 'item_id', 'date_block_num', 'target', 'target_shop', 'target_item']


Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item
0,5,5037,34.0,0.0,0.0,0.0
1,5,5320,34.0,0.0,0.0,0.0
2,5,5233,34.0,0.0,0.0,0.0
3,5,5232,34.0,0.0,0.0,0.0
4,5,5268,34.0,0.0,0.0,0.0


In [11]:
test.shape

(214200, 6)

In [12]:
all_data = pd.concat([data,test], axis=0)

In [13]:
assert(all_data.shape[0] == data.shape[0] + test.shape[0])

### Get Lagged features

Using 1,3,12 for ranges. (Note: Adding more lagged features resulted in memry issues. Need to address)

In [14]:
data = all_data

def get_lagged(data, shift_range = [1,3,12],index_cols = ['shop_id', 'item_id', 'date_block_num']):
    # List of columns that we will use to create lags
    cols_to_rename = list(data.columns.difference(index_cols)) 

    for month_shift in tqdm_notebook(shift_range):

        train_shift = data[index_cols + cols_to_rename].copy()

        train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

        foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        data = pd.merge(data, train_shift, on=index_cols, how='left').fillna(0)

    del train_shift

    # Don't use old data from year 2013
    data = data[data['date_block_num'] >= 12] 

    # List of all lagged features
    fit_cols = [col for col in data.columns if col[-1] in [str(item) for item in shift_range]] 
    # We will drop these at fitting stage
    to_drop_cols = list(set(list(data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

    # Category for each item
    item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

    data = pd.merge(data, item_category_mapping, how='left', on='item_id')
    data = downcast_dtypes(data)
    gc.collect();
    return data, to_drop_cols

In [15]:
path = "2-1-lagged-features-all.csv"
if os.path.isfile(path):
    data = pd.read_csv(path)
else:
    data, to_drop_cols = get_lagged(data)
    data.to_csv(path, index=False)

In [16]:
to_drop_cols = ['target_item', 'target', 'target_shop', 'date_block_num']

In [17]:
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,2,27,12.0,0.0,890.0,1.0,0.0,4.0,1322.0,0.0,6.0,795.0,1.0,7.0,1146.0,19
1,2,30,12.0,0.0,890.0,58.0,0.0,47.0,1322.0,0.0,24.0,795.0,0.0,0.0,0.0,40
2,2,31,12.0,0.0,890.0,15.0,0.0,25.0,1322.0,0.0,25.0,795.0,0.0,0.0,0.0,37
3,2,32,12.0,1.0,890.0,84.0,0.0,89.0,1322.0,0.0,58.0,795.0,0.0,299.0,1146.0,40
4,2,33,12.0,1.0,890.0,42.0,1.0,42.0,1322.0,0.0,33.0,795.0,1.0,61.0,1146.0,37


### Train/Valid/Test Split

In [18]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = data['date_block_num']

last_block = dates.max()
print('Test`date_block_num` is %d' % last_block)

Test`date_block_num` is 34


Split the data into train and validation.

Note: need to drop 'target_item', 'target', 'target_shop', 'date_block_num' as leaving them will introduce look ahead bias.

In [19]:
dates_train = dates[dates <  last_block]
dates_test  = dates[dates == last_block]
dates_valid = dates.unique()[-4:-1] # 31,32,33

# Drop columns that would introduce look ahead bias
X_train = data.loc[dates <  dates_valid[0]]#.drop(to_drop_cols, axis=1)
X_valid = data.loc[dates.isin(dates_valid)]#.drop(to_drop_cols, axis=1)
X_test =  data.loc[dates == last_block]#.drop(to_drop_cols, axis=1)

y_train = data.loc[dates < dates_valid[0], 'target'].values
y_valid = data.loc[dates.isin(dates_valid), 'target'].values
y_test =  data.loc[dates == last_block, 'target'].values



In [20]:
X_test.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
6425094,5,5037,34.0,0.0,0.0,0.0,0.0,25.0,1052.0,3.0,119.0,1294.0,1.0,65.0,1445.0,19
6425095,5,5320,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55
6425096,5,5233,34.0,0.0,0.0,0.0,1.0,42.0,1052.0,1.0,150.0,1294.0,0.0,0.0,0.0,19
6425097,5,5232,34.0,0.0,0.0,0.0,0.0,28.0,1052.0,1.0,65.0,1294.0,0.0,0.0,0.0,23
6425098,5,5268,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20


### Mean Encodings form Items

In [21]:
# Encode based on train data
target_mean_item = X_train.groupby('item_id').target.mean()
target_mean_shop = X_train.groupby('shop_id').target.mean()
print(target_mean_item.head())
print(target_mean_shop.head())

item_id
0    0.020000
1    0.023810
2    0.019802
3    0.019802
4    0.020000
Name: target, dtype: float64
shop_id
2    0.151596
3    0.134607
4    0.187158
5    0.205499
6    0.407092
Name: target, dtype: float64


In [23]:
# map encodings to train and valid (Takes time to complete)
X_train['item_id_'+ 'mean_encoded'] =  X_train['item_id'].map(target_mean_item)
X_valid['item_id_'+ 'mean_encoded'] =  X_valid['item_id'].map(target_mean_item)
X_train['shop_id_'+ 'mean_encoded'] =  X_train['shop_id'].map(target_mean_shop)
X_valid['shop_id_'+ 'mean_encoded'] =  X_valid['shop_id'].map(target_mean_shop)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [24]:
X_train.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_id_mean_encoded,shop_id_mean_encoded
0,2,27,12.0,0.0,890.0,1.0,0.0,4.0,1322.0,0.0,6.0,795.0,1.0,7.0,1146.0,19,0.03125,0.151596
1,2,30,12.0,0.0,890.0,58.0,0.0,47.0,1322.0,0.0,24.0,795.0,0.0,0.0,0.0,40,0.31978,0.151596
2,2,31,12.0,0.0,890.0,15.0,0.0,25.0,1322.0,0.0,25.0,795.0,0.0,0.0,0.0,37,0.286813,0.151596
3,2,32,12.0,1.0,890.0,84.0,0.0,89.0,1322.0,0.0,58.0,795.0,0.0,299.0,1146.0,40,0.712088,0.151596
4,2,33,12.0,1.0,890.0,42.0,1.0,42.0,1322.0,0.0,33.0,795.0,1.0,61.0,1146.0,37,0.408791,0.151596


In [25]:
# Drop columns that would introduce look ahead bias
X_train.drop(to_drop_cols, axis=1)
X_valid.drop(to_drop_cols, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
X_train.head()

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_id_mean_encoded,shop_id_mean_encoded
0,2,27,0.0,4.0,1322.0,0.0,6.0,795.0,1.0,7.0,1146.0,19,0.03125,0.151596
1,2,30,0.0,47.0,1322.0,0.0,24.0,795.0,0.0,0.0,0.0,40,0.31978,0.151596
2,2,31,0.0,25.0,1322.0,0.0,25.0,795.0,0.0,0.0,0.0,37,0.286813,0.151596
3,2,32,0.0,89.0,1322.0,0.0,58.0,795.0,0.0,299.0,1146.0,40,0.712088,0.151596
4,2,33,1.0,42.0,1322.0,0.0,33.0,795.0,1.0,61.0,1146.0,37,0.408791,0.151596


In [40]:
X_train.describe()

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_id_mean_encoded,shop_id_mean_encoded
count,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0,5753731.0
mean,31.91981,11284.43,0.3353855,16.08588,1781.046,0.3391274,16.22326,1737.201,0.273326,12.42967,1256.143,44.76921,0.3259615,0.3259615
std,17.20857,6270.037,3.54489,105.4839,1925.056,3.569308,109.4036,1982.859,3.148013,98.1477,1952.963,15.57896,1.742787,0.2459848
min,2.0,0.0,-2.0,-18.0,0.0,-4.0,-18.0,0.0,-4.0,-2.0,0.0,2.0,-0.09803922,0.06630884
25%,17.0,5787.0,0.0,1.0,680.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.04676754,0.1861488
50%,31.0,11375.0,0.0,4.0,1371.0,0.0,4.0,1371.0,0.0,0.0,0.0,40.0,0.1125,0.2623639
75%,47.0,16585.0,0.0,12.0,2109.0,0.0,12.0,2145.0,0.0,8.0,1981.0,55.0,0.2764505,0.3307689
max,59.0,22169.0,1305.0,12557.0,16318.0,1305.0,12557.0,16318.0,1305.0,12557.0,16318.0,83.0,124.0385,1.334716


#### Fill Nans in Valid set with mean of Train item_id and shop_id # 0.3259615

In [43]:
X_valid.item_id_mean_encoded.fillna(3.259615e-01, inplace=True)
X_valid.shop_id_mean_encoded.fillna(3.259615e-01, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [44]:
# Can visually confirm that count is consistent across features
X_valid.describe()

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_id_mean_encoded,shop_id_mean_encoded
count,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0,671363.0
mean,31.558994,11163.238987,0.272738,11.551807,1201.257841,0.265071,11.599685,1055.313285,0.271883,13.725709,1000.39184,46.148277,0.522344,0.323515
std,17.66054,6291.287853,2.260457,56.713475,1305.179881,2.840675,68.095708,1224.921209,3.152367,103.167049,1501.866264,16.684531,2.170487,0.241388
min,2.0,30.0,-1.0,-1.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,0.0,2.0,0.0,0.066309
25%,16.0,5467.0,0.0,1.0,420.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.095349,0.187158
50%,34.0,11373.0,0.0,3.0,1007.0,0.0,3.0,894.0,0.0,0.0,0.0,43.0,0.249451,0.262364
75%,47.0,16199.0,0.0,10.0,1471.0,0.0,10.0,1354.0,0.0,8.0,1558.0,58.0,0.430233,0.330769
max,59.0,22167.0,482.0,3551.0,6867.0,742.0,3768.0,6327.0,697.0,6571.0,8583.0,83.0,124.038462,1.334716


In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

lr = LinearRegression()
lr.fit(X_train.values, y_train)
pred_lr = lr.predict(X_valid.values)

print('Test R-squared for linreg is %f' % r2_score(y_valid, pred_lr))

Test R-squared for linreg is 0.159936


In [46]:
import lightgbm as lgb

lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_valid)

print('Test R-squared for LightGBM is %f' % r2_score(y_valid, pred_lgb))

Test R-squared for LightGBM is 0.189863


# Create Mean Encodings using all train data


In [47]:
# Create train data using all except test.
X_train_all = data.loc[dates <  last_block]
y_train_all = data.loc[dates < last_block, 'target'].values

In [48]:
# Encode based on train data
target_mean_item_all = X_train_all.groupby('item_id').target.mean()
target_mean_shop_all = X_train_all.groupby('shop_id').target.mean()
print(target_mean_item_all.head())
print(target_mean_shop_all.head())

item_id
0    0.020000
1    0.023810
2    0.019802
3    0.019802
4    0.020000
Name: target, dtype: float64
shop_id
2    0.152520
3    0.134041
4    0.184126
5    0.207204
6    0.397856
Name: target, dtype: float64


In [49]:
# map encodings to train and valid (Takes time to complete)
X_train_all['item_id_'+ 'mean_encoded'] =  X_train_all['item_id'].map(target_mean_item_all)
X_test['item_id_'+ 'mean_encoded'] =  X_test['item_id'].map(target_mean_item_all)
X_train_all['shop_id_'+ 'mean_encoded'] =  X_train_all['shop_id'].map(target_mean_shop_all)
X_test['shop_id_'+ 'mean_encoded'] =  X_test['shop_id'].map(target_mean_shop_all)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [52]:
# Drop columns that would introduce look ahead bias
X_train_all=X_train_all.drop(to_drop_cols, axis=1)
X_test=X_test.drop(to_drop_cols, axis=1)

In [54]:
X_train_all.describe()

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_id_mean_encoded,shop_id_mean_encoded
count,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0,6425094.0
mean,31.88211,11271.76,0.3288394,15.61211,1720.463,0.3313892,15.74014,1665.95,0.2731751,12.5651,1229.419,44.91331,0.3245825,0.3245825
std,17.2567,6272.37,3.433288,101.4998,1878.316,3.500349,105.8537,1929.039,3.148468,98.68491,1912.418,15.70379,1.681631,0.2434457
min,2.0,0.0,-2.0,-18.0,0.0,-4.0,-18.0,0.0,-4.0,-2.0,0.0,2.0,-0.09803922,0.06096435
25%,17.0,5706.0,0.0,1.0,646.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.04861111,0.1818296
50%,33.0,11375.0,0.0,4.0,1317.0,0.0,3.0,1299.0,0.0,0.0,0.0,40.0,0.1162791,0.2540483
75%,47.0,16546.0,0.0,11.0,2053.0,0.0,11.0,2089.0,0.0,8.0,1924.0,56.0,0.2800687,0.3347779
max,59.0,22169.0,1305.0,12557.0,16318.0,1305.0,12557.0,16318.0,1305.0,12557.0,16318.0,83.0,119.2435,1.316269


In [53]:
X_test.describe()

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_id_mean_encoded,shop_id_mean_encoded
count,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,198324.0,214200.0
mean,31.642857,11019.398627,0.293413,13.289608,1215.913487,0.278137,11.558287,1051.369412,0.3238,16.214524,1114.512521,46.309608,0.542618,0.313164
std,17.561933,6252.64459,5.550976,75.376194,1344.777606,2.149646,57.106877,1154.292618,4.229684,138.908118,1706.517711,16.716581,2.167434,0.241672
min,2.0,30.0,-1.0,0.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.014388,0.060964
25%,16.0,5381.5,0.0,1.0,383.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.079772,0.18183
50%,34.5,11203.0,0.0,4.0,997.0,0.0,3.0,966.0,0.0,0.0,0.0,43.0,0.197701,0.249946
75%,47.0,16071.5,0.0,11.0,1409.0,0.0,10.0,1471.0,0.0,9.0,1785.0,58.0,0.477273,0.32112
max,59.0,22167.0,2253.0,4078.0,6247.0,436.0,3551.0,5714.0,772.0,7721.0,9865.0,83.0,119.243503,1.316269


In [55]:
X_test.item_id_mean_encoded.fillna(3.245825e-01, inplace=True)
X_test.shop_id_mean_encoded.fillna(3.245825e-01, inplace=True)

In [56]:
model = lgb.train(lgb_params, lgb.Dataset(X_train_all, label=y_train_all), 100)
pred_test_lgb = model.predict(X_test)

In [58]:
test_temp = test.copy()

In [59]:
test_temp.target = pred_test_lgb
del test_temp['target_shop']
del test_temp['target_item']
del test_temp['date_block_num']
test_temp.head()

Unnamed: 0,shop_id,item_id,target
0,5,5037,0.561728
1,5,5320,0.215326
2,5,5233,0.763341
3,5,5232,0.464539
4,5,5268,1.108254


In [60]:
submission = test_orig.copy()
submission_temp = submission.merge(test_temp, on=['shop_id','item_id'], how='left')
submission_temp.head()

Unnamed: 0,ID,shop_id,item_id,target
0,0,5,5037,0.561728
1,1,5,5320,0.215326
2,2,5,5233,0.763341
3,3,5,5232,0.464539
4,4,5,5268,1.108254


In [61]:
submission_temp.max()

ID         214199.000000
shop_id        59.000000
item_id     22167.000000
target        355.570568
dtype: float64

In [62]:
sample = pd.read_csv('sample_submission.csv')
final_submission = sample.copy()
final_submission.item_cnt_month = submission_temp.target.clip(0,20)
final_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.561728
1,1,0.215326
2,2,0.763341
3,3,0.464539
4,4,1.108254


In [63]:
final_submission.to_csv('2-2-v0-submission.csv',index=False)