In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
import lightgbm as lgb
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook
from itertools import product
import gc


In [2]:
def downcast_dtype(df):
    """Shrink 64-bit numeric columns of ``df`` to their 32-bit counterparts.

    Columns whose dtype is ``float64`` are cast to ``float32`` and columns
    whose dtype is ``int64`` are cast to ``int32``; every other column is left
    untouched.  The frame is modified in place and the same object is returned.
    """
    wide_floats, wide_ints = [], []
    # Classify all columns first, then cast, so the casts cannot affect the scan.
    for name in df:
        kind = df[name].dtype
        if kind == 'float64':
            wide_floats.append(name)
        if kind == 'int64':
            wide_ints.append(name)
    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)
    return df

In [3]:
# Raw competition files, all read from the local ./all directory.
# Lookup tables first, then the transaction log, then the prediction inputs.
items = pd.read_csv('./all/items.csv')
item_cats = pd.read_csv('./all/item_categories.csv')
shops = pd.read_csv('./all/shops.csv')
sales = pd.read_csv('./all/sales_train.csv.gz')
tests = pd.read_csv('./all/test.csv.gz')
sample_submission = pd.read_csv("./all/sample_submission.csv.gz")

In [4]:
# `all_data_test` is an alias of `tests` (same object), so the month column
# added here is visible through both names.
all_data_test = tests
all_data_test['date_block_num'] = 34

In [5]:
# Build the (shop, item, month) training grid and attach monthly sales sums.
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, pair each shop seen in that month's sales with each item
# seen in that month's sales, so combinations without any sale get a row too.
grid = []
for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num  # computed once per month
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# (group-by keys, name of the resulting sum-of-item_cnt_day column):
#   shop/item/month, shop/month and item/month aggregates.
aggregates = [
    (index_cols, 'target'),
    (['shop_id', 'date_block_num'], 'target_shop'),
    (['item_id', 'date_block_num'], 'target_item'),
]

all_data = grid
for group_cols, target_name in aggregates:
    # A flat {column: func} spec followed by rename() replaces the nested-dict
    # renaming form, which pandas deprecated (FutureWarning) and later removed.
    gb = (sales.groupby(group_cols, as_index=False)
               .agg({'item_cnt_day': 'sum'})
               .rename(columns={'item_cnt_day': target_name}))
    # Left merges keep every grid/test row; rows without a match get 0.
    all_data = pd.merge(all_data, gb, on=group_cols, how='left').fillna(0)
    # The test frame receives the same columns so both frames share a layout.
    all_data_test = pd.merge(all_data_test, gb, on=group_cols, how='left').fillna(0)

all_data = downcast_dtype(all_data)
all_data_test = downcast_dtype(all_data_test)
del grid, gb, in_block

gc.collect()



  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


220

In [6]:
# Peek at the test frame after the aggregate columns were attached.
all_data_test.head(n=5)

Unnamed: 0,ID,shop_id,item_id,date_block_num,target,target_shop,target_item
0,0,5,5037,34,0.0,0.0,0.0
1,1,5,5320,34,0.0,0.0,0.0
2,2,5,5233,34,0.0,0.0,0.0
3,3,5,5232,34,0.0,0.0,0.0
4,4,5,5268,34,0.0,0.0,0.0


In [7]:
# Every non-index column gets lagged copies, one set per entry in shift_range.
cols_to_rename = list(all_data.columns.difference(index_cols))
shift_range = [1, 2, 3, 4, 5, 12]
for month_shift in tqdm_notebook(shift_range):
    lagged_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = all_data[index_cols + cols_to_rename].copy()
    # Moving the block number forward by the lag lets a plain key merge line up
    # a row of month t with the values recorded for month t - month_shift.
    train_shift['date_block_num'] += month_shift
    train_shift = train_shift.rename(columns=lagged_names)
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)
    all_data_test = pd.merge(all_data_test, train_shift, on=index_cols, how='left').fillna(0)
del train_shift
# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]




In [8]:
# Peek at the test frame after the lag columns were merged in.
all_data_test.head(n=5)

Unnamed: 0,ID,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,...,target_shop_lag_3,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12
0,0,5,5037,34,0.0,0.0,0.0,0.0,25.0,1052.0,...,1294.0,1.0,54.0,991.0,1.0,105.0,954.0,1.0,65.0,1445.0
1,1,5,5320,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,34,0.0,0.0,0.0,1.0,42.0,1052.0,...,1294.0,0.0,37.0,991.0,2.0,119.0,954.0,0.0,0.0,0.0
3,3,5,5232,34,0.0,0.0,0.0,0.0,28.0,1052.0,...,1294.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,5,5268,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# List of all lagged feature columns.
# Match the full '_lag_<shift>' suffix rather than only the last character of
# the name: a single-character test recognises 'lag_12' only because '2' is
# also a shift, and would miss shifts such as 10.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

In [10]:
# Everything that is neither a lag feature nor an index column is dropped before
# fitting; 'date_block_num' is an index column, so it is appended explicitly.
non_feature_cols = set(all_data.columns).difference(fit_cols, index_cols)
to_drop_cols = list(non_feature_cols) + ['date_block_num']

In [11]:
# Attach each item's category id to both frames, then downcast again because
# the merge result is a new frame.
item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()

all_data = downcast_dtype(
    all_data.merge(item_category_mapping, how='left', on='item_id')
)
all_data_test = downcast_dtype(
    all_data_test.merge(item_category_mapping, how='left', on='item_id')
)
gc.collect();

In [12]:
# Peek at the test frame after item_category_id was attached.
all_data_test.head(n=5)

Unnamed: 0,ID,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,...,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,0,5,5037,34,0.0,0.0,0.0,0.0,25.0,1052.0,...,1.0,54.0,991.0,1.0,105.0,954.0,1.0,65.0,1445.0,19
1,1,5,5320,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55
2,2,5,5233,34,0.0,0.0,0.0,1.0,42.0,1052.0,...,0.0,37.0,991.0,2.0,119.0,954.0,0.0,0.0,0.0,19
3,3,5,5232,34,0.0,0.0,0.0,0.0,28.0,1052.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23
4,4,5,5268,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20


In [13]:
# Display the lag feature names selected for scaling.
fit_cols

['target_lag_1',
 'target_item_lag_1',
 'target_shop_lag_1',
 'target_lag_2',
 'target_item_lag_2',
 'target_shop_lag_2',
 'target_lag_3',
 'target_item_lag_3',
 'target_shop_lag_3',
 'target_lag_4',
 'target_item_lag_4',
 'target_shop_lag_4',
 'target_lag_5',
 'target_item_lag_5',
 'target_shop_lag_5',
 'target_lag_12',
 'target_item_lag_12',
 'target_shop_lag_12']

In [14]:
# Standardise the lag features: the scaler is fitted on the training frame and
# the same fitted transform is then applied to the test frame.
scaler = StandardScaler().fit(all_data[fit_cols])
all_data[fit_cols] = scaler.transform(all_data[fit_cols])
all_data_test[fit_cols] = scaler.transform(all_data_test[fit_cols])

In [15]:
# Month index of every training row; the most recent month is held out below.
dates = all_data.loc[:, 'date_block_num']
last_block = dates.max()

In [16]:
# Time-based split: every month before `last_block` trains the models and
# `last_block` itself is the hold-out set.
is_train = dates < last_block
is_holdout = dates == last_block

dates_train = dates[is_train]
dates_test = dates[is_holdout]

X_train = all_data.loc[is_train].drop(to_drop_cols, axis=1)
y_train = all_data.loc[is_train, 'target'].values

X_test = all_data.loc[is_holdout].drop(to_drop_cols, axis=1)
y_test = all_data.loc[is_holdout, 'target'].values

# Features for the rows to be submitted; 'ID' is dropped as well.
X_result = all_data_test.drop(to_drop_cols + ['ID'], axis=1)

In [17]:
# Peek at the feature matrix that will be scored for the submission.
X_result.head(n=5)


Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_3,target_item_lag_3,...,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,5,5037,-0.09578,0.092492,-0.355884,0.195214,0.912071,-0.317708,0.762384,0.975496,...,0.186006,0.347587,-0.332976,0.186243,0.808325,-0.33195,0.23085,0.531337,0.112727,19
1,5,5320,-0.09578,-0.153814,-0.915961,-0.096476,-0.151573,-0.890777,-0.094673,-0.148697,...,-0.093083,-0.143973,-0.84034,-0.092191,-0.141887,-0.818117,-0.086764,-0.127325,-0.642861,55
2,5,5233,0.195486,0.25998,-0.355884,0.778593,0.621986,-0.317708,0.191013,1.268353,...,-0.093083,0.192837,-0.332976,0.464676,0.93502,-0.33195,-0.086764,-0.127325,-0.642861,19
3,5,5232,-0.09578,0.122048,-0.355884,-0.096476,0.312562,-0.317708,0.191013,0.465358,...,-0.093083,-0.143973,-0.84034,-0.092191,-0.141887,-0.818117,-0.086764,-0.127325,-0.642861,23
4,5,5268,-0.09578,-0.153814,-0.915961,-0.096476,-0.151573,-0.890777,-0.094673,-0.148697,...,-0.093083,-0.143973,-0.84034,-0.092191,-0.141887,-0.818117,-0.086764,-0.127325,-0.642861,20


In [18]:
# Level-1 model: ordinary linear regression on the training months.
lr = LinearRegression()
lr.fit(X_train.values, y_train)
pred_lr = lr.predict(X_test.values)            # hold-out month predictions
pred_lr_result = lr.predict(X_result.values)   # predictions for the submission rows
# The value printed is the root of the mean squared error (predictions clipped
# to [0, 20]), i.e. RMSE, so the label says RMSE rather than R-squared.
print('Test RMSE for linreg is %f' % np.sqrt(mean_squared_error(y_test, pred_lr.clip(0, 20))))

Test R-squared for linreg is 5.223397


In [19]:
# LightGBM training parameters, shared by every lgb.train call in this notebook.
lgb_params = dict(
    feature_fraction=0.75,
    metric='rmse',
    nthread=1,
    min_data_in_leaf=128,   # 2**7
    bagging_fraction=0.75,
    learning_rate=0.03,
    objective='mse',
    bagging_seed=128,       # 2**7
    num_leaves=128,         # 2**7
    bagging_freq=1,
    verbose=0,
)

In [20]:
# Level-1 model: LightGBM trained for 100 boosting rounds on the training months.
model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_test)                   # hold-out month predictions
pred_lgb_result = model.predict(X_result.values)   # predictions for the submission rows
# The value printed is the root of the mean squared error (predictions clipped
# to [0, 20]), i.e. RMSE, so the label says RMSE rather than R-squared.
print('Test RMSE for LightGBM is %f' % np.sqrt(mean_squared_error(y_test, pred_lgb.clip(0, 20))))

Test R-squared for LightGBM is 5.213277


In [21]:
# Hold-out meta-features: column 0 = linear regression, column 1 = LightGBM.
X_test_level2 = np.column_stack([pred_lr, pred_lgb])

In [22]:
# Submission meta-features, same column order as the hold-out meta-features.
X_result_level2 = np.column_stack([pred_lr_result, pred_lgb_result])

In [23]:
# Months 27..32 are the ones for which level-2 training rows are generated.
dates_train_level2 = dates_train[dates_train.isin(list(range(27, 33)))]

In [24]:
# Targets of the training rows that fall into months 27..32.
y_train_level2 = y_train[dates_train.isin(list(range(27, 33)))]

In [25]:
# One row per level-2 training target, two columns (one per level-1 model).
X_train_level2 = np.zeros((len(y_train_level2), 2))

In [26]:
# Build the level-2 training matrix month by month: for each month, fit both
# level-1 models on all earlier months and store their predictions for that
# month in the matching rows of `X_train_level2`.
for cur_block_num in [27, 28, 29, 30, 31, 32]:
    X_train_to = all_data.loc[dates < cur_block_num].drop(to_drop_cols, axis=1)
    X_test_to = all_data.loc[dates == cur_block_num].drop(to_drop_cols, axis=1)
    y_train_to = all_data.loc[dates < cur_block_num, 'target'].values

    # Per-fold estimators and prediction names keep the notebook-level `lr`,
    # `model`, `pred_lr` and `pred_lgb` intact, so earlier cells stay re-runnable.
    fold_lr = LinearRegression()
    fold_lr.fit(X_train_to.values, y_train_to)
    # Predict on .values as well, matching how the estimator was fitted.
    fold_pred_lr = fold_lr.predict(X_test_to.values)

    fold_lgb = lgb.train(lgb_params, lgb.Dataset(X_train_to, label=y_train_to), 100)
    fold_pred_lgb = fold_lgb.predict(X_test_to)

    # Column order (linear regression, LightGBM) matches `X_test_level2`.
    fold_rows = (dates_train_level2 == cur_block_num).values
    X_train_level2[fold_rows] = np.c_[fold_pred_lr, fold_pred_lgb]

In [27]:
# Stacking: fit a linear meta-model on the level-1 predictions and apply it to
# the level-2 training rows, the hold-out month and the submission rows.
lr.fit(X_train_level2, y_train_level2)
train_preds = lr.predict(X_train_level2)
test_preds = lr.predict(X_test_level2)
test_preds_result = lr.predict(X_result_level2)
# Both figures are RMSE of the stacked model (predictions clipped to [0, 20]):
# the first on the level-2 training rows, the second on the hold-out month.
print('Train RMSE for stacking is %f' % np.sqrt(mean_squared_error(y_train_level2, train_preds.clip(0, 20))))
print('Test RMSE for stacking is %f' % np.sqrt(mean_squared_error(y_test, test_preds.clip(0, 20))))

Test R-squared for LR is 3.844827
Test R-squared for LightGBM is 5.206881


In [28]:
# Show the stacked predictions after clipping them to the [0, 20] range.
print(np.clip(test_preds_result, 0, 20))

[0.51962674 0.16322984 0.94199138 ... 0.04097054 0.04558106 0.04111713]


In [29]:
# Final per-row predictions, clipped to the [0, 20] range.
release = np.clip(test_preds_result, 0, 20)

In [30]:
# Remove the helper month column again before building the submission.
tests = tests.drop(['date_block_num'], axis='columns')

In [31]:
# Check which columns the submission file is expected to have.
sample_submission.columns


Index([u'ID', u'item_cnt_month'], dtype='object')

In [32]:
# Attach the predictions positionally: `release` is a plain array, so row i of
# the predictions goes to row i of `tests`.
tests = tests.assign(item_cnt_month=release)

In [33]:
# Keep only the two columns that make up the submission.
my_submission = tests.loc[:, ['ID', 'item_cnt_month']]

In [34]:
# Write the submission without the DataFrame index column.
my_submission.to_csv(path_or_buf='./all/to_win_kaggle.csv', index=False)

In [35]:
# Expected submission columns, shown next to the generated ones for comparison.
sample_submission.columns

Index([u'ID', u'item_cnt_month'], dtype='object')

In [36]:
# Columns of the generated submission; compare with sample_submission.columns.
my_submission.columns

Index([u'ID', u'item_cnt_month'], dtype='object')

In [37]:
# First ten rows of the generated submission.
my_submission.head(n=10)

Unnamed: 0,ID,item_cnt_month
0,0,0.519627
1,1,0.16323
2,2,0.941991
3,3,0.2869
4,4,1.50913
5,5,0.470937
6,6,0.998705
7,7,0.186497
8,8,1.221333
9,9,0.357727


In [38]:
# Display the provided sample submission for a visual format comparison.
sample_submission

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
5,5,0.5
6,6,0.5
7,7,0.5
8,8,0.5
9,9,0.5
