In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
import lightgbm as lgb
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook
from itertools import product
import gc


In [11]:
def downcast_dtype(df):
    """Shrink 64-bit numeric columns of `df` to 32 bits.

    float64 columns are cast to float32 and int64 columns to int32; columns
    of any other dtype are left untouched. The frame is modified in place
    and is also returned.
    """
    narrower = (('float64', np.float32), ('int64', np.int32))
    for wide_name, small_type in narrower:
        wide_cols = [name for name in df if df[name].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(small_type)
    return df

In [12]:
# Read the raw competition tables; every path is relative to ./all
(sales, shops, items, item_cats, tests, sample_submission) = [
    pd.read_csv(csv_path)
    for csv_path in (
        './all/sales_train.csv.gz',
        './all/shops.csv',
        './all/items.csv',
        './all/item_categories.csv',
        './all/test.csv.gz',
        "./all/sample_submission.csv.gz",
    )
]

In [13]:
# Give the test rows month 34 so they carry the same date_block_num key as the
# training frames. all_data_test is the same DataFrame object as tests here.
all_data_test = tests
all_data_test['date_block_num'] = 34

In [14]:
# Category grouped
# Each entry is (first position, one-past-last position, grouped name); positions
# index the rows of item_cats in file order.
category_groups = [
    (0, 1, 'PC Headsets / Headphones'),
    (1, 8, 'Access'),
    (8, 9, 'Tickets (figure)'),
    (9, 10, 'Delivery of goods'),
    (10, 18, 'Consoles'),
    (18, 25, 'Consoles Games'),
    (25, 26, 'Accessories for games'),
    (26, 28, 'phone games'),
    (28, 32, 'CD games'),
    (32, 37, 'Card'),
    (37, 43, 'Movie'),
    (43, 55, 'Books'),
    (55, 61, 'Music'),
    (61, 73, 'Gifts'),
    (73, 79, 'Soft'),
    (79, 81, 'Office'),
    (81, 83, 'Clean'),
    (83, 84, 'Elements of a food'),
]

# Start from the original category names and overwrite them group by group
l_cat = list(item_cats.item_category_name)
for first, stop, group_name in category_groups:
    for pos in range(first, stop):
        l_cat[pos] = group_name

# Integer code of the grouped name, stored next to the original category row
le = LabelEncoder()
item_cats['item_cat_id_fix'] = le.fit_transform(l_cat)

In [15]:
#  init all_data
index_cols = ['shop_id','item_id','date_block_num']


def _monthly_total(group_cols, out_name):
    """Sum `item_cnt_day` of `sales` per `group_cols`, returned as a flat frame.

    The result has the grouping columns plus one column named `out_name`.
    Selecting the column and renaming it replaces the nested-dict form of
    `.agg()`, which pandas deprecated and later removed.
    """
    return (sales.groupby(group_cols,as_index = False)['item_cnt_day']
                 .sum()
                 .rename(columns = {'item_cnt_day': out_name}))


# For every month, pair each shop seen that month with each item seen that month
grid = []
for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num
    cur_shops = sales.loc[in_block,'shop_id'].unique()
    cur_items = sales.loc[in_block,'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops,cur_items,[block_num]])),dtype = 'int32'))
grid = pd.DataFrame(np.vstack(grid),columns = index_cols,dtype = np.int32)

# Merge the shop-item-month, shop-month and item-month totals onto both frames.
# Pairs without sales in a month get 0 through fillna.
all_data = grid
for group_cols, out_name in [
    (index_cols, 'target'),
    (['shop_id','date_block_num'], 'target_shop'),
    (['item_id','date_block_num'], 'target_item'),
]:
    gb = _monthly_total(group_cols, out_name)
    all_data = pd.merge(all_data,gb,on = group_cols,how = 'left').fillna(0)
    all_data_test = pd.merge(all_data_test,gb,on = group_cols,how = 'left').fillna(0)

all_data = downcast_dtype(all_data)
all_data_test = downcast_dtype(all_data_test)
del grid,gb

gc.collect()



  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


133

In [16]:
# Preview the test frame after the monthly totals were merged in
all_data_test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,target,target_shop,target_item
0,0,5,5037,34,0.0,0.0,0.0
1,1,5,5320,34,0.0,0.0,0.0
2,2,5,5233,34,0.0,0.0,0.0
3,3,5,5232,34,0.0,0.0,0.0
4,4,5,5268,34,0.0,0.0,0.0


In [17]:
# Every non-index column of all_data gets a lagged copy for each shift below
cols_to_rename = list(all_data.columns.difference(index_cols))
shift_range = [1,2,3,4,5,12]
for month_shift in tqdm_notebook(shift_range):
    # e.g. 'target' -> 'target_lag_3'; index columns keep their names
    lag_names = {name: '{}_lag_{}'.format(name,month_shift) for name in cols_to_rename}
    train_shift = all_data[index_cols + cols_to_rename].copy()
    # Moving a row month_shift blocks forward makes it join onto the later month
    train_shift['date_block_num']  = train_shift['date_block_num']  + month_shift
    train_shift = train_shift.rename(columns = lag_names)
    all_data = pd.merge(all_data,train_shift,on = index_cols,how = 'left').fillna(0)
    all_data_test = pd.merge(all_data_test,train_shift,on = index_cols,how = 'left').fillna(0)
del train_shift
# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [18]:
# Preview the test frame after the lagged columns were merged in
all_data_test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,...,target_shop_lag_3,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12
0,0,5,5037,34,0.0,0.0,0.0,0.0,25.0,1052.0,...,1294.0,1.0,54.0,991.0,1.0,105.0,954.0,1.0,65.0,1445.0
1,1,5,5320,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,34,0.0,0.0,0.0,1.0,42.0,1052.0,...,1294.0,0.0,37.0,991.0,2.0,119.0,954.0,0.0,0.0,0.0
3,3,5,5232,34,0.0,0.0,0.0,0.0,28.0,1052.0,...,1294.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,5,5268,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#  list of all lagged features
# Match the full '_lag_<shift>' suffix. Testing only the last character would
# miss shifts such as 10 and would accept any column that merely ends in a digit.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

In [20]:
# Columns that are neither lagged features nor index columns, listed in frame
# order so the result is the same on every run, plus date_block_num.
_kept_cols = set(fit_cols) | set(index_cols)
to_drop_cols = [col for col in all_data.columns if col not in _kept_cols] + ['date_block_num']

In [22]:
# One row per distinct (item_id, item_category_id) pair
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

# Attach the grouped category while item_category_id still holds the raw id that
# item_cats is keyed on; joining after label-encoding is only right when the
# encoding happens to be the identity.
item_category_mapping = pd.merge(item_category_mapping,item_cats[['item_cat_id_fix', 'item_category_id']],how = 'left',on = 'item_category_id')

le = LabelEncoder()
le.fit(items['item_category_id'])
# Encode the de-duplicated column itself so the lengths match even if
# drop_duplicates removed rows.
item_category_mapping['item_category_id'] = le.transform(item_category_mapping['item_category_id'])

# Adds item_category_id and item_cat_id_fix, in that order, to both frames
all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtype(all_data)

all_data_test = pd.merge(all_data_test, item_category_mapping, how='left', on='item_id')
all_data_test = downcast_dtype(all_data_test)
gc.collect();

In [23]:
# Preview the test frame after the category columns were merged in
all_data_test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,...,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_cat_id_fix
0,0,5,5037,34,0.0,0.0,0.0,0.0,25.0,1052.0,...,54.0,991.0,1.0,105.0,954.0,1.0,65.0,1445.0,19,7
1,1,5,5320,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55,12
2,2,5,5233,34,0.0,0.0,0.0,1.0,42.0,1052.0,...,37.0,991.0,2.0,119.0,954.0,0.0,0.0,0.0,19,7
3,3,5,5232,34,0.0,0.0,0.0,0.0,28.0,1052.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,7
4,4,5,5268,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20,7


In [51]:
# Mean Encoding
def add_noise(series, noise_level):
    """Multiply each value of `series` by (1 + noise_level * z), z drawn from np.random.randn."""
    jitter = np.random.randn(len(series))
    return series * (1 + noise_level * jitter)

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed mean-target encoding of a categorical feature.

    Smoothing follows the paper by Daniele Micci-Barreca:
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series, same length as trn_series
    min_samples_leaf (int) : category count at which the blend weight is 0.5
    smoothing (int) : scale of the sigmoid balancing category mean vs prior
    noise_level : multiplicative noise strength passed to add_noise

    Returns a (train, test) pair of Series named '<feature>_mean' that keep the
    index of their input; categories unseen in training receive the prior.
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    key = trn_series.name
    # Global target mean, used as the prior and as the fallback value
    prior = target.mean()
    # Per-category mean and count of the target
    stats = pd.concat([trn_series, target], axis=1).groupby(by=key)[target.name].agg(["mean", "count"])
    # Sigmoid weight: the larger the count, the more the category mean is trusted
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    lookup = (stats.drop(["mean", "count"], axis=1)
                   .reset_index()
                   .rename(columns={'index': target.name, target.name: 'average'}))

    def encode(series):
        # Left-join the lookup onto the feature values
        joined = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = joined['average'].rename(key + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    return add_noise(encode(trn_series), noise_level), add_noise(encode(tst_series), noise_level)

In [52]:
# mean target encoding
# NOTE(review): the encodings are computed from every row of all_data, including
# the rows later held out as the last block -- confirm this leakage is acceptable.
mean_target_cols = ['item_id','shop_id','item_category_id','item_cat_id_fix']
for col in mean_target_cols:
    encoded_pair = target_encode(all_data[col],all_data_test[col],all_data['target'])
    # Append '<col>_mean' as a new trailing column of the train and test frames
    all_data, all_data_test = (
        pd.concat([frame,encoded],axis = 1)
        for frame, encoded in zip((all_data, all_data_test), encoded_pair)
    )

In [53]:
# Show the lagged feature columns collected earlier
fit_cols

['target_lag_1',
 'target_item_lag_1',
 'target_shop_lag_1',
 'target_lag_2',
 'target_item_lag_2',
 'target_shop_lag_2',
 'target_lag_3',
 'target_item_lag_3',
 'target_shop_lag_3',
 'target_lag_4',
 'target_item_lag_4',
 'target_shop_lag_4',
 'target_lag_5',
 'target_item_lag_5',
 'target_shop_lag_5',
 'target_lag_12',
 'target_item_lag_12',
 'target_shop_lag_12']

In [54]:
# Standardize the lagged features: the scaler is fitted on all_data and the same
# fitted scaler is applied to both the train and the test frame.
scaler = StandardScaler().fit(all_data[fit_cols])
for frame in (all_data, all_data_test):
    frame[fit_cols] = scaler.transform(frame[fit_cols])

In [55]:
# Most recent month present in all_data, and the month index of every row
last_block = all_data['date_block_num'].max()
dates = all_data['date_block_num']

In [56]:
# Hold out the most recent month: rows before last_block train the models,
# rows in last_block are used to score them.
is_train = dates < last_block
is_holdout = dates == last_block

dates_train, dates_test = dates[is_train], dates[is_holdout]
X_train = all_data.loc[is_train].drop(to_drop_cols,axis = 1)
X_test = all_data.loc[is_holdout].drop(to_drop_cols,axis = 1)
y_train = all_data.loc[is_train,'target'].values
y_test = all_data.loc[is_holdout,'target'].values

# Features of the rows to predict for the submission
X_result = all_data_test.drop(to_drop_cols + ['ID'],axis = 1)

In [57]:
# Sanity check: dates_train is a pandas Series (it is used for .isin masks below)
type(dates_train)

pandas.core.series.Series

In [58]:
# Preview the feature matrix used for the submission predictions
X_result.head()


Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_3,target_item_lag_3,...,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id,item_cat_id_fix,item_id_mean,shop_id_mean,item_category_id_mean,item_cat_id_fix_mean
0,5,5037,-0.09578,0.092492,-0.355884,0.195214,0.912071,-0.317708,0.762384,0.975496,...,-0.33195,0.23085,0.531337,0.112727,19,7,1.950845,0.207204,0.600482,0.701291
1,5,5320,-0.09578,-0.153814,-0.915961,-0.096476,-0.151573,-0.890777,-0.094673,-0.148697,...,-0.818117,-0.086764,-0.127325,-0.642861,55,12,0.324582,0.207204,0.224931,0.171937
2,5,5233,0.195486,0.25998,-0.355884,0.778593,0.621986,-0.317708,0.191013,1.268353,...,-0.33195,-0.086764,-0.127325,-0.642861,19,7,1.656863,0.207204,0.600482,0.701291
3,5,5232,-0.09578,0.122048,-0.355884,-0.096476,0.312562,-0.317708,0.191013,0.465358,...,-0.818117,-0.086764,-0.127325,-0.642861,23,7,1.093023,0.207204,0.61083,0.701291
4,5,5268,-0.09578,-0.153814,-0.915961,-0.096476,-0.151573,-0.890777,-0.094673,-0.148697,...,-0.818117,-0.086764,-0.127325,-0.642861,20,7,0.324582,0.207204,1.837239,0.701291


In [59]:
# First-level model 1: plain linear regression on the raw feature arrays
lr = LinearRegression()
lr.fit(X_train.values,y_train)
pred_lr = lr.predict(X_test.values)
pred_lr_result = lr.predict(X_result.values)
# The value printed is sqrt(MSE), i.e. RMSE, so the message says RMSE.
# NOTE(review): predictions are clipped to [0, 20] but y_test is not -- confirm intended.
print('Test RMSE for linreg is %f' % np.sqrt(mean_squared_error(y_test,pred_lr.clip(0,20))))

Test R-squared for linreg is 5.208394


In [60]:
# LightGBM training parameters shared by every lgb.train call in this notebook
lgb_params = dict(
    feature_fraction=0.75,
    metric='rmse',
    nthread=1,
    min_data_in_leaf=2**7,
    bagging_fraction=0.75,
    learning_rate=0.03,
    objective='mse',
    bagging_seed=2**7,
    num_leaves=2**7,
    bagging_freq=1,
    verbose=0,
)

In [61]:
# First-level model 2: LightGBM trained for 100 boosting rounds
model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_test)
pred_lgb_result = model.predict(X_result.values)
# The value printed is sqrt(MSE), i.e. RMSE, so the message says RMSE.
print('Test RMSE for LightGBM is %f' % np.sqrt(mean_squared_error(y_test,pred_lgb.clip(0,20))))

Test R-squared for LightGBM is 5.188480


In [62]:
# Hold-out meta-features: column 0 = linear regression, column 1 = LightGBM
X_test_level2 = np.column_stack((pred_lr, pred_lgb))

In [63]:
# Submission meta-features, same column order as X_test_level2
X_result_level2 = np.column_stack((pred_lr_result, pred_lgb_result))

In [64]:
# Month index of the training rows that fall in months 27..32
level2_rows = dates_train.isin(list(range(27, 33)))
dates_train_level2 = dates_train[level2_rows]

In [65]:
# Targets of the training rows that fall in months 27..32
y_train_level2 = y_train[dates_train.isin(list(range(27, 33))).values]

In [66]:
# Zero-filled holder for the two meta-features of every level-2 training row
X_train_level2 = np.zeros((len(y_train_level2), 2))

In [67]:
for cur_block_num in [27,28,29,30,31,32]:
    # Walk-forward scheme: train both first-level models on every month before
    # cur_block_num, predict cur_block_num, and store the two predictions as the
    # meta-features of that month's rows (same column order as X_test_level2).
    before_block = dates < cur_block_num
    in_block = dates == cur_block_num
    X_train_to = all_data.loc[before_block].drop(to_drop_cols,axis = 1)
    X_test_to =  all_data.loc[in_block].drop(to_drop_cols,axis = 1)
    y_train_to = all_data.loc[before_block,'target'].values

    # Fold-local names keep the first-level lr / model / pred_lr / pred_lgb intact,
    # so re-running the cells that build X_test_level2 still gives the right values.
    fold_lr = LinearRegression()
    fold_lr.fit(X_train_to.values,y_train_to)
    # Predict on the raw array, matching how the model was fitted
    fold_pred_lr = fold_lr.predict(X_test_to.values)

    fold_model = lgb.train(lgb_params,lgb.Dataset(X_train_to,label = y_train_to),100)
    fold_pred_lgb = fold_model.predict(X_test_to)

    # dates_train_level2 marks which level-2 rows belong to this month
    X_train_level2[(dates_train_level2 == cur_block_num).values] = np.c_[fold_pred_lr, fold_pred_lgb]

In [68]:
# Stacking
# Second-level model: linear regression on the two first-level predictions.
# This refits `lr`, so from here on it is the meta-model.
lr.fit(X_train_level2,y_train_level2)
train_preds = lr.predict(X_train_level2)
test_preds = lr.predict(X_test_level2)
test_preds_result = lr.predict(X_result_level2)
# Both numbers are RMSE of the stacked model: first on the level-2 training
# rows, then on the held-out last month.
print('Train RMSE for stacking is %f' % np.sqrt(mean_squared_error(y_train_level2,train_preds.clip(0,20))))
print('Test RMSE for stacking is %f' % np.sqrt(mean_squared_error(y_test,test_preds.clip(0,20))))

Test R-squared for LR is 3.787661
Test R-squared for LightGBM is 5.182799


In [70]:
# Clip the stacked predictions to [0, 20], attach them to the test rows and
# write the two submission columns to disk.
release = test_preds_result.clip(0,20)
tests['item_cnt_month'] = release
submission_cols = ['ID','item_cnt_month']
my_submission = tests[submission_cols]
my_submission.to_csv('./all/to_win_kaggle.csv',index=False)

In [72]:
# Confirm the submission frame holds exactly the ID and prediction columns
my_submission.columns

Index(['ID', 'item_cnt_month'], dtype='object')