<a href="https://colab.research.google.com/github/svgkat/hse/blob/master/predict_future_sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import os
import warnings

from itertools import product
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score
import gc 
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows',600)
pd.set_option('display.max_columns',50)
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

def downcast_dtypes(df):
    '''
        Shrinks the numeric columns of `df` in place to save memory and
        returns the same dataframe.

                `float64` columns -> `float16`, or `float32`/`float64` when a
                                     finite value would overflow `float16`
                `int64`   columns -> `int16`, or `int32`/`int64` when a value
                                     does not fit in `int16`
                `shop_id`, `date_block_num`, `item_category_id`
                                  -> `int8`, or the next wider integer type
                                     when a value does not fit in `int8`

        Columns of any other dtype are left untouched. The narrow type is
        only used when every value fits its range, so out-of-range values
        are never silently wrapped around (integers) or turned into `inf`
        (floats). Float values are still rounded to the precision of the
        chosen float type.
    '''

    def _narrowest(values, candidates, limits):
        # Return the first dtype in `candidates` whose range holds all `values`.
        if values.size == 0:
            # Nothing to range-check: the narrowest type is always safe.
            return candidates[0]
        lo, hi = values.min(), values.max()
        for dtype in candidates:
            bounds = limits(dtype)
            if bounds.min <= lo and hi <= bounds.max:
                return dtype
        return candidates[-1]

    # Select columns to downcast
    key_cols = [col for col in df.columns if col in ['shop_id','date_block_num','item_category_id']]
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype == "int64" and c not in key_cols]

    # Downcast
    for col in float_cols:
        values = df[col].values
        # NaN and +/-inf are representable in every float type, so only the
        # finite values decide which type is wide enough.
        finite = values[np.isfinite(values)]
        df[col] = df[col].astype(_narrowest(finite, (np.float16, np.float32, np.float64), np.finfo))
    for col in int_cols:
        df[col] = df[col].astype(_narrowest(df[col].values, (np.int16, np.int32, np.int64), np.iinfo))
    for col in key_cols:
        df[col] = df[col].astype(_narrowest(df[col].values, (np.int8, np.int16, np.int32, np.int64), np.iinfo))

    return df

# Kaggle API credentials are taken from the environment, or prompted for when
# they are not set, instead of being hard-coded: a key saved in a notebook is
# published together with it.
# NOTE(review): the key that used to be committed here should be revoked and
# regenerated on Kaggle.
import getpass

for _cred_var in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    if not os.environ.get(_cred_var):
        # getpass does not echo what is typed, so the value stays out of the cell output.
        os.environ[_cred_var] = getpass.getpass('Enter {}: '.format(_cred_var))

In [76]:
# Read each competition table and shrink its dtypes straight away.
sales = downcast_dtypes(pd.read_csv('/content/predict_future_sales/sales_train.csv.gz'))
shops = downcast_dtypes(pd.read_csv('/content/predict_future_sales/shops.csv'))
items = downcast_dtypes(pd.read_csv('/content/predict_future_sales/items.csv'))
item_cats = downcast_dtypes(pd.read_csv('/content/predict_future_sales/item_categories.csv'))
test = downcast_dtypes(pd.read_csv('/content/predict_future_sales/test.csv.gz'))

# The test rows lose their ID column and are tagged with the block number
# that follows the last block found in the sales history.
test = test.drop(columns='ID')
test['date_block_num'] = sales.date_block_num.max() + 1

# Adding month and year features: one row per date_block_num.
# pd.to_datetime parses the whole column in one call (instead of calling
# strptime once per row) and can be re-run on an already parsed column.
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')
sales['month'] = sales['date'].dt.month
sales['year'] = sales['date'].dt.year
sales_month_year = (
    sales[['date_block_num', 'month', 'year']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Add the row for the month being predicted: the calendar month after the
# latest sale, numbered one block past the last block in the sales data
# (the same block number the test rows are given).
test_month_start = sales['date'].max() + pd.offsets.MonthBegin(1)
test_month_row = pd.DataFrame({
    'date_block_num': [int(sales['date_block_num'].max()) + 1],
    'month': [test_month_start.month],
    'year': [test_month_start.year],
})
sales_month_year = pd.concat([sales_month_year, test_month_row], ignore_index=True)

# One indicator column per year, then shrink the dtypes.
sales_month_year = pd.get_dummies(sales_month_year, columns=['year'])
sales_month_year = downcast_dtypes(sales_month_year)

# Key columns that identify one row of the grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# One array per month, pairing every shop seen that month with every item seen that month
monthly_grids = []
for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num
    shops_in_block = sales.loc[in_block, 'shop_id'].unique()
    items_in_block = sales.loc[in_block, 'item_id'].unique()
    monthly_grids.append(
        np.array(list(product(shops_in_block, items_in_block, [block_num])), dtype='int32')
    )

# Stack the monthly arrays into a single dataframe
grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)
del monthly_grids, in_block

# Append the test rows so that they go through the same
# feature engineering process
grid = pd.concat([grid, test])

def _add_monthly_sum(frame, source, keys, name):
    '''
        Left-joins onto `frame` a column `name` holding the sum of
        `source.item_cnt_day` within each group of `keys`, then fills every
        missing value of the joined frame with 0.

        The sum is computed with a plain column selection rather than the
        nested-dict form of `.agg`, which recent pandas versions reject.
    '''
    sums = source.groupby(keys)['item_cnt_day'].sum().rename(name).reset_index()
    return pd.merge(frame, sums, how='left', on=keys).fillna(0)

# Shop-item-month, shop-month and item-month sales totals joined to the grid
all_data = _add_monthly_sum(grid, sales, index_cols, 'target')
all_data = _add_monthly_sum(all_data, sales, ['shop_id', 'date_block_num'], 'target_shop')
all_data = _add_monthly_sum(all_data, sales, ['item_id', 'date_block_num'], 'target_item')

# Downcast dtypes to save memory
all_data = downcast_dtypes(all_data)
del grid
gc.collect()

## Creating return features ##
# Rows with a negative daily count are the ones used as returns.
sales_ret = sales[sales.item_cnt_day < 0]

## return numbers for shop-month, item-month and shop-item-month
all_data = _add_monthly_sum(all_data, sales_ret, ['shop_id', 'date_block_num'], 'trg_ret_shop_per_mth')
all_data = _add_monthly_sum(all_data, sales_ret, ['item_id', 'date_block_num'], 'trg_ret_item_mth')
all_data = _add_monthly_sum(all_data, sales_ret, index_cols, 'trg_ret_shop_item_mth')

del sales_ret
gc.collect()

0

In [77]:
# Every column except the keys gets lagged copies
cols_to_rename = list(all_data.columns.difference(index_cols))

# Months to look back (an earlier variant used [1, 2, 3, 4, 5, 12])
shift_range = [1,2,3,6,9,12]
for month_shift in shift_range:
    print('Processing:',month_shift)
    lagged = all_data[index_cols + cols_to_rename].copy()
    # Moving the block number forward lines month t-k up with month t in the join
    lagged['date_block_num'] = lagged['date_block_num'] + month_shift
    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    lagged = lagged.rename(columns=lag_names)
    all_data = pd.merge(all_data, lagged, on=index_cols, how='left').fillna(0)

del lagged
gc.collect()

Processing: 1
Processing: 2
Processing: 3
Processing: 6
Processing: 9
Processing: 12


22

In [0]:
# The longest lag looks max(shift_range) months back, so earlier months have
# no real value for it (only the 0 fill); keep the months from that point on.
all_data = all_data[all_data['date_block_num'] >= max(shift_range)]

In [81]:
# List of all lagged features. A column counts as lagged when its name ends
# with a full '_lag_<shift>' suffix; checking only the last character would
# recognise a two-digit shift such as 12 only when its final digit happens to
# be a shift of its own.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]
# We will drop these at fitting stage: everything that is neither a lagged
# feature nor a key column, plus date_block_num itself.
to_drop_cols = list(set(all_data.columns) - (set(fit_cols) | set(index_cols))) + ['date_block_num']

print('The number of items in the dataset:',items.item_name.count())

The number of items in the dataset: 22170


In [82]:
## 22170 is a huge number to vectorize. I am taking a conscious decision to
## not vectorize the item names.

## Note: Having more features is crashing the coursera runtime/colab runtime
## so selecting only the top 5 features using the max_features parameter

vectorizer_item_cat = TfidfVectorizer(min_df=3,max_features=5)
X = vectorizer_item_cat.fit_transform(item_cats.item_category_name)
# Terms ordered by their column index, read from the fitted vocabulary
# (get_feature_names() is not available in recent scikit-learn releases).
item_cat_terms = sorted(vectorizer_item_cat.vocabulary_, key=vectorizer_item_cat.vocabulary_.get)
print('Count of features:',len(item_cat_terms))
print(item_cat_terms)

# One tf-idf column per term, attached to the category table
df_item_cat_feat = pd.DataFrame(X.toarray())
for ind in range(len(item_cat_terms)):
    item_cats['tfidf_feat_itemcat_'+str(ind)] = df_item_cat_feat[ind]
del vectorizer_item_cat, X, df_item_cat_feat
gc.collect()

#merge the items and item_cats data frame so as to join with all_data
item_item_cats = pd.merge(items.drop(columns=['item_name']),item_cats.drop(columns=['item_category_name']),how='left',on='item_category_id')
item_item_cats = downcast_dtypes(item_item_cats)

all_data = pd.merge(all_data,item_item_cats,how='left',on='item_id')
all_data = downcast_dtypes(all_data)
del item_item_cats
gc.collect()


Count of features: 5
['игры', 'книги', 'консоли', 'подарки', 'цифра']


0

In [83]:
##generate tfidf features from the shop names and add them to all_data
vectorizer = TfidfVectorizer(min_df=2,max_features=5)
X = vectorizer.fit_transform(shops.shop_name)
# Terms ordered by their column index, read from the fitted vocabulary
# (get_feature_names() is not available in recent scikit-learn releases).
shop_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
print('Count of features:',len(shop_terms))
print(shop_terms)

# One tf-idf column per term, attached to the shop table
df_shop_features = pd.DataFrame(X.toarray())
for ind in range(len(shop_terms)):
    shops['tfidf_feat_shop_'+str(ind)] = df_shop_features[ind]

## Merge the shop features to all_data
all_data = pd.merge(all_data,shops.drop(columns=['shop_name']),how='left',on='shop_id')
all_data = downcast_dtypes(all_data)
del vectorizer,df_shop_features, X
gc.collect()


Count of features: 5
['мега', 'москва', 'тк', 'трц', 'тц']


0

In [0]:
## item_category_id joins the columns that are removed before fitting,
## so that it is not part of the training
to_drop_cols = [*to_drop_cols, 'item_category_id']

In [0]:
## Attach the month and year columns of sales_month_year to every row
all_data = all_data.merge(sales_month_year, on='date_block_num', how='left')


In [0]:
##Prepping the data for training
dates = all_data['date_block_num']
# The test rows carry the highest block number; taking the maximum does not
# depend on those rows being the last ones in all_data.
test_block = dates.max()

# Row masks, computed once and reused for the features, targets and dates
is_train = dates < test_block
is_test = dates == test_block

dates_train = dates[is_train]
dates_test = dates[is_test]

X_train = all_data.loc[is_train].drop(to_drop_cols, axis=1)
X_test = all_data.loc[is_test].drop(to_drop_cols, axis=1)

y_train = all_data.loc[is_train, 'target'].values

In [87]:
# The raw sales table is not referenced by any later cell; drop the name and
# run a garbage-collection pass so its memory can be reclaimed.
del sales
gc.collect()

308

In [88]:
#First level models
# Linear regression fitted on all training months; fit() returns the
# estimator, which is shown as the cell output.
lin_reg = LinearRegression().fit(X_train.values, y_train)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
# Linear-model predictions for the test month, limited to the range [0, 20]
pred_lr = np.clip(lin_reg.predict(X_test.values), 0, 20)

In [0]:
# LightGBM settings shared by the final model and the per-month models
lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'learning_rate': 0.03,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**7,
    'bagging_freq': 1,
    'verbose': 0,
}

# 100 boosting rounds on the full training window
train_set = lgb.Dataset(X_train, label=y_train)
model = lgb.train(lgb_params, train_set, num_boost_round=100)

In [0]:
## Predictions are limited to the range (0,20), as observed in the kernel
## submitted by Denis Larionov
pred_lgb = np.clip(model.predict(X_test), 0, 20)


In [0]:
##test meta features: linear-model predictions first, LightGBM predictions second
X_test_level2 = np.column_stack((pred_lr, pred_lgb))

In [0]:
##Train meta features
# Training rows belonging to the months used for the second level
in_level2 = dates_train.isin([ 28, 29, 30, 31, 32, 33])
dates_train_level2 = dates_train[in_level2]

# Targets of the 2nd level dataset: the true values of those same rows
y_train_level2 = y_train[in_level2]

# One row per 2nd level sample, one column per first-level model
X_train_level2 = np.zeros((y_train_level2.shape[0], 2))

In [96]:
# Now fill `X_train_level2` with metafeatures: for each month, fit both
# first-level models on all earlier months and predict that month.
for cur_block_num in [ 28, 29, 30, 31, 32, 33]:

    print(cur_block_num)

    # Training part: every row dated before the current month
    train_rows = dates < cur_block_num
    X_train_cur_block_num = all_data.loc[train_rows].drop(to_drop_cols, axis=1)
    y_train_cur_block_num = all_data.loc[train_rows, 'target'].values

    # Prediction part: the rows of the current month
    X_test_cur_block_num = all_data.loc[dates == cur_block_num].drop(to_drop_cols, axis=1)

    lrg = LinearRegression()
    lrg.fit(X_train_cur_block_num.values, y_train_cur_block_num)
    pred_lr_cur_block_num = lrg.predict(X_test_cur_block_num.values).clip(0,20)

    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train_cur_block_num, label=y_train_cur_block_num), 100)
    pred_lgb_cur_block_num = lgb_model.predict(X_test_cur_block_num).clip(0,20)

    # Same column order as `X_test_level2`: linear model first, LightGBM second
    X_train_level2[dates_train_level2.isin([cur_block_num])] = np.c_[pred_lr_cur_block_num, pred_lgb_cur_block_num]

28
29
30
31
32
33


In [0]:
# Second-level targets and meta-features as float64 copies
y_train_level2 = np.asarray(y_train_level2).astype(np.float64)
X_train_level2 = np.asarray(X_train_level2).astype(np.float64)

In [0]:
# Candidate weights for the linear model; LightGBM gets the complement 1 - alpha
alphas_to_try = np.linspace(0, 1, 1000)

r2_by_alpha = []
for alpha in alphas_to_try:
    mixed_preds = np.dot(X_train_level2, [alpha, 1 - alpha])
    r2_by_alpha.append(r2_score(y_train_level2, mixed_preds))
r2_scores = np.array(r2_by_alpha)

In [100]:
# Position of the highest r2 score in the alpha grid
best_index = r2_scores.argmax()
print(best_index)
best_alpha = alphas_to_try[best_index]
r2_train_simple_mix = r2_scores[best_index]
print('Best alpha: %f; Corresponding r2 score on train: %f' % (best_alpha, r2_train_simple_mix))

142
Best alpha: 0.142142; Corresponding r2 score on train: 0.083854


In [0]:
# Blend the two first-level test predictions with the selected weight, limited to [0, 20]
blend_weights = [best_alpha, (1 - best_alpha)]
test_preds = np.clip(np.dot(X_test_level2, blend_weights), 0, 20)

In [0]:
# Add the blended predictions and an ID taken from the dataframe index,
# then write those two columns as the submission file.
test = test.assign(item_cnt_month=test_preds, ID=test.index.to_list())
test.loc[:, ['ID', 'item_cnt_month']].to_csv("submission.csv", index=False)

In [104]:
# Shell command: submit submission.csv to the competition with the kaggle command-line tool.
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submission.csv -m "convex mix with neg return items"

100% 5.35M/5.35M [00:00<00:00, 15.4MB/s]
Successfully submitted to Predict Future Sales