In [1]:
# process data from item sales prediction
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
DATA_FOLDER = './data/'

transactions    = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv'))
items           = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
item_categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

In [3]:
print(transactions.head())
print(items.head())
print(item_categories.head())
print(shops.head())

         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0
                                           item_name  item_id  \
0          ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0   
1  !ABBYY FineReader 12 Professional Edition Full...        1   
2      ***В ЛУЧАХ СЛАВЫ   (UNV)                    D        2   
3    ***ГОЛУБАЯ ВОЛНА  (Univ)                      D        3   
4        ***КОРОБКА (СТЕКЛО)                       D        4   

   item_category_id  
0                40  
1                76  
2                40  
3                40  
4                40  
        item_category_name  item_

In [4]:
# extract date format to day, month, year format

transactions[['day','month','year']] = transactions.date.str.split(".", expand=True)
df = transactions.copy()
df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year
0,02.01.2013,0,59,22154,999.0,1.0,2,1,2013
1,03.01.2013,0,25,2552,899.0,1.0,3,1,2013
2,05.01.2013,0,25,2552,899.0,-1.0,5,1,2013
3,06.01.2013,0,25,2554,1709.05,1.0,6,1,2013
4,15.01.2013,0,25,2555,1099.0,1.0,15,1,2013


In [5]:
# getting monthly revenue data per shop and item
df['revenue'] = df['item_cnt_day']*df['item_price']
df2 = df.groupby(['shop_id','item_id', 'date_block_num'])
df['shopItemMonthlyRevenue'] = df2['revenue'].transform('sum')
df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year,revenue,shopItemMonthlyRevenue
0,02.01.2013,0,59,22154,999.0,1.0,2,1,2013,999.0,999.0
1,03.01.2013,0,25,2552,899.0,1.0,3,1,2013,899.0,0.0
2,05.01.2013,0,25,2552,899.0,-1.0,5,1,2013,-899.0,0.0
3,06.01.2013,0,25,2554,1709.05,1.0,6,1,2013,1709.05,1709.05
4,15.01.2013,0,25,2555,1099.0,1.0,15,1,2013,1099.0,1099.0


In [6]:
df2 = df.copy()
df2.drop(['day','item_cnt_day','revenue','date'],axis=1, inplace=True)
df2.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,month,year,shopItemMonthlyRevenue
0,0,59,22154,999.0,1,2013,999.0
1,0,25,2552,899.0,1,2013,0.0
2,0,25,2552,899.0,1,2013,0.0
3,0,25,2554,1709.05,1,2013,1709.05
4,0,25,2555,1099.0,1,2013,1099.0


In [16]:
#split to test and training partitions
split = train_test_split(df2, test_size=0.2)
df_tr = split[0]
df_test = split[1]
print(len(df_tr))
print(len(df_test))
print(df_tr.head())
print(df_test.head())


2348679
587170
         date_block_num  shop_id  item_id  item_price month  year  \
888390                8       31    19460       299.0    09  2013   
1142120              11       25     2931        99.0    12  2013   
1458174              14       51    12714        72.0    03  2014   
700105                6       54    11861        58.0    07  2013   
2220877              23       51    16790      1804.0    12  2014   

         shopItemMonthlyRevenue  
888390                    299.0  
1142120                   396.0  
1458174                    72.0  
700105                    174.0  
2220877                 10613.3  
         date_block_num  shop_id  item_id  item_price month  year  \
2456552              25        5    14019       149.0    02  2015   
2254252              23       28     6497       799.0    12  2014   
460501                4       25     7402       199.0    05  2013   
444781                4       27     4163      1590.0    05  2013   
81286                

In [17]:
# mean encoding via expanding mean
columns = ['item_price']#df2.columns[:-1]
target = ['shopItemMonthlyRevenue']#df2.columns[-1]

for col in columns:
    cumsum = df_tr.groupby(col)[target].cumsum() - df_tr[target]    
    cumcnt = df_tr.groupby(col)[target].cumcount()
    df_tr[col + '_mean_enc'] = (cumsum/(cumcnt+0.01))
print(df_tr.head())

MemoryError: Unable to allocate array with shape (2348679, 2348679) and data type float64

In [27]:

# # Mean encoding for the features
# # use k-fold cross validation for regularization
# # this approach is the update approach since 0.18
# y_target = df2.shopItemMonthlyRevenue.values
# skf = StratifiedKFold(n_splits=5)
# for tr_ind, val_ind in skf.split(np.zeros(len(y_target)), y_target):
#     train = df2.iloc[tr_ind]
#     val  = df2.iloc[val_ind]
    



In [28]:
# feature normalization - for non-tree models
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
df3 = df2.drop_duplicates().copy()
df3.head()
scaler = StandardScaler()
df3[['item_price','shop_id','month', 'year','item_id']] = scaler.fit_transform(df3[['item_price','shop_id','month', 'year','item_id']])
df3.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,month,year,shopItemMonthlyRevenue
0,0,1.59392,1.866115,0.089007,-1.489505,-1.028017,999.0
1,0,-0.470441,-1.272997,0.030918,-1.489505,-1.028017,0.0
3,0,-0.470441,-1.272677,0.501469,-1.489505,-1.028017,1709.05
4,0,-0.470441,-1.272517,0.147096,-1.489505,-1.028017,1099.0
5,0,-0.470441,-1.271076,-0.288572,-1.489505,-1.028017,349.0


In [37]:
# fill zeros for all nan values
df3.fillna(0.01)
# mean encoding for column using expanding mean feature
cumsum = df3.groupby('item_id').shopItemMonthlyRevenue.cumsum() - df3.shopItemMonthlyRevenue
cumcnt = df3.groupby('item_id').cumcount()
train_new = df3.copy()
train_new['item_id'+'_mean_target'] = cumsum/(cumcnt+0.01)
print(train_new)

         date_block_num   shop_id   item_id  item_price     month      year  \
0                     0  1.593920  1.866115    0.089007 -1.489505 -1.028017   
1                     0 -0.470441 -1.272997    0.030918 -1.489505 -1.028017   
3                     0 -0.470441 -1.272677    0.501469 -1.489505 -1.028017   
4                     0 -0.470441 -1.272517    0.147096 -1.489505 -1.028017   
5                     0 -0.470441 -1.271076   -0.288572 -1.489505 -1.028017   
...                 ...       ...       ...         ...       ...       ...   
2935840              33 -0.470441 -0.495185   -0.317617  1.110592  1.544750   
2935841              33 -0.470441 -0.497748   -0.288572  1.110592  1.544750   
2935842              33 -0.470441 -0.499189   -0.056216  1.110592  1.544750   
2935846              33 -0.470441 -0.487178   -0.288572  1.110592  1.544750   
2935847              33 -0.470441 -0.490221   -0.317617  1.110592  1.544750   

         shopItemMonthlyRevenue  item_id_mean_targe

In [40]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
# get data
# df4 = df2.drop_duplicates().copy()
df4 = df3.copy()
print(df4.head())
x_all = df4[['shop_id','item_id','item_price','month','year']].values
y_all = df4['shopItemMonthlyRevenue'].values
print(x_all.shape)
print(y_all.shape)

x_train, x_test, y_train, y_test = train_test_split(x_all,y_all,test_size=0.1)
# get model
clf = GradientBoostingRegressor(loss='lad',n_estimators=5000, learning_rate=0.1, max_depth = 3)
clf.fit(x_test, y_test)
clf.score(x_test, y_test)


   date_block_num   shop_id   item_id  item_price     month      year  \
0               0  1.593920  1.866115    0.089007 -1.489505 -1.028017   
1               0 -0.470441 -1.272997    0.030918 -1.489505 -1.028017   
3               0 -0.470441 -1.272677    0.501469 -1.489505 -1.028017   
4               0 -0.470441 -1.272517    0.147096 -1.489505 -1.028017   
5               0 -0.470441 -1.271076   -0.288572 -1.489505 -1.028017   

   shopItemMonthlyRevenue  
0                  999.00  
1                    0.00  
3                 1709.05  
4                 1099.00  
5                  349.00  
(1739022, 5)
(1739022,)


0.02090768446132807

False