In [2]:
import numpy as np
import pandas as pd

from itertools import product
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def downcast_dtypes(df):
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]

    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)

    return df

In [4]:
import os

data_folder = "/home/wenceslai/Documents/predict_sales_kaggle"

train = pd.read_csv(os.path.join(data_folder, 'sales_train.csv'))
item_cats = pd.read_csv(os.path.join(data_folder, 'items.csv'))
df_sub = pd.read_csv(os.path.join(data_folder, 'sample_submission.csv'))
test = pd.read_csv(os.path.join(data_folder, 'test.csv'))


In [5]:
#fixing some abnormalities in the data

train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]

# Якутск Орджоникидзе, 56
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median


In [6]:
"""
creating every possible pair of shop_id and item_id even if there are no sales of it in current month
we are doing this to mimic the distribution in the test data
"""

matrix = []
cols = ['date_block_num','shop_id','item_id']

for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))

matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix.sort_values(cols,inplace=True)


In [7]:
#test vals
test['date_block_num'] = 34
test = test.drop('ID', axis=1)
test = test[['date_block_num', 'shop_id', 'item_id']]


In [8]:
#aggregating item_cnt_day for whole month as in test set
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20)
                                .astype(np.float16))

all_data = pd.concat([matrix, test])

In [None]:
M_sales = all_data.groupby('date_block_num')['month_sum'].sum()
plt.plot(range(34), M_sales)

# Lag features

In [9]:
def lag_features(df, lags, col):
    subset = df[['date_block_num', 'shop_id', 'item_id', col]]
    for lag in lags:
        shifted = subset.copy()
        shifted.cols = [['date_block_num', 'shop_id', 'item_id', col+"_lag_"+str(lag)]]
        shifted.rename(columns={col : col+"_lag_"+str(lag)}, inplace=True)
        shifted['date_block_num'] += lag
        df = df.merge(shifted, on=['date_block_num', 'shop_id', 'item_id'], how='left').fillna(0)
    del shifted

    return df


In [10]:

all_data = downcast_dtypes(all_data)

all_data = lag_features(all_data, [1, 2, 3, 5, 12], 'item_cnt_month')

all_data.head()


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_5,item_cnt_month_lag_12
0,0,2,19,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,27,1.0,0.0,0.0,0.0,0.0,0.0
2,0,2,28,0.0,0.0,0.0,0.0,0.0,0.0
3,0,2,29,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2,32,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#adding categories
all_data = all_data.merge(item_cats.drop('item_name', axis=1), on='item_id', how='left')


# Mean encoding

to avoid overfitting we encode features based on one month

In [13]:
temp = all_data.groupby(['date_block_num']).agg({'item_cnt_month' : ['mean']})
temp.columns = ['date_block_num_meanenc']
temp.reset_index(inplace=True)

all_data = all_data.merge(temp, on=['date_block_num'], how='left')
all_data['date_block_num_meanenc'] = all_data['date_block_num_meanenc'].astype('float16')

all_data = lag_features(all_data, [1], 'date_block_num_meanenc')
all_data = all_data.drop('date_block_num_meanenc', axis=1)


In [14]:
temp = all_data.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month' : ['mean']})
temp.columns = ['item_id_meanenc']
temp.reset_index(inplace=True)

all_data = all_data.merge(temp, on=['date_block_num', 'item_id'], how='left')
all_data['item_id_meanenc'] = all_data['item_id_meanenc'].astype('float16')

all_data = lag_features(all_data, [1], 'item_id_meanenc')
all_data = all_data.drop('item_id_meanenc', axis=1)


In [15]:
temp = all_data.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month' : ['mean']})
temp.columns = ['shop_id_meanenc']
temp.reset_index(inplace=True)

all_data = all_data.merge(temp, on=['date_block_num', 'shop_id'], how='left')
all_data['shop_id_meanenc'] = all_data['shop_id_meanenc'].astype('float16')

all_data = lag_features(all_data, [1, 2, 3, 5, 12], 'shop_id_meanenc')
all_data = all_data.drop('shop_id_meanenc', axis=1)


In [16]:
temp = all_data.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month' : ['mean']})
temp.columns = ['item_category_id_meanenc']
temp.reset_index(inplace=True)

all_data = all_data.merge(temp, on=['date_block_num', 'item_category_id'], how='left')
all_data['item_category_id_meanenc'] = all_data['item_category_id_meanenc'].astype('float16')

all_data = lag_features(all_data, [1], 'item_category_id_meanenc')
all_data = all_data.drop('item_category_id_meanenc', axis=1)


In [26]:
all_data.isnull().sum()

date_block_num                    0
shop_id                           0
item_id                           0
item_cnt_month                    0
item_cnt_month_lag_1              0
item_cnt_month_lag_2              0
item_cnt_month_lag_3              0
item_cnt_month_lag_5              0
item_cnt_month_lag_12             0
item_category_id                  0
date_block_num_meanenc_lag_1      0
item_id_meanenc_lag_1             0
shop_id_meanenc_lag_1             0
shop_id_meanenc_lag_2             0
shop_id_meanenc_lag_3             0
shop_id_meanenc_lag_5             0
shop_id_meanenc_lag_12            0
item_category_id_meanenc_lag_1    0
avg_price_total                   0
dtype: int64

# Price and special features

In [19]:
#adding prices
prices = train.groupby('item_id').agg({'item_price' : ['mean']})
prices.columns = ['avg_price_total']
prices = prices.reset_index()

all_data = all_data.merge(prices, on='item_id', how='left')
all_data = all_data.fillna(0)

In [None]:
"""
this left 20% nan values so i left it out of final dataframe

prices = train.groupby(['date_block_num', 'item_id']).agg({'item_price' : ['mean']})
prices.columns = ['avg_price_month']
prices = prices.reset_index()
prices.head()

all_data = all_data.merge(prices, on=['date_block_num', 'item_id'], how='left')
"""

In [27]:
#adding seasonality in forms of month
all_data['month'] = all_data['date_block_num'] % 12

# Training and Ensembling

In [None]:
val_tresh = 34 #34 if we do not want to validate

all_data = all_data[all_data['date_block_num'] > 11] #because maximum lag used 12 we can delete

X_train = all_data[all_data['date_block_num'] < val_tresh].drop('item_cnt_month', axis=1).values
y_train = all_data.loc[all_data['date_block_num'] < val_tresh]['item_cnt_month'].values


X_val = all_data[all_data['date_block_num'] == val_tresh].drop('item_cnt_month', axis=1).values
y_val = all_data[all_data['date_block_num'] == val_tresh]['item_cnt_month'].values

X_test = all_data[all_data['date_block_num'] == 34].drop('item_cnt_month', axis=1).values

y_train = y_train.clip(0, 20)
y_val = y_val.clip(0, 20)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape)


In [36]:
#XGBoost 
from xgboost import XGBRegressor 

model_xgb = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42)

model_xgb.fit(
    X_train,
    y_train,
    n_estimators=42, 
    #eval_metric="rmse",
    #eval_set=[(X_train, y_train), (X_val, y_val)]
    #early_stopping_rounds = 9
    verbose=True
    )


[0]	validation_0-rmse:1.12786	validation_1-rmse:1.12269
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 9 rounds.
[1]	validation_0-rmse:1.09062	validation_1-rmse:1.09086
[2]	validation_0-rmse:1.06099	validation_1-rmse:1.06533
[3]	validation_0-rmse:1.02533	validation_1-rmse:1.04373
[4]	validation_0-rmse:0.999671	validation_1-rmse:1.02597
[5]	validation_0-rmse:0.978927	validation_1-rmse:1.01196
[6]	validation_0-rmse:0.963874	validation_1-rmse:0.999842
[7]	validation_0-rmse:0.946149	validation_1-rmse:0.991435
[8]	validation_0-rmse:0.932768	validation_1-rmse:0.982304
[9]	validation_0-rmse:0.922496	validation_1-rmse:0.975509
[10]	validation_0-rmse:0.913073	validation_1-rmse:0.969145
[11]	validation_0-rmse:0.903857	validation_1-rmse:0.964049
[12]	validation_0-rmse:0.897408	validation_1-rmse:0.96053
[13]	validation_0-rmse:0.890994	validation_1-rmse:0.95532
[14]	validation_0-rmse:0.885384	valida

XGBRegressor(colsample_bytree=0.8, eta=0.3, max_depth=8, min_child_weight=300,
             n_estimators=1000, seed=42, subsample=0.8)

In [30]:
#Neural Network
from keras import models
from keras import layers
from tensorflow.keras import backend as K

model_nn = models.Sequential()
model_nn.add(layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model_nn.add(layers.Dense(32, activation='relu',))
model_nn.add(layers.Dense(1))

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))

model_nn.compile(
    optimizer='rmsprop',
    loss=root_mean_squared_error,
    metrics=[root_mean_squared_error]
)


Using TensorFlow backend.






In [33]:
model_nn.fit(
    X_train, y_train,
    #validation_data=(X_val, y_val),
    epochs=1,
    batch_size=64,
    verbose=True
)


Train on 6186922 samples, validate on 238172 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff46c2dfb10>

In [None]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(n_jobs=-1, verbose=1, n_estimators=50, max_depth=25, random_state=18)
model_rf.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error

y_preds = model_rf.predict(X_val).clip(0, 20)

rmse = np.sqrt(mean_squared_error(y_val, y_preds))
print("validation RMSE:", rmse)


In [48]:
preds_nn.ravel().shape

(214200,)

In [49]:
preds_xgb = model_xgb.predict(X_test).clip(0, 20) #ensebling
preds_nn = model_nn.predict(X_test).clip(0, 20)

preds = (model_xgb.predict(X_test).clip(0, 20) + model_nn.predict(X_test).clip(0, 20).squeeze()) / 2

df_sub['item_cnt_month'] = preds #round?
df_sub.to_csv(os.path.join(data_folder, 'sub_out_nn'), index=False)
print("done")

done
