In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBRegressor
from xgboost import plot_importance

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)

In [2]:
def downcast_dtypes(df):
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [19]:
import os
data_folder = "/home/wenceslai/Documents/predict_sales_kaggle"

train = pd.read_csv(os.path.join(data_folder, 'sales_train.csv'))
cats = pd.read_csv(os.path.join(data_folder, 'items.csv'))
df_sub = pd.read_csv(os.path.join(data_folder, 'sample_submission.csv'))
test = pd.read_csv(os.path.join(data_folder, 'test.csv'))

In [20]:
matrix = []
cols = ['date_block_num','shop_id','item_id']

for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))

matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
#matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
#matrix['item_id'] = matrix['item_id'].astype(np.int16)
#matrix.sort_values(cols,inplace=True)

In [21]:
#test vals
test['date_block_num'] = 34
test = test.drop('ID', axis=1)
test = test[['date_block_num', 'shop_id', 'item_id']]

In [22]:
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20)
                                .astype(np.float16))

In [23]:
def lag_features(df, lags, col):
    subset = df[['date_block_num', 'shop_id', 'item_id', col]]
    for lag in lags:
        shifted = subset.copy()
        shifted.cols = [['date_block_num', 'shop_id', 'item_id', col+"_lag_"+str(lag)]]
        shifted.rename(columns={col : col+"_lag_"+str(lag)}, inplace=True)
        shifted['date_block_num'] += lag
        df = df.merge(shifted, on=['date_block_num', 'shop_id', 'item_id'], how='left').fillna(0)
    del shifted
    
    return df

In [24]:
all_data = pd.concat([matrix, test])
all_data = downcast_dtypes(all_data)

lags = [1, 2, 3, 4, 5, 12]

all_data = lag_features(all_data, lags, 'item_cnt_month')

all_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_12
0,0,59,22154,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,59,2552,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,59,2554,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,59,2555,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,59,2564,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
all_data.isnull().sum()

date_block_num           0
shop_id                  0
item_id                  0
item_cnt_month           0
item_cnt_month_lag_1     0
item_cnt_month_lag_2     0
item_cnt_month_lag_3     0
item_cnt_month_lag_4     0
item_cnt_month_lag_5     0
item_cnt_month_lag_12    0
dtype: int64

In [22]:
X = matrix.drop('item_cnt_month', axis=1).values

y = matrix['item_cnt_month'].values
y = y.clip(0., 20.)

print(X.shape, y.shape)

#from sklearn.model_selection import train_test_split
#X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(n_jobs=-1, verbose=1, n_estimators=50, max_depth=20, random_state=18) #rs was 18
model_rf.fit(X_train, y_train)

(10913850, 3) (10913850,)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.3min finished


RandomForestRegressor(max_depth=15, n_estimators=50, n_jobs=-1, random_state=18,
                      verbose=1)

# Training

In [29]:
val_tresh = 34 #34 if we do not want to validate

X_train = all_data[all_data['date_block_num'] < val_tresh].drop('item_cnt_month', axis=1).values
y_train = all_data.loc[all_data['date_block_num'] < val_tresh]['item_cnt_month'].values


X_val = all_data[all_data['date_block_num'] == val_tresh].drop('item_cnt_month', axis=1).values
y_val = all_data[all_data['date_block_num'] == val_tresh]['item_cnt_month'].values

X_test = all_data[all_data['date_block_num'] == 34].drop('item_cnt_month', axis=1).values

y_train = y_train.clip(0, 20)
y_val = y_val.clip(0, 20)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape)

(10913850, 9) (10913850,) (214200, 9) (214200,) (214200, 9)


In [30]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(n_jobs=-1, verbose=1, n_estimators=50, max_depth=25, random_state=18) #rs was 18
model_rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.8min finished


RandomForestRegressor(max_depth=25, n_estimators=50, n_jobs=-1, random_state=18,
                      verbose=1)

In [37]:
from sklearn.metrics import mean_squared_error

y_preds = model_rf.predict(X_val).clip(0, 20)

rmse = np.sqrt(mean_squared_error(y_val, y_preds))
print("validation RMSE:", rmse)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.7s
validation RMSE: 1.09239565502689
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    2.3s finished


In [32]:
preds = model_rf.predict(X_test).clip(0, 20)

df_sub['item_cnt_month'] = preds #round?
df_sub.to_csv(os.path.join(data_folder, 'sub_out_lags'), index=False)
print("done")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.3s finished
done
