In [1]:
# Automatically reload imported modules before each cell runs (mode 2 = all modules),
# so edits to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures directly below the cell that produces them.
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics

In [3]:
# Directory holding the competition CSV files; used as a string prefix in the read_csv calls,
# so the trailing slash is required.
PATH = "data/grocery/"

In [4]:
# List the files available in the data directory (shell command; {PATH} is interpolated by IPython).
!ls {PATH}

holidays_events.csv   oil.csv               test.csv
items-1.csv           sample_submission.csv train.csv
items.csv             stores.csv            transactions.csv


The dependent variable we are trying to predict is how many units of each product will be sold at each store on each day of a two-week period.

The dataset is a relational dataset. 

In [5]:
# Explicit per-column dtypes handed to pd.read_csv. The numeric columns use narrow types
# (presumably to limit memory on the ~125M-row training file); `onpromotion` is kept as
# raw objects so its missing values can be handled explicitly after loading.
types = dict(
    id='int64',
    item_nbr='int32',
    store_nbr='int8',
    unit_sales='float32',
    onpromotion='object',
)

In [7]:
%%time
# Load the full training set, parsing `date` as datetimes and applying the dtypes in `types`.
# This is the slow step of the notebook (about two minutes in the recorded run).
df_all = pd.read_csv(f'{PATH}train.csv', parse_dates=['date'], dtype=types, infer_datetime_format=True)

CPU times: user 1min 51s, sys: 20 s, total: 2min 11s
Wall time: 2min 18s


In [8]:
df_all.onpromotion.fillna(False, inplace=True)
df_all.onpromotion = df_all.onpromotion.map({'False': False, 'True': True})
df_all.onpromotion = df_all.onpromotion.astype(bool)

%time df_all.to_feather('tmp/raw_groceries')

CPU times: user 1.65 s, sys: 5.52 s, total: 7.17 s
Wall time: 8.11 s


In [9]:
# Summary statistics for every column (numeric, datetime and boolean alike).
%time df_all.describe(include='all')

CPU times: user 33.1 s, sys: 13.5 s, total: 46.6 s
Wall time: 50.5 s


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
count,125497000.0,125497040,125497000.0,125497000.0,125497000.0,125497040
unique,,1684,,,,2
top,,2017-07-01 00:00:00,,,,False
freq,,118194,,,,96028767
first,,2013-01-01 00:00:00,,,,
last,,2017-08-15 00:00:00,,,,
mean,62748520.0,,27.46458,972769.2,8.554856,
std,36227880.0,,16.33051,520533.6,23.60515,
min,0.0,,1.0,96995.0,-15372.0,
25%,31374260.0,,12.0,522383.0,2.0,


In [6]:
# Load the test set with the same dtypes as the training set (it has no `unit_sales` column).
df_test = pd.read_csv(f'{PATH}test.csv', parse_dates=['date'], dtype=types, infer_datetime_format=True)

# Convert the object-typed `onpromotion` column to real booleans: only explicit "true"
# values count as on-promotion, everything else (including NaN) becomes False.
# The previous fillna(False) -> map(string keys) -> astype(bool) sequence would have turned
# any missing value into True, because the filled boolean is not a key of the string mapping
# (map yields NaN) and NaN.astype(bool) is True.
df_test.onpromotion = df_test.onpromotion.isin(['True', True])
df_test.describe(include='all')

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
count,3370464.0,3370464,3370464.0,3370464.0,3370464
unique,,16,,,2
top,,2017-08-27 00:00:00,,,False
freq,,210654,,,3171867
first,,2017-08-16 00:00:00,,,
last,,2017-08-31 00:00:00,,,
mean,127182300.0,,27.5,1244798.0,
std,972969.3,,15.58579,589836.2,
min,125497000.0,,1.0,96995.0,
25%,126339700.0,,14.0,805321.0,


Remember: when the training and test sets both contain dates, their date ranges must not overlap; the test set must cover a later period than the training set. Here the training data ends on 2017-08-15 and the test data starts on 2017-08-16.

In [6]:
# Reload the processed frames from the feather cache instead of re-parsing the CSVs.
# NOTE(review): these files are written by the `to_feather` cell further down in this notebook,
# so on a fresh checkout that cell must have been run once before this one can succeed.
import feather
df_all = feather.read_dataframe('tmp/df_all_groceries')
df_test = feather.read_dataframe('tmp/df_test_groceries')

In [7]:
# Peek at the last rows (the most recent dates) of the training frame.
df_all.tail()

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
125497035,125497035,54,2089339,1.609438,False,2017,8,33,15,1,227,False,False,False,False,False,False,1502755200
125497036,125497036,54,2106464,0.693147,True,2017,8,33,15,1,227,False,False,False,False,False,False,1502755200
125497037,125497037,54,2110456,5.26269,False,2017,8,33,15,1,227,False,False,False,False,False,False,1502755200
125497038,125497038,54,2113914,5.293305,True,2017,8,33,15,1,227,False,False,False,False,False,False,1502755200
125497039,125497039,54,2116416,1.098612,False,2017,8,33,15,1,227,False,False,False,False,False,False,1502755200


In [9]:
# Clip negative unit_sales to 0, then replace the target with log(1 + unit_sales).
# NOTE(review): this cell is not idempotent - running it again, or running it on the frame
# reloaded from 'tmp/df_all_groceries' (whose unit_sales already look log-transformed in the
# tail() output), applies the log a second time.
df_all.unit_sales = np.log1p(np.clip(df_all.unit_sales,0,None))

In [10]:
# Expand the `date` column into date-part features. The return value is not used, so df_all
# appears to be modified in place; the frames displayed elsewhere in this notebook show
# Year, Month, Week, ..., Elapsed columns and no `date` column.
%time add_datepart(df_all, 'date')

CPU times: user 1min 10s, sys: 42.7 s, total: 1min 52s
Wall time: 2min 2s


In [36]:
# Work with only the last 10,000,000 rows of the training frame.
df_all_last = df_all.tail(10000000)

In [37]:
# Show where the subset starts.
df_all_last.head()

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
115497040,115497040,22,1091368,1.791759,False,2017,5,19,13,5,133,False,False,False,False,False,False,1494633600
115497041,115497041,22,1091369,1.098612,False,2017,5,19,13,5,133,False,False,False,False,False,False,1494633600
115497042,115497042,22,1094238,1.386294,False,2017,5,19,13,5,133,False,False,False,False,False,False,1494633600
115497043,115497043,22,1096235,0.693147,False,2017,5,19,13,5,133,False,False,False,False,False,False,1494633600
115497044,115497044,22,1098624,1.94591,False,2017,5,19,13,5,133,False,False,False,False,False,False,1494633600


In [11]:
# set_rf_samples comes from the fastai star imports; presumably it limits each tree to a
# random sample of 1,000,000 rows - TODO confirm against the fastai source.
set_rf_samples(1_000_000)

In [38]:
def split_vals(a, n):
    """Split `a` at position `n` and return independent copies of both parts.

    The first element of the returned tuple holds the first `n` items of `a`,
    the second holds the remainder. Both are produced with `.copy()`, so
    modifying them does not affect `a`.
    """
    head = a[:n].copy()
    rest = a[n:].copy()
    return head, rest

In [39]:
# Split by position: the first 9,000,000 rows of the subset become the training set and the
# remaining 1,000,000 rows become the validation set.
# Earlier alternative, kept for reference: size the validation set like the test set.
# n_valid = len(df_test)
# n_trn = len(df_all)-n_valid
train, valid = split_vals(df_all_last, 9000000)
train.shape, valid.shape
# len(df_all_last), len(df_test)

((9000000, 18), (1000000, 18))

In [40]:
%%time
trn, y, nas = proc_df(train, 'unit_sales')
val, y_val, y_val_nas = proc_df(valid, 'unit_sales')
y

CPU times: user 4 s, sys: 2.66 s, total: 6.66 s
Wall time: 8.02 s


In [46]:
# Sanity check: one target value per training row.
len(trn) == len(y)

True

In [47]:
def rmse(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    `x - y` must support `** 2` and expose a `.mean()` method (e.g. NumPy arrays
    or pandas Series).
    """
    residual = x - y
    return math.sqrt((residual ** 2).mean())
    
def print_score(m):
    """Print the fitted model `m`'s scores on the training and validation sets.

    Reads the notebook-level variables `x`, `y` (training) and `val`, `y_val`
    (validation). The printed list is
    [train RMSE, validation RMSE, train m.score, validation m.score],
    followed by `m.oob_score_` when the model has that attribute.
    """
    scores = [
        rmse(m.predict(x), y),
        rmse(m.predict(val), y_val),
        m.score(x, y),
        m.score(val, y_val),
    ]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

    

In [15]:
# Repeats the set_rf_samples call made in an earlier cell, with the same value.
set_rf_samples(1_000_000)

In [19]:
%%time
df_all.to_feather('tmp/df_all_groceries')
df_test.to_feather('tmp/df_test_groceries')
# trn.to_feather('tmp/trn_groceries')
# y.to_feather('tmp/y_groceries')
# val.to_feather('tmp/val_groceries')
# y_val.to_feather('tmp/y_val_groceries')

CPU times: user 2.1 s, sys: 16.5 s, total: 18.6 s
Wall time: 25.9 s


A random forest model converts the DataFrame we pass into a NumPy array internally. Because we will be tuning hyperparameters by fitting the model many times, we convert the DataFrame to a NumPy array once ourselves, which saves that conversion time on every run.

In [43]:
# Convert the training features to a float32 NumPy array once, up front, so the conversion
# is not repeated on every fit.
%time x = np.array(trn, dtype=np.float32)

CPU times: user 7.81 s, sys: 3.43 s, total: 11.2 s
Wall time: 11.5 s


In [50]:
# First forest: 20 trees, at least 100 samples per leaf, 2 parallel jobs.
# No random_state is set, so the scores will vary slightly from run to run.
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=100, n_jobs=2)
%time m.fit(x,y)

CPU times: user 1min 58s, sys: 787 ms, total: 1min 59s
Wall time: 1min


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=100, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [51]:
# Prints [train RMSE, validation RMSE, train score, validation score] for the model above.
%time print_score(m)

[0.7372104436855567, 0.7344431859579128, 0.2856577733471555, 0.25929176314003366]
CPU times: user 24.8 s, sys: 807 ms, total: 25.6 s
Wall time: 14.3 s


In [53]:
# Same forest with a smaller minimum leaf size (10 instead of 100).
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=10, n_jobs=2)
%time m.fit(x,y)

CPU times: user 2min 23s, sys: 667 ms, total: 2min 24s
Wall time: 1min 12s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [54]:
 # Scores for the min_samples_leaf=10 model, in the same order as before.
 print_score(m)

[0.6336153712996828, 0.6507296261657226, 0.472314969429821, 0.41852382955807754]


In [55]:
# Smaller leaves again (min_samples_leaf=3); n_jobs=-1 uses all available cores.
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=3, n_jobs=-1)
%time m.fit(x,y)
print_score(m)

CPU times: user 4min 4s, sys: 2.25 s, total: 4min 7s
Wall time: 1min 6s
[0.5769845483394681, 0.6272797410762504, 0.5624257763549991, 0.4596772103093478]
