In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

pd.set_option('display.max_columns', 500)

In [None]:
sales = pd.read_csv("data/source/sales_train.csv")
sales["date"] = sales["date"].apply(lambda x: datetime.strptime(x, "%d.%m.%Y"))
display(sales.head(5))
sales.shape

In [None]:
items = pd.read_csv("data/source/items.csv")
cats = pd.read_csv("data/source/item_categories.csv")

items = items.join(cats.set_index("item_category_id"), how="left", on="item_category_id")

items

In [None]:
shops = pd.read_csv("data/source/shops.csv")

shops.head(5)

In [None]:
test = pd.read_csv("data/source/test.csv")
test

In [None]:
# we merge all shop_id, item_id combinations from train and test set
shop_item = sales[["shop_id", "item_id"]].drop_duplicates().join(
    test.set_index(["shop_id", "item_id"]), on=["shop_id", "item_id"], how="outer"
)[["shop_id", "item_id"]].reset_index(drop=True)

# add item and shop metadata
shop_item = shop_item.join(items.set_index("item_id"), how="left", on="item_id")
shop_item = shop_item.join(shops.set_index("shop_id"), on="shop_id", how="left")

shop_item

In [None]:
shop_item = shop_item.merge(pd.DataFrame({"date_block_num": range(34)}), how="cross")

#reduce memory usage
print(shop_item.info())

for col in ["shop_id", "item_id", "item_category_id","date_block_num"]:
    shop_item[col] = shop_item[col].astype("Int16")
    
shop_item.info()

In [None]:
# aggregate sales by month
sales_monthly = sales.groupby(["date_block_num", "shop_id", "item_id"]).agg({
    "item_cnt_day": ["sum", "min", "max"],
    "date": ["min", "max", "count"],
    "item_price": ["mean", "max", "min"]
}).reset_index()

# flatten column names an fill na
sales_monthly.columns = [c[0] if c[1] == "" else c[1] + "_" + c[0] for c in sales_monthly.columns]
sales_monthly = sales_monthly.fillna(0) # should only affect std of breakouts of count 1

# replace min and max dates by their day of month
sales_monthly["min_date"] = sales_monthly.min_date.apply(lambda x: x.day)
sales_monthly["max_date"] = sales_monthly.max_date.apply(lambda x: x.day)

sales_monthly = sales_monthly.rename({
    "sum_item_cnt_day": "item_cnt_month",
}, axis=1)

sales_monthly

In [None]:
#reduce memory usage
print(sales_monthly.info())

for col in ["item_cnt_month", "min_item_cnt_day", "max_item_cnt_day",
           "min_date", "max_date", "count_date"]:
    sales_monthly[col] = sales_monthly[col].astype("Int16")
    
sales_monthly.info()

In [None]:
# extend to all item_id, shop_id combinations and add metadata
join_keys = ["shop_id", "item_id", "date_block_num"]
sales_monthly = shop_item.join(sales_monthly.set_index(join_keys),
                               on=join_keys, how="left")

sales_monthly

In [None]:
sales_monthly.info()

In [None]:
sales_monthly.to_pickle("data/processed/sales_monthly_full.pickle")

In [None]:
sales_monthly =sales_monthly.sort_values(by=["item_id", "shop_id", "date_block_num"])

In [None]:
sales_monthly_wide = sales_monthly.set_index([
    "shop_id",
    "item_id",
    "item_name",
    "item_category_id",
    "item_category_name",
    "shop_name",
    "date_block_num",
]).unstack()

sales_monthly_wide = sales_monthly_wide.reset_index().fillna(0)

sales_monthly_wide.info()

In [None]:
sales_monthly_wide.columns = [
    c[0] if c[1] == '' else (
        c[0] + "_" + str(c[1])
    ).replace("day", "month") for c in sales_monthly_wide.columns
]

In [None]:
sales_monthly_wide

In [None]:
sales_monthly_wide.to_pickle("data/processed/sales_monthly_wide_full.pickle")

# Models

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

pd.set_option('display.max_columns', 500)

sales_monthly_wide = pd.read_pickle("data/processed/sales_monthly_wide_full.pickle")

In [2]:
def get_train_set_for_month(month, n, X_only=False):
    cols = list(sales_monthly_wide.columns)
    
    train_cols = ["shop_id", "item_id", "item_category_id"]
    cols = cols[6:]
    train_cols += [c for c in cols if (
        int(c.split("_")[-1]) < month and
        int(c.split("_")[-1]) > month - n
    )]
    X = sales_monthly_wide[train_cols]
    if not X_only:
        y = sales_monthly_wide[[f"item_cnt_month_{month}"]]
        return X.values, y.values.reshape(-1), train_cols

    return X.values, train_cols

In [4]:
import numpy as np

def clip(array):
    return np.array([min(x, 20) for x in array])

clip(np.array([1, 2, 34, 45, 45, -111]))

array([   1,    2,   20,   20,   20, -111])

In [5]:
%%time

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

rf = RandomForestRegressor(
    n_estimators = 20,
    max_depth=10,
    min_samples_leaf=100,
    n_jobs=-1,  # -1 is all processors used
)

for i, month in enumerate([33, 32, 31, 30, 29]):
    if i == 0:
        X, y, cols = get_train_set_for_month(month, 13)
        continue
    
    X_inc, y_inc, _ = get_train_set_for_month(month, 13)
    X = np.concatenate((X, X_inc), axis=0)
    y = np.concatenate((y, y_inc), axis=0)
    
    del X_inc, y_inc
    
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

rf.fit(X_train, clip(y_train))

print(mean_squared_error(y_true=clip(y_train), y_pred=clip(rf.predict(X_train)), squared=False))
print(mean_squared_error(y_true=clip(y_test), y_pred=clip(rf.predict(X_test)), squared=False))

0.5546793051465301
0.5730118878875057
CPU times: user 34min 34s, sys: 41.7 s, total: 35min 16s
Wall time: 19min 37s


In [6]:
help(RandomForestRegressor)

Help on class RandomForestRegressor in module sklearn.ensemble._forest:

class RandomForestRegressor(ForestRegressor)
 |  RandomForestRegressor(n_estimators=100, *, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest regressor.
 |  
 |  A random forest is a meta estimator that fits a number of classifying
 |  decision trees on various sub-samples of the dataset and uses averaging
 |  to improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------

In [7]:
"""
rf = RandomForestRegressor(
    n_estimators = 20,
    max_depth=9,
    min_samples_leaf=100
)

for i, month in enumerate([33, 32, 31, 30]):
    if i == 0:
        X, y, cols = get_train_set_for_month(month, 13)
        continue
    
    X_inc, y_inc, _ = get_train_set_for_month(month, 13)
    X = np.concatenate((X, X_inc), axis=0)
    y = np.concatenate((y, y_inc), axis=0)
    
    del X_inc, y_inc
    
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

rf.fit(X_train, clip(y_train))

0.5685285035910349
0.5749121905190944
"""

'\nrf = RandomForestRegressor(\n    n_estimators = 20,\n    max_depth=9,\n    min_samples_leaf=100\n)\n\nfor i, month in enumerate([33, 32, 31, 30]):\n    if i == 0:\n        X, y, cols = get_train_set_for_month(month, 13)\n        continue\n    \n    X_inc, y_inc, _ = get_train_set_for_month(month, 13)\n    X = np.concatenate((X, X_inc), axis=0)\n    y = np.concatenate((y, y_inc), axis=0)\n    \n    del X_inc, y_inc\n    \nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=0.33, random_state=42\n)\n\nrf.fit(X_train, clip(y_train))\n\n0.5685285035910349\n0.5749121905190944\n'

In [9]:
X_train.nbytes / 1000 ** 2

1567.481616

In [10]:
feature_imp = list(zip(cols, rf.feature_importances_))

feature_imp.sort(key= lambda x: x[1], reverse=True)

feature_imp

[('item_cnt_month_32', 0.6970814975510731),
 ('count_date_32', 0.04899934674008707),
 ('max_item_price_31', 0.028890211170356533),
 ('item_cnt_month_31', 0.026693971834871662),
 ('item_cnt_month_25', 0.024199761948950717),
 ('max_item_price_32', 0.016116520292818237),
 ('max_date_32', 0.01381310199974906),
 ('item_cnt_month_30', 0.013735322436655133),
 ('item_cnt_month_27', 0.01122911422884782),
 ('min_date_32', 0.009514050459640022),
 ('max_item_cnt_month_32', 0.009346789507486781),
 ('mean_item_price_32', 0.007883511921417521),
 ('count_date_23', 0.007475998639983135),
 ('item_category_id', 0.0071488923190803615),
 ('count_date_25', 0.006799354731528827),
 ('item_cnt_month_24', 0.006720643569510539),
 ('item_id', 0.006294735450272981),
 ('count_date_31', 0.005084288997249555),
 ('count_date_30', 0.004352664857021647),
 ('item_cnt_month_26', 0.003969669457451572),
 ('item_cnt_month_28', 0.003567898143935424),
 ('count_date_29', 0.003196342163968162),
 ('mean_item_price_30', 0.00260124

In [11]:
# predict for Nov!

X, cols = get_train_set_for_month(34, 13, X_only=True)

In [12]:
X

array([[0, 30, 40, ..., 0.0, 0.0, 0.0],
       [0, 31, 37, ..., 0.0, 0.0, 0.0],
       [0, 32, 40, ..., 0.0, 0.0, 0.0],
       ...,
       [59, 22164, 37, ..., 0.0, 0.0, 0.0],
       [59, 22166, 54, ..., 0.0, 0.0, 0.0],
       [59, 22167, 49, ..., 0.0, 0.0, 0.0]], dtype=object)

In [13]:
y_pred = rf.predict(X)

In [14]:
y_pred

array([0.02934008, 0.02934008, 0.02934008, ..., 0.1719791 , 0.02993272,
       0.02993272])

In [15]:
X = pd.DataFrame(X, columns=cols)[["shop_id", "item_id"]]
X["item_cnt_month"] = clip(y_pred)
X

Unnamed: 0,shop_id,item_id,item_cnt_month
0,0,30,0.029340
1,0,31,0.029340
2,0,32,0.029340
3,0,33,0.029340
4,0,35,0.029340
...,...,...,...
526915,59,22162,0.261748
526916,59,22163,0.029933
526917,59,22164,0.171979
526918,59,22166,0.029933


In [16]:
X.describe()

Unnamed: 0,item_cnt_month
count,526920.0
mean,0.110043
std,0.430638
min,0.02934
25%,0.029933
50%,0.029933
75%,0.033297
max,17.643013


In [17]:
submission = pd.read_csv("data/source/test.csv")

submission = submission.join(
    X.set_index(["shop_id", "item_id"]), on=["shop_id", "item_id"], how="left"
)[["ID", "item_cnt_month"]]

display(submission)

submission.to_csv("data/submission_full_2.csv", index=False)

Unnamed: 0,ID,item_cnt_month
0,0,0.641952
1,1,0.029340
2,2,1.081769
3,3,0.112062
4,4,0.292670
...,...,...
214195,214195,0.293442
214196,214196,0.031264
214197,214197,0.029933
214198,214198,0.029933


In [18]:
"""TODO's:

- Feature enginnering on names of shops, items, item_cats
- features for price delta
- features on overall shop and item performance
- add model pickle section
"""

"TODO's:\n\n- Feature enginnering on names of shops, items, item_cats\n- features for price delta\n- features on overall shop and item performance\n- add model pickle section\n"