In [56]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

from fastai.imports import *
from fastai.structured import *

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from IPython.display import display, FileLink

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
df_trn_raw = pd.read_csv("data/fsales-train.csv.gz")

In [58]:
df_trn = df_trn_raw.copy()

In [59]:
df_trn.dtypes

date               object
date_block_num      int64
shop_id             int64
item_id             int64
item_price        float64
item_cnt_day      float64
dtype: object

### Supplemental info (all in RUS)

In [5]:
df_items = pd.read_csv("data/fsales-items.csv")
df_icats = pd.read_csv("data/fsales-item-categories.csv")
df_shops = pd.read_csv("data/fsales-shops.csv")

In [6]:
df_items.head(3)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40


In [7]:
df_icats.head(3)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2


In [8]:
df_shops.head(3)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2


### Convert datetime

In [60]:
%time df_trn["date"] = pd.to_datetime(df_trn["date"], format="%d.%m.%Y")

CPU times: user 6.7 s, sys: 20 ms, total: 6.72 s
Wall time: 6.7 s


In [61]:
df_trn["date"].describe()

count                 2935849
unique                   1034
top       2013-12-28 00:00:00
freq                     9434
first     2013-01-01 00:00:00
last      2015-10-31 00:00:00
Name: date, dtype: object

In [62]:
%time add_datepart(df_trn, "date")

CPU times: user 1.49 s, sys: 244 ms, total: 1.74 s
Wall time: 1.74 s


In [63]:
df_trn.head(3).T

Unnamed: 0,0,1,2
date_block_num,0,0,0
shop_id,59,25,25
item_id,22154,2552,2552
item_price,999,899,899
item_cnt_day,1,1,-1
Year,2013,2013,2013
Month,1,1,1
Week,1,1,1
Day,2,3,5
Dayofweek,2,3,5


In [64]:
n_trn = len(df_trn) - 53514

In [65]:
def split_val(a, n): return a[:n], a[n:]

In [67]:
X, y, nas = proc_df(df_trn, "item_cnt_day")

In [68]:
X_trn, X_val = split_val(X, n_trn)
y_trn, y_val = split_val(y, n_trn)

In [83]:
len(X_trn)

2882335

In [84]:
set_rf_samples(50000)

In [85]:
m = RandomForestRegressor(n_jobs=-1, oob_score=True)
%time m.fit(X_trn, y_trn)

CPU times: user 12.9 s, sys: 1.12 s, total: 14 s
Wall time: 8.76 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [86]:
def rmse(x, y): return math.sqrt(np.mean((x - y) ** 2))
def rf_eval(m, x_trn, y_trn, x_val, y_val):
    print("Trn RMSE: {}\nVal RMSE: {}\nTrn Score: {}\nVal Score: {}\n".format(
        rmse(m.predict(x_trn), y_trn),
        rmse(m.predict(x_val), y_val),
        m.score(x_trn, y_trn),
        m.score(x_val, y_val)
    ))
    if hasattr(m, "oob_score"):
        print("OOB Score: {}".format(m.oob_score_))

In [87]:
rf_eval(m, X_trn, y_trn, X_val, y_val)

Trn RMSE: 2.103207045575498
Val RMSE: 19.296070949174318
Trn Score: 0.151440229776815
Val Score: -2.899956756727332

OOB Score: 0.11703551033714188


In [91]:
val_preds = m.predict(X_val[:100])
np.round(val_preds)

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 2., 1.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 4., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 1., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

Can't sell 0.1 of a thing.

### Predict most recent month

In [13]:
df_lm = df_trn[(df_trn["Year"] == 2015) & (df_trn["Month"] == 10)].copy()
df_lm.shape

(53514, 18)

In [14]:
df_lm = df_lm[["shop_id", "item_id", "item_cnt_day"]].copy()

In [19]:
df_lm_sum = pd.DataFrame({"item_cnt_month": df_lm.groupby(["shop_id", "item_id"]).size()}).reset_index()
df_lm_sum.head(3)

Unnamed: 0,shop_id,item_id,item_cnt_month
0,2,31,1
1,2,486,3
2,2,787,1


In [20]:
df_test_raw = pd.read_csv("data/fsales-test.csv.gz")

In [21]:
df_subm = pd.DataFrame(df_test_raw.copy())
df_subm["item_cnt_month"] = 0
df_subm.head(3)

Unnamed: 0,ID,shop_id,item_id,item_cnt_month
0,0,5,5037,0
1,1,5,5320,0
2,2,5,5233,0


In [24]:
df_subm.loc[(df_subm["shop_id"] == 2)].sort_values("item_id").head(3)

Unnamed: 0,ID,shop_id,item_id,item_cnt_month
22987,22987,2,30,0
20994,20994,2,31,0
20995,20995,2,32,0


In [40]:
%%time
for idx, row in df_lm_sum.iterrows():
    df_subm["item_cnt_month"][(df_subm["shop_id"] == row.shop_id) & (df_subm["item_id"] == row.item_id)] = row.item_cnt_month

CPU times: user 1min 54s, sys: 28 ms, total: 1min 54s
Wall time: 1min 54s


In [43]:
df_samp_subm = pd.read_csv("data/fsales-sample-subm.csv.gz")
df_samp_subm.head(3)

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5


In [45]:
del df_subm["shop_id"]
del df_subm["item_id"]

In [48]:
%ls data

conductors-rf-subm.csv          house-rf-subm.csv
[0m[01;31mconductors-sample-subm.csv.zip[0m  house-sample-subm.csv
[01;31mconductors-test.csv.zip[0m         house-test.csv
[01;31mconductors-train.csv.zip[0m        house-train.csv
fsales-item-categories.csv      sample_submission.csv
fsales-items.csv                test.csv
[01;31mfsales-sample-subm.csv.gz[0m       titanic-rf2-submission.csv
fsales-shops.csv                titanic-rf3-submission.csv
[01;31mfsales-test.csv.gz[0m              titanic-rf-submission.csv
[01;31mfsales-train.csv.gz[0m             titanic-sample-subm.csv
house-extrees-subm.csv          titanic-test.csv
house-rfhp-subm.csv             titanic-train.csv
house-rfscaled-subm.csv         train.csv


In [52]:
df_subm.to_csv("data/fsales-lm-subm.csv", index=False)
FileLink("data/fsales-lm-subm.csv")