In [15]:
import pandas as pd
import joblib as jb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

### Generate labels

In [2]:
month_to_predict = 32

In [3]:
items = pd.read_csv("data/items.csv")
items_categories = pd.read_csv("data/item_categories.csv")
sales_train = pd.read_csv("data/sales_train.csv")
shops = pd.read_csv("data/shops.csv")

In [4]:
df = pd.merge(items, items_categories, on="item_category_id", how="inner")
df = pd.merge(df, sales_train, on="item_id", how="inner")
df.head()

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,01.09.2014,20,54,58.0,1.0
1,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,24.08.2014,19,54,58.0,1.0
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,12.11.2014,22,54,58.0,1.0
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,05.07.2014,18,54,100.0,1.0
4,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,26.08.2014,19,54,58.0,1.0


In [5]:
def label_df(df, month_to_label):
    to_label = df[df["date_block_num"] == month_to_label]
    labeled = to_label.groupby(["shop_id", "item_id"]).agg({"item_cnt_day":"sum"}).reset_index()
    return labeled

In [6]:
labeled = label_df(df, month_to_predict)

In [7]:
labeled.head()

Unnamed: 0,shop_id,item_id,item_cnt_day
0,2,33,1.0
1,2,486,1.0
2,2,792,1.0
3,2,975,1.0
4,2,1090,1.0


In [8]:
features = pd.read_csv("generated/features_train.csv")
features.head()

Unnamed: 0,item_id,shop_id,item_category_id,total_categories_of_shop,sold_of_category_last_month,total_shops_of_category,items_sold_last_month,purchases_item_in_shop_last_month,item_share,average_sales_for_item_and_shop_by_month,month_last_purchase,item_price_max,item_price_mean,item_price_min,diff_price_max_min
0,0,54,40,59,0.0,57,0.0,0.0,0.0,1.0,20,58.0,58.0,58.0,0.0
1,1,55,76,12,169.0,1,1976.0,0.0,0.0,1.2,21,4490.0,4490.0,4490.0,0.0
2,2,54,40,59,0.0,57,0.0,0.0,0.0,1.0,22,58.0,58.0,58.0,0.0
3,3,54,40,59,0.0,57,0.0,0.0,0.0,1.0,19,100.0,79.0,58.0,42.0
4,4,54,40,59,0.0,57,0.0,0.0,0.0,1.0,20,58.0,58.0,58.0,0.0


In [9]:
print(len(features))
print(len(labeled))
features = pd.merge(features, labeled, on=["item_id", "shop_id"], how="right")
print(len(features))
features.sample(10)

411840
29678
29678


Unnamed: 0,item_id,shop_id,item_category_id,total_categories_of_shop,sold_of_category_last_month,total_shops_of_category,items_sold_last_month,purchases_item_in_shop_last_month,item_share,average_sales_for_item_and_shop_by_month,month_last_purchase,item_price_max,item_price_mean,item_price_min,diff_price_max_min,item_cnt_day
11057,8455,58,7.0,58.0,6.0,50.0,1738.0,0.0,0.0,1.6,26.0,4290.0,3494.177548,2448.7,1841.3,2.0
10504,7863,41,28.0,59.0,36.0,57.0,892.0,0.0,0.0,7.0,26.0,1299.0,938.515604,529.07,769.93,1.0
22452,19794,35,40.0,62.0,147.0,57.0,1653.0,0.0,0.0,3.2,29.0,299.0,251.545851,85.16,213.84,1.0
14967,12563,22,55.0,61.0,74.0,55.0,1044.0,0.0,0.0,1.428571,30.0,199.0,147.047093,75.0,124.0,1.0
19935,16549,44,55.0,62.0,87.0,55.0,769.0,0.0,0.0,1.142857,25.0,229.0,197.795845,110.0,119.0,1.0
11771,9206,35,61.0,62.0,20.0,54.0,1653.0,0.0,0.0,1.0,29.0,1799.0,1796.028323,1619.0,180.0,1.0
27696,3558,41,,,,,,,,,,,,,,1.0
14892,12541,21,55.0,61.0,140.0,55.0,1622.0,1.0,0.000617,1.0,31.0,199.0,175.256531,75.0,124.0,1.0
11636,8899,31,37.0,62.0,320.0,57.0,5714.0,1.0,0.000175,3.333333,31.0,799.0,760.915492,515.69,283.31,2.0
28103,3992,45,,,,,,,,,,,,,,1.0


In [10]:
features.fillna(0, inplace=True)
features.sample(10)

Unnamed: 0,item_id,shop_id,item_category_id,total_categories_of_shop,sold_of_category_last_month,total_shops_of_category,items_sold_last_month,purchases_item_in_shop_last_month,item_share,average_sales_for_item_and_shop_by_month,month_last_purchase,item_price_max,item_price_mean,item_price_min,diff_price_max_min,item_cnt_day
7295,5363,50,23.0,59.0,82.0,57.0,1081.0,0.0,0.0,1.333333,12.0,2199.0,1060.376948,298.0,1901.0,1.0
22070,19291,25,67.0,65.0,61.0,58.0,4675.0,1.0,0.000214,1.25,31.0,999.0,992.587308,671.58,327.42,1.0
4226,3442,39,23.0,56.0,70.0,57.0,910.0,1.0,0.001099,1.0,31.0,1349.25,969.1916,899.0,450.25,3.0
11918,9356,7,70.0,60.0,65.0,58.0,1409.0,5.0,0.003549,2.666667,31.0,399.0,394.830465,198.79,200.21,2.0
19356,16138,48,65.0,52.0,51.0,58.0,1308.0,0.0,0.0,1.0,29.0,679.0,633.545922,399.0,280.0,1.0
26307,314,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5871,4357,35,30.0,62.0,198.0,57.0,1653.0,1.0,0.000605,3.4,31.0,399.0,394.851811,220.03,178.97,1.0
7166,5271,57,19.0,61.0,95.0,57.0,2780.0,0.0,0.0,1.25,30.0,1199.0,729.379254,399.5,799.5,1.0
13915,11354,37,20.0,62.0,115.0,52.0,1248.0,23.0,0.018429,23.0,31.0,3999.0,3902.092334,2982.77,1016.23,2.0
17021,14400,37,37.0,62.0,77.0,57.0,1248.0,1.0,0.000801,1.0,31.0,999.0,308.542222,148.0,851.0,1.0


In [11]:
target_col = "item_cnt_day"
ids_cols = ["item_id", "shop_id"]
predictors = [x for x in features.columns if x != target_col and x not in ids_cols]
predictors

['item_category_id',
 'total_categories_of_shop',
 'sold_of_category_last_month',
 'total_shops_of_category',
 'items_sold_last_month',
 'purchases_item_in_shop_last_month',
 'item_share',
 'average_sales_for_item_and_shop_by_month',
 'month_last_purchase',
 'item_price_max',
 'item_price_mean',
 'item_price_min',
 'diff_price_max_min']

In [12]:
X = features[predictors]
y = features[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05)

regressor = RandomForestRegressor()

regressor.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [13]:
preds = regressor.predict(X_test)

In [14]:
mean_squared_error(y_true=y_test, y_pred=preds)

108.84969408468235

In [16]:
jb.dump(regressor, "models/1-basic_random_forest_regressor.pkl")

['models/1-basic_random_forest_regressor.pkl']