# Imports

In [20]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor

In [140]:
train = pd.read_csv('Data/Grocery/sales_train.csv.gz', parse_dates=['date'])
shops = pd.read_csv('Data/Grocery/shops.csv')
items = pd.read_csv('Data/Grocery/items.csv')
test = pd.read_csv('Data/Grocery/test.csv.gz')

In [29]:
train.shape

(2935849, 6)

In [30]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0
2,2013-05-01,0,25,2552,899.0,-1.0
3,2013-06-01,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


Merge with items to extract the category id

In [223]:
df_raw = train.merge(items, on='item_id')
df_raw = df_raw.drop('item_name', axis=1)

In [224]:
df_raw.shape

(2935849, 7)

In [225]:
df_raw.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,2013-02-01,0,59,22154,999.0,1.0,37
1,2013-01-23,0,24,22154,999.0,1.0,37
2,2013-01-20,0,27,22154,999.0,1.0,37
3,2013-02-01,0,25,22154,999.0,1.0,37
4,2013-03-01,0,25,22154,999.0,1.0,37


# Preprocessing

In [197]:
pd.set_option('float_format', '{:f}'.format)

In [226]:
df_raw.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.569911,33.001728,10197.227057,890.853233,1.242641,40.001383
std,9.422988,16.226973,6324.297354,1729.799631,2.618834,17.100759
min,0.0,0.0,0.0,-1.0,-22.0,0.0
25%,7.0,22.0,4476.0,249.0,1.0,28.0
50%,14.0,31.0,9343.0,399.0,1.0,40.0
75%,23.0,47.0,15684.0,999.0,1.0,55.0
max,33.0,59.0,22169.0,307980.0,2169.0,83.0


In [227]:
date_att = ['day','dayofweek','dayofyear','days_in_month','is_month_end','is_month_start','is_quarter_end','is_quarter_start','is_year_end',
 'is_year_start','month','quarter','week','weekday','weekofyear','year']

for i in date_att:
    df_raw['sale_'+i] = getattr(df_raw['date'].dt, i)
df_raw.drop('date', axis = 1, inplace=True)

In [228]:
df_raw.shape

(2935849, 22)

In [229]:
def display_all(df):
    with pd.option_context('display.max_rows', 1000, 'display.max_columns', 1000):
        display(df)

In [231]:
display_all(df_raw.head())

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,sale_day,sale_dayofweek,sale_dayofyear,sale_days_in_month,sale_is_month_end,sale_is_month_start,sale_is_quarter_end,sale_is_quarter_start,sale_is_year_end,sale_is_year_start,sale_month,sale_quarter,sale_week,sale_weekday,sale_weekofyear,sale_year
0,0,59,22154,999.0,1.0,37,1,4,32,28,False,True,False,False,False,False,2,1,5,4,5,2013
1,0,24,22154,999.0,1.0,37,23,2,23,31,False,False,False,False,False,False,1,1,4,2,4,2013
2,0,27,22154,999.0,1.0,37,20,6,20,31,False,False,False,False,False,False,1,1,3,6,3,2013
3,0,25,22154,999.0,1.0,37,1,4,32,28,False,True,False,False,False,False,2,1,5,4,5,2013
4,0,25,22154,999.0,1.0,37,1,4,60,31,False,True,False,False,False,False,3,1,9,4,9,2013


In [232]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2935849 entries, 0 to 2935848
Data columns (total 22 columns):
date_block_num           int64
shop_id                  int64
item_id                  int64
item_price               float64
item_cnt_day             float64
item_category_id         int64
sale_day                 int64
sale_dayofweek           int64
sale_dayofyear           int64
sale_days_in_month       int64
sale_is_month_end        bool
sale_is_month_start      bool
sale_is_quarter_end      bool
sale_is_quarter_start    bool
sale_is_year_end         bool
sale_is_year_start       bool
sale_month               int64
sale_quarter             int64
sale_week                int64
sale_weekday             int64
sale_weekofyear          int64
sale_year                int64
dtypes: bool(6), float64(2), int64(14)
memory usage: 397.6 MB


Split the target variable from the rest of the dataset

In [208]:
def change_df(df, y):
    return df.drop(y, axis = 1), df[y]

In [233]:
df, y = change_df(df_raw, 'item_cnt_day')

In [234]:
def split_data(df, n):
    return df[:n].copy(), df[n:].copy()

In [235]:
# Take a test set of the same size of the test set on kaggle
test_size = 214200
n_size = len(df_raw) - test_size
df_trn, df_valid  = split_data(df_raw, n_size)
X_train, X_valid = split_data(df, n_size)
y_train, y_valid = split_data(y, n_size)

In [236]:
X_train.shape, y_train.shape, X_valid.shape

((2721649, 21), (2721649,), (214200, 21))

In [238]:
m = RandomForestRegressor(n_estimators=5,n_jobs=-1)
m.fit(X_train, y_train)
pred = m.predict(y_valid)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 1. ... 1. 1. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
from sklearn.metrics import mean_squared_error as rmse