In [18]:
import pandas as pd
import numpy as np

from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load input data

In [2]:
# Read the two input tables from the data directory.
train = pd.read_csv("../data/train.csv")
store = pd.read_csv("../data/store.csv")

# Left join driven by `store`: every store row is kept and its attributes are
# repeated on each `train` row that shares the same Store key.
merged_train_store = store.merge(train, how='left', on='Store')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Drop rows and columns

In [3]:
# Keep only the rows that have a recorded, non-zero Sales value
# (this filters rows, not columns).
zero_sales_mask = merged_train_store['Sales'].isnull() | merged_train_store['Sales'].eq(0)
nonzero_sales = merged_train_store[~zero_sales_mask]

In [4]:
# Remove both the Customers and the Open columns.
nonzero_sales = nonzero_sales.drop(columns=['Customers', 'Open'])

In [5]:
# Parse the Date column into pandas datetimes so the `.dt` accessor is available later.
nonzero_sales['Date'] = pd.to_datetime(nonzero_sales['Date'])

In [6]:
# Discard every row that is missing a value in any of these columns.
required_cols = ['CompetitionDistance', 'StateHoliday', 'SchoolHoliday', 'Promo']
nonzero_sales = nonzero_sales.dropna(subset=required_cols)

# Missing value imputation

In [7]:
# Count the missing values that remain in each column (nothing is dropped here).
nonzero_sales.isna().sum()

Store                             0
StoreType                         0
Assortment                        0
CompetitionDistance               0
CompetitionOpenSinceMonth    143023
CompetitionOpenSinceYear     143023
Promo2                            0
Promo2SinceWeek              222774
Promo2SinceYear              222774
PromoInterval                222774
Date                              0
DayOfWeek                     13497
Sales                             0
Promo                             0
StateHoliday                      0
SchoolHoliday                     0
dtype: int64

In [8]:
# Exclude rows whose CompetitionOpenSinceYear is earlier than 1990.
# Rows where the year is missing compare as False, so they are kept.
mask_1990 = nonzero_sales['CompetitionOpenSinceYear'].lt(1990)
nonzero_sales = nonzero_sales.loc[~mask_1990]

In [10]:
# Fill gaps in the competition-opening month and year with each column's own
# mean, rounded down to a whole number.
mean_impute_cols = ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']
mean_impute = np.floor(nonzero_sales[mean_impute_cols].mean())
for col in mean_impute_cols:
    nonzero_sales[col] = nonzero_sales[col].fillna(mean_impute[col])

In [12]:
# Missing Promo2 start week / year values are replaced with 0.
zero_impute_cols = ['Promo2SinceWeek', 'Promo2SinceYear']
zero_impute = 0
nonzero_sales[zero_impute_cols] = nonzero_sales[zero_impute_cols].fillna(value=zero_impute)

In [14]:
# Missing PromoInterval entries receive the placeholder label 'unavailable'.
string_impute_cols = ['PromoInterval']
string_impute = 'unavailable'
nonzero_sales[string_impute_cols] = nonzero_sales[string_impute_cols].fillna(value=string_impute)

# Encoding

In [19]:
# Frequency-encode Store: each row receives the number of rows sharing its Store value.
store_freq = Counter(nonzero_sales['Store'])
nonzero_sales['Store_fenc'] = nonzero_sales['Store'].map(store_freq)

In [20]:
# One-hot encode StoreType; drop_first omits the indicator for the first category.
nonzero_sales = pd.get_dummies(
    nonzero_sales,
    columns=['StoreType'],
    drop_first=True,
)

In [21]:
# Ordinal-encode Assortment: a -> 1, b -> 2, c -> 3.
enc_assort = dict(a=1, b=2, c=3)
nonzero_sales['Assortment_orenc'] = nonzero_sales['Assortment'].map(enc_assort)

In [22]:
# Derive calendar features from the parsed Date column.
date_parts = nonzero_sales['Date'].dt
nonzero_sales['Month'] = date_parts.month
nonzero_sales['DayOfMonth'] = date_parts.day
nonzero_sales['Year'] = date_parts.year
# Recompute DayOfWeek from Date (pandas numbering: Monday=0 ... Sunday=6),
# replacing the existing column of the same name.
nonzero_sales['DayOfWeek'] = date_parts.dayofweek
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# Prefer the ISO-calendar accessor when it exists. The cast to int64 keeps the
# plain integer dtype that `weekofyear` produced (isocalendar() yields the
# nullable UInt32 dtype); it assumes Date has no missing values.
if hasattr(date_parts, 'isocalendar'):
    nonzero_sales['WeekOfYear'] = date_parts.isocalendar().week.astype('int64')
else:
    nonzero_sales['WeekOfYear'] = date_parts.weekofyear

In [23]:
# Re-check the per-column count of missing values after imputation and encoding.
nonzero_sales.isna().sum()

Store                        0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Date                         0
DayOfWeek                    0
Sales                        0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Store_fenc                   0
StoreType_b                  0
StoreType_c                  0
StoreType_d                  0
Assortment_orenc             0
Month                        0
DayOfMonth                   0
Year                         0
WeekOfYear                   0
dtype: int64

In [24]:
# Months between the competitor's opening (year, month) and the row's own
# (Year, Month): 12 * year difference + month difference.
competition_years = nonzero_sales['Year'] - nonzero_sales['CompetitionOpenSinceYear']
competition_months = nonzero_sales['Month'] - nonzero_sales['CompetitionOpenSinceMonth']
nonzero_sales['CompetitionOpenSincePeriod'] = 12 * competition_years + competition_months

In [25]:
# Weeks between the Promo2 start (year, week) and the row's own
# (Year, WeekOfYear), counting 52 weeks per year.
promo2_years = nonzero_sales['Year'] - nonzero_sales['Promo2SinceYear']
promo2_weeks = nonzero_sales['WeekOfYear'] - nonzero_sales['Promo2SinceWeek']
nonzero_sales['Promo2SincePeriod'] = 52 * promo2_years + promo2_weeks

In [26]:
# One-hot encode PromoInterval, omitting the indicator for the first category.
nonzero_sales = pd.get_dummies(nonzero_sales, columns=['PromoInterval'], drop_first=True)

In [27]:
# Binary holiday flag: the string '0' and numeric zero map to 0;
# the codes 'a', 'b' and 'c' all map to 1.
state_holiday_enc = {'0': 0, 0.0: 0}
state_holiday_enc.update(dict.fromkeys(['a', 'b', 'c'], 1))
nonzero_sales['StateHoliday_benc'] = nonzero_sales['StateHoliday'].map(state_holiday_enc)

# Drop the encoded columns

In [30]:
# Source columns to remove now that encoded / derived versions of them exist.
drop_encoded_cols = [
    'Store',
    'Assortment',
    'Date',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'StateHoliday',
]

In [31]:
# Modelling table: everything in nonzero_sales except the listed source columns.
final_df = nonzero_sales.drop(columns=drop_encoded_cols)

# Train test split

In [32]:
# Separate the Sales target from the predictors.
# y is selected with double brackets, so it is a one-column DataFrame rather than a Series.
y = final_df[['Sales']]
X = final_df.drop(columns='Sales')

In [33]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model

In [34]:
# Baseline model: 100 trees, each limited to depth 5 with at least 3 samples
# per leaf; seeded for reproducibility.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=3,
    random_state=42,
)

In [35]:
# scikit-learn expects a 1-D target for single-output regression. y_train is a
# one-column DataFrame, which makes fit() emit a DataConversionWarning, so
# flatten it first (a no-op if the target is already 1-D).
rf.fit(X_train, np.ravel(y_train))

  rf.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [36]:
def metric(preds, actuals):
    """Return the root mean squared percentage error, expressed in percent.

    Computes ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))``.

    Parameters
    ----------
    preds : array-like
        Predicted values. Any shape; flattened to 1-D before use.
    actuals : array-like
        Observed values. Must contain the same number of elements as
        ``preds`` once flattened.

    Returns
    -------
    float
        The error as a percentage. The result is ``inf`` or ``nan`` when
        ``actuals`` contains zeros, and ``nan`` for empty input.

    Raises
    ------
    AssertionError
        If the flattened inputs differ in length.
    """
    # np.asarray lets lists, tuples and pandas objects be passed as well as
    # ndarrays; the float dtype avoids integer wrap-around in the subtraction.
    preds = np.asarray(preds, dtype=float).reshape(-1)
    actuals = np.asarray(actuals, dtype=float).reshape(-1)
    assert preds.shape == actuals.shape
    relative_errors = (actuals - preds) / actuals
    # ||e||_2 / sqrt(n) is the root of the mean squared relative error.
    return 100 * np.linalg.norm(relative_errors) / np.sqrt(preds.shape[0])

In [37]:
# Score the fitted forest on the held-out rows and report the baseline error.
y_pred = rf.predict(X_test)
rf_base_metric = metric(preds=y_pred, actuals=y_test.values)

print("Metric for baseline prediction = {}".format(rf_base_metric))

Metric for baseline prediction = 49.27381478129825
