In [1]:
import pandas as pd
import numpy as np

from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load input data

In [2]:
# Read the two raw CSV inputs.
train = pd.read_csv("../data/train.csv")
store = pd.read_csv("../data/store.csv")

# Left join with `store` on the left: every store row is kept, and a store with
# no matching row in `train` ends up with NaN in the train-side columns.
merged_train_store = store.merge(train, how='left', on='Store')

  interactivity=interactivity, compiler=compiler, result=result)


# Drop rows and columns

In [3]:
# Remove rows whose Sales value is zero or missing
zero_sales_mask = merged_train_store['Sales'].isnull() | merged_train_store['Sales'].eq(0)
nonzero_sales = merged_train_store[~zero_sales_mask]

In [4]:
# Drop the Customers and Open columns
nonzero_sales = nonzero_sales.drop(columns=['Customers', 'Open'])

In [5]:
# Parse the Date column into pandas datetime values
nonzero_sales['Date'] = pd.to_datetime(nonzero_sales['Date'])

In [6]:
# Drop every row that has a NULL in at least one of the listed columns
nonzero_sales = nonzero_sales.dropna(
    subset=['CompetitionDistance', 'StateHoliday', 'SchoolHoliday', 'Promo'],
    axis='index',
)

In [7]:
# Outliers

# Filter out rows whose CompetitionOpenSinceYear is earlier than 1990.
# A missing year compares as False, so those rows are kept.
mask_1990 = nonzero_sales['CompetitionOpenSinceYear'].lt(1990)
nonzero_sales = nonzero_sales.loc[~mask_1990]

# Helper functions: missing value imputation and categorical encoding

In [8]:
def missing_report(df, percentage=False):
    """Summarise the missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    percentage : bool, default False
        When falsy, report the number of null entries per column. When truthy,
        report the share of null rows per column as a percentage rounded to
        two decimals.

    Returns
    -------
    pandas.Series
        One value per column of ``df``, indexed by column name.
    """
    null_counts = df.isnull().sum()
    # Truthiness test instead of `== False`, so None behaves like False.
    if not percentage:
        return null_counts
    return (null_counts / len(df) * 100).round(2)

############################
# Missing value imputation #
############################
def const_imputation(df, columns, values):
    """Fill the missing entries of ``columns`` with ``values``.

    The selected columns are assigned back onto ``df`` itself, so the caller's
    frame is modified; that same frame is returned.
    """
    filled = df[columns].fillna(value=values, axis=0)
    df[columns] = filled
    return df

def mean_imputation(df, columns, enforce_int=False):
    """Fill missing entries of ``columns`` with each column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified, a filled copy is returned.
    columns : label or list of labels
        Column(s) whose missing values are replaced.
    enforce_int : bool, default False
        When true, the means are floored with ``np.floor`` before use.

    Returns
    -------
    (pandas.DataFrame, scalar or pandas.Series)
        The filled frame and the imputation value(s): a Series indexed by
        column name for a list of columns, a scalar for a single label.
    """
    mean_imp_values = df[columns].mean()
    if enforce_int:
        mean_imp_values = np.floor(mean_imp_values)

    if isinstance(mean_imp_values, pd.Series):
        # Series keyed by column name: fillna only touches those columns.
        fill_values = mean_imp_values
    else:
        # Single column label -> scalar mean. Key it by the column so that
        # fillna does not spread the value over every column of the frame.
        fill_values = {columns: mean_imp_values}
    return df.fillna(fill_values, axis=0), mean_imp_values

##############################
# Categorical value encoding #
##############################
def freq_encoding(df, column):
    """Add ``<column>_fenc`` holding how often each row's value occurs.

    The new column is written onto ``df`` itself. Returns the frame together
    with the ``Counter`` of value occurrences used for the mapping.
    """
    occurrences = Counter(df[column])
    df[column + '_fenc'] = df[column].map(occurrences)
    return df, occurrences

def mean_encoding(df, column, target='Sales'):
    """Add ``<column>_menc`` holding the mean of ``target`` per ``column`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` and ``target``; the new column is written
        onto this frame.
    column : label
        Categorical column to encode.
    target : label, default 'Sales'
        Column that is averaged within each group.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The frame and the per-group means, indexed by the values of ``column``.
    """
    # Select the target before aggregating. Averaging every column first is
    # wasteful and raises TypeError on pandas >= 2.0 when the frame contains
    # non-numeric columns.
    menc_values = df.groupby(by=column)[target].mean()
    df[column + '_menc'] = df[column].map(menc_values)
    return df, menc_values

def ordinal_encoding(df, column, ordinal_dict):
    """Add ``<column>_orenc`` by looking each value up in ``ordinal_dict``.

    The new column is written onto ``df`` itself, which is also returned.
    Values that are not keys of ``ordinal_dict`` come out as NaN.
    """
    encoded = df[column].map(ordinal_dict)
    df[column + '_orenc'] = encoded
    return df

In [9]:
# Percentage of missing values per column, before any imputation
missing_report(nonzero_sales, percentage=True)

Store                         0.00
StoreType                     0.00
Assortment                    0.00
CompetitionDistance           0.00
CompetitionOpenSinceMonth    31.67
CompetitionOpenSinceYear     31.67
Promo2                        0.00
Promo2SinceWeek              49.33
Promo2SinceYear              49.33
PromoInterval                49.33
Date                          0.00
DayOfWeek                     2.98
Sales                         0.00
Promo                         0.00
StateHoliday                  0.00
SchoolHoliday                 0.00
dtype: float64

In [10]:
# Impute CompetitionOpenSinceMonth / CompetitionOpenSinceYear with the column means.
# enforce_int keeps its default (False), so the imputed values may be fractional.
nonzero_sales, mean_imp_values = mean_imputation(
    nonzero_sales,
    columns=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'],
)

In [11]:
# Impute Promo2SinceWeek / Promo2SinceYear with the constant 0
nonzero_sales = const_imputation(
    nonzero_sales,
    columns=['Promo2SinceWeek', 'Promo2SinceYear'],
    values=0,
)

In [None]:
# Impute PromoInterval with an explicit placeholder category
nonzero_sales = const_imputation(
    nonzero_sales,
    columns=['PromoInterval'],
    values='unavailable',
)

# Encoding

In [12]:
# Store
# Adds Store_fenc (number of rows per store) and Store_menc (mean Sales per store).
# NOTE(review): Store_menc is computed from Sales over the whole frame, before the
# train/test split further down, so held-out targets leak into a feature —
# consider fitting this encoding on the training rows only.
# NOTE(review): `fenq_values` looks like a typo for `fenc_values`; left unchanged
# because other cells may refer to it.
nonzero_sales, fenq_values = freq_encoding(nonzero_sales, 'Store')
nonzero_sales, menc_values = mean_encoding(nonzero_sales, 'Store')

In [20]:
# StoreType: one-hot encode; the first level is dropped and acts as the baseline
nonzero_sales = pd.get_dummies(
    nonzero_sales,
    drop_first=True,
    columns=['StoreType'],
)

In [15]:
# Assortment: map the letter codes onto an increasing integer scale
nonzero_sales = ordinal_encoding(
    nonzero_sales,
    column='Assortment',
    ordinal_dict={'a': 1, 'b': 2, 'c': 3},
)

In [16]:
# Re-check the percentage of missing values per column after imputation and encoding
missing_report(nonzero_sales, percentage=True)

Store                        0.00
StoreType                    0.00
Assortment                   0.00
CompetitionDistance          0.00
CompetitionOpenSinceMonth    0.00
CompetitionOpenSinceYear     0.00
Promo2                       0.00
Promo2SinceWeek              0.00
Promo2SinceYear              0.00
PromoInterval                0.00
Date                         0.00
DayOfWeek                    2.98
Sales                        0.00
Promo                        0.00
StateHoliday                 0.00
SchoolHoliday                0.00
Store_fenc                   0.00
Store_menc                   0.00
Assortment_orenc             0.00
dtype: float64

In [22]:
# Date: derive calendar features from the parsed Date column
nonzero_sales['Month'] = nonzero_sales['Date'].dt.month
nonzero_sales['DayOfMonth'] = nonzero_sales['Date'].dt.day
nonzero_sales['Year'] = nonzero_sales['Date'].dt.year
# Overwrites the DayOfWeek column that came with the data (pandas numbers
# Monday as 0), which also removes the missing values it had.
nonzero_sales['DayOfWeek'] = nonzero_sales['Date'].dt.dayofweek
# Series.dt.weekofyear is deprecated since pandas 1.1 and removed in 2.0.
# Use the ISO calendar week when available and fall back on older pandas.
if hasattr(nonzero_sales['Date'].dt, 'isocalendar'):
    # isocalendar() yields a nullable UInt32 column; cast to a plain integer
    # dtype so downstream arithmetic behaves as it did with weekofyear.
    nonzero_sales['WeekOfYear'] = nonzero_sales['Date'].dt.isocalendar().week.astype('int64')
else:
    nonzero_sales['WeekOfYear'] = nonzero_sales['Date'].dt.weekofyear

In [23]:
# Count of missing values per column after the date features were added
nonzero_sales.isnull().sum()

Store                        0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Date                         0
DayOfWeek                    0
Sales                        0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Store_fenc                   0
StoreType_b                  0
StoreType_c                  0
StoreType_d                  0
Assortment_orenc             0
Month                        0
DayOfMonth                   0
Year                         0
WeekOfYear                   0
dtype: int64

In [24]:
# CompetitionOpenSince[Month / Year]
# Months between the competitor's opening (year, month) and the row's (Year, Month).
nonzero_sales['CompetitionOpenSincePeriod'] = (
    nonzero_sales['Year'].sub(nonzero_sales['CompetitionOpenSinceYear']).mul(12)
    + nonzero_sales['Month'].sub(nonzero_sales['CompetitionOpenSinceMonth'])
)

In [25]:
# Promo2Since[Week / Year]
# Weeks between the Promo2 start (year, week) and the row's (Year, WeekOfYear),
# counting 52 weeks per year. Rows whose Promo2Since* values were imputed with 0
# end up with a very large value here.
nonzero_sales['Promo2SincePeriod'] = (
    nonzero_sales['Year'].sub(nonzero_sales['Promo2SinceYear']).mul(52)
    + nonzero_sales['WeekOfYear'].sub(nonzero_sales['Promo2SinceWeek'])
)

In [26]:
# PromoInterval: one-hot encode; the first level is dropped and acts as the baseline
nonzero_sales = pd.get_dummies(nonzero_sales, drop_first=True, columns=['PromoInterval'])

In [27]:
# StateHoliday: collapse to a binary flag — 0 for '0' / 0.0, 1 for 'a', 'b' or 'c'
state_holiday_enc = {
    '0': 0,
    0.0: 0,
    'a': 1,
    'b': 1,
    'c': 1,
}
nonzero_sales['StateHoliday_benc'] = nonzero_sales['StateHoliday'].map(state_holiday_enc)

# Drop the encoded columns

In [30]:
# Raw columns that have been replaced by encoded / derived features
drop_encoded_cols = [
    'Store',
    'Assortment',
    'Date',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'StateHoliday',
]

In [31]:
# Modelling frame: everything except the raw columns listed in drop_encoded_cols
final_df = nonzero_sales.drop(columns=drop_encoded_cols)

# Train test split

In [32]:
# Features are every column except Sales; the target is kept as a one-column DataFrame
X = final_df.drop(columns='Sales')
y = final_df.loc[:, ['Sales']]

In [33]:
# Random 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Model

In [34]:
# Baseline model: a shallow random forest with a fixed seed
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=3,
    random_state=42,
)

In [35]:
# y_train is a one-column DataFrame; scikit-learn warns (DataConversionWarning)
# when given a column vector where a 1-D target is expected, so flatten it.
rf.fit(X_train, np.ravel(y_train))

  rf.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [36]:
def metric(preds, actuals):
    """Root-mean-square percentage error between ``preds`` and ``actuals``, in percent.

    Both inputs are flattened with ``reshape(-1)`` and must then have the same
    shape. The result is ``100 * ||(actuals - preds) / actuals||_2 / sqrt(n)``.
    Each error is divided by the matching element of ``actuals``, so a zero
    there means a division by zero.
    """
    flat_preds = preds.reshape(-1)
    flat_actuals = actuals.reshape(-1)
    assert flat_preds.shape == flat_actuals.shape

    relative_error = (flat_actuals - flat_preds) / flat_actuals
    n_samples = flat_preds.shape[0]
    return 100 * np.linalg.norm(relative_error) / np.sqrt(n_samples)

In [37]:
# Score the baseline forest on the held-out split
y_pred = rf.predict(X_test)
rf_base_metric = metric(preds=y_pred, actuals=y_test.values)

baseline_message = "Metric for baseline prediction = {}"
print(baseline_message.format(rf_base_metric))

Metric for baseline prediction = 49.27381478129825
