# Part 1 : Import packages and finish cleaning 

In [1]:
import pandas as pd
from datetime import timedelta
import holidays
import numpy as np
import matplotlib.pyplot as plt
import sklearn.tree as sk_tree
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
#Read cleaned files from EDA notebook
dft = pd.read_csv('trans2019_clean.csv')

#Convert event_date and event_time to datetime 
dft['event_date'] = pd.to_datetime(dft.event_date)

print(dft.shape) 
dft = dft[dft.notnull()]
dft.head()

(535232, 16)


Unnamed: 0,machine_id,site_session_id,prod_category_id,prod_name,prod_qty,prod_totprice,basket_tot,event_date,event_time,domain_name,month,day,hour,minute,second,prod_cat_name
0,265161013,0,43,Unknown,1.0,62.394229,534.3,2019-01-01,16:58:19,aa.com,1,1,16,58,19,AIR TRAVEL (TRAVEL)
1,273805739,0,43,Unknown,1.0,62.394229,150.9,2019-01-01,20:21:45,aa.com,1,1,20,21,45,AIR TRAVEL (TRAVEL)
2,278548448,9008607993732463286,43,Unknown,1.0,62.394229,360.82,2019-01-01,5:57:19,aa.com,1,1,5,57,19,AIR TRAVEL (TRAVEL)
3,282766190,0,43,Unknown,1.0,62.394229,1257.13,2019-01-01,19:46:59,aa.com,1,1,19,46,59,AIR TRAVEL (TRAVEL)
4,283151287,0,43,Unknown,1.0,62.394229,358.6,2019-01-01,18:30:51,aa.com,1,1,18,30,51,AIR TRAVEL (TRAVEL)


In [3]:
dfd = pd.read_csv('demo2019_clean.csv')
print(dfd.shape)
dfd = dfd[dfd.notnull()]
dfd.head()

(139069, 12)


Unnamed: 0,machine_id,hoh_most_education,census_region,household_size,hoh_oldest_age,household_income,children,racial_background,country_of_origin,zip_code,city_name,state
0,89948345,99,3,1,10,13,0,2,0,70503,Lafayette,LA
1,93919854,99,3,4,3,11,1,2,0,74726,Bokchito,OK
2,94485706,5,99,4,11,17,1,2,1,99999,,
3,111304835,99,1,2,5,11,1,5,1,99999,,
4,117807892,99,3,2,6,11,1,5,0,70127,New Orleans,LA


In [4]:
dfs = pd.read_csv('sess2019_clean.csv')

#Convert event_date and event_time to datetime 
dfs['event_date'] = pd.to_datetime(dfs.event_date)

#Drop machine id 
dfs.drop(columns = ['machine_id'], axis = 1, inplace=True)
print(dfs.shape)
dfs = dfs[dfs.notnull()]
dfs.head()

(224480, 8)


Unnamed: 0.1,Unnamed: 0,site_session_id,ref_domain__name,pages_viewed,duration,event_date,event_time,log_dur
0,0,1000017909239914348,Unknown,,,NaT,,<ufunc 'log'>
1,1,1000406262467311679,Unknown,,,NaT,,<ufunc 'log'>
2,2,1000578923578008850,Unknown,,,NaT,,<ufunc 'log'>
3,3,1001446498515399418,Unknown,,,NaT,,<ufunc 'log'>
4,4,1001625075236765557,Unknown,,,NaT,,<ufunc 'log'>


In [5]:
#Join dfs 
df = dft.merge(dfd, how = "left", on=["machine_id"])

#Keep only non-zero session ids 
df = df[df['site_session_id'] != 0]
df = df[df.notnull()]
print(df.shape)
df.head()

(425973, 27)


Unnamed: 0,machine_id,site_session_id,prod_category_id,prod_name,prod_qty,prod_totprice,basket_tot,event_date,event_time,domain_name,...,census_region,household_size,hoh_oldest_age,household_income,children,racial_background,country_of_origin,zip_code,city_name,state
2,278548448,9008607993732463286,43,Unknown,1.0,62.394229,360.82,2019-01-01,5:57:19,aa.com,...,3,5,5,14,0,1,0,19701,Bear,DE
5,262983154,7352146647259542770,43,Unknown,1.0,62.394229,378.2,2019-01-02,19:59:53,aa.com,...,3,3,9,11,1,1,1,33178,Miami,FL
6,275184501,8508167886963145012,43,Unknown,1.0,62.394229,298.79,2019-01-02,22:04:57,aa.com,...,3,2,8,12,0,1,1,33126,Miami,FL
8,262983154,3245432324467770346,43,Unknown,1.0,62.394229,401.36,2019-01-03,2:49:09,aa.com,...,3,3,9,11,1,1,1,33178,Miami,FL
9,269184660,1753482634031028495,43,Unknown,1.0,62.394229,1313.0,2019-01-03,17:46:10,aa.com,...,3,4,11,11,1,1,0,76209,Denton,TX


In [6]:
#Prep categorical cols for dummy variables 
#df['age'] = df['hoh_oldest_age'].map(lambda x: '65 and over' if x==11 else('50 to 59' if x>=7 else 'Under 50'))
#df['income'] = df['household_income'].map(lambda x: '$200,000+' if x==18 else('$100,000 to $199,000' if x>=16 else 'Under $100,00'))

# Part 2: Feature Engineering

In [7]:
#Get average prices, basket total according to machine ID 
prod_totprice = dict(df[['machine_id', 'prod_totprice']].groupby(by="machine_id").mean('prod_totprice').reset_index()[['machine_id', 'prod_totprice']].values)
df['avg_prod_price'] = df.machine_id.map(prod_totprice)

basket_tot = dict(df[['machine_id', 'basket_tot']].groupby(by="machine_id").mean('basket_tot').reset_index()[['machine_id', 'basket_tot']].values)
df['avg_ basket_tot'] = df.machine_id.map(basket_tot)

# Encode categorical variables as integers.
df['day_name'] = df['event_date'].dt.day_name()
df['day_name'] = pd.factorize(df.day_name)[0]

df['state_num'] = pd.factorize(df.state)[0]

df['holiday'] = df['event_date'].map(lambda x: x in holidays.US())
df['holiday'] = df['holiday'].map(lambda x: 0 if x==False else 1)
df.shape #Added 4 columns to the df 

(425973, 32)

In [8]:
df.domain_name.value_counts().head(15)

amazon.com              160925
ebay.com                 44481
walmart.com              28831
dominos.com              21781
kohls.com                 9525
papajohns.com             8740
samsclub.com              8491
bathandbodyworks.com      5456
etsy.com                  5141
target.com                5139
jetblue.com               3339
bestbuy.com               2986
vistaprint.com            2966
wish.com                  2919
chewy.com                 2894
Name: domain_name, dtype: int64

In [9]:
#Subset data by domain 
amzn = df[df['domain_name'] == 'amazon.com'] #E-commerce retail
print('amazon.com shape:' + str(amzn.shape))
kohl = df[df['domain_name'] == 'kohls.com'] #Traditional retail 
print('kohls.com shape:' + str(kohl.shape))
sam = df[df['domain_name'] == 'samsclub.com'] #Whoelsale retail 
print('samsclub.com shape:' + str(sam.shape))
dom = df[df['domain_name'] == 'dominos.com'] #Food 
print('dominos.com shape:' + str(dom.shape))
jet = df[df['domain_name'] == 'jetblue.com'] #Travel
print('jetblue.com shape:' + str(jet.shape))


amazon.com shape:(160925, 32)
kohls.com shape:(9525, 32)
samsclub.com shape:(8491, 32)
dominos.com shape:(21781, 32)
jetblue.com shape:(3339, 32)


# Part 3: Building the Random Forest Regression

In [10]:
print(df.columns)
df.head()

Index(['machine_id', 'site_session_id', 'prod_category_id', 'prod_name',
       'prod_qty', 'prod_totprice', 'basket_tot', 'event_date', 'event_time',
       'domain_name', 'month', 'day', 'hour', 'minute', 'second',
       'prod_cat_name', 'hoh_most_education', 'census_region',
       'household_size', 'hoh_oldest_age', 'household_income', 'children',
       'racial_background', 'country_of_origin', 'zip_code', 'city_name',
       'state', 'avg_prod_price', 'avg_ basket_tot', 'day_name', 'state_num',
       'holiday'],
      dtype='object')


Unnamed: 0,machine_id,site_session_id,prod_category_id,prod_name,prod_qty,prod_totprice,basket_tot,event_date,event_time,domain_name,...,racial_background,country_of_origin,zip_code,city_name,state,avg_prod_price,avg_ basket_tot,day_name,state_num,holiday
2,278548448,9008607993732463286,43,Unknown,1.0,62.394229,360.82,2019-01-01,5:57:19,aa.com,...,1,0,19701,Bear,DE,29.811426,83.906667,0,0,1
5,262983154,7352146647259542770,43,Unknown,1.0,62.394229,378.2,2019-01-02,19:59:53,aa.com,...,1,1,33178,Miami,FL,62.394229,331.39,1,1,0
6,275184501,8508167886963145012,43,Unknown,1.0,62.394229,298.79,2019-01-02,22:04:57,aa.com,...,1,1,33126,Miami,FL,35.003241,173.424286,1,1,0
8,262983154,3245432324467770346,43,Unknown,1.0,62.394229,401.36,2019-01-03,2:49:09,aa.com,...,1,1,33178,Miami,FL,62.394229,331.39,2,1,0
9,269184660,1753482634031028495,43,Unknown,1.0,62.394229,1313.0,2019-01-03,17:46:10,aa.com,...,1,0,76209,Denton,TX,32.601816,141.865,2,2,0


In [11]:
# Define X and y.
#Remove nonsense cols 
feature_cols = ['prod_category_id','prod_qty','prod_totprice','month','day','hour','minute',\
                'second','hoh_most_education','census_region','household_size', 'hoh_oldest_age', \
                'household_income','children','racial_background','country_of_origin', 'day_name',\
                'holiday','state_num','avg_prod_price']
#Create X and y for every subset (5)
X_amzn = amzn[feature_cols]
y_amzn = amzn.basket_tot
X_kohl = kohl[feature_cols]
y_kohl = kohl.basket_tot
X_sam = sam[feature_cols]
y_sam = sam.basket_tot
X_dom = dom[feature_cols]
y_dom = dom.basket_tot
X_jet = jet[feature_cols]
y_jet = jet.basket_tot


In [12]:
#log logs - random forrest 

In [13]:
#Write function to evaluate models
def evaluate(model,X,y):
    print('Model Performance:')
    scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
    print('Average RMSE is: '+ str(np.mean(np.sqrt(-scores))))
    
    predictions = model.predict(X)
    errors = abs(predictions - y)
    mape = 100 * np.mean(errors / y)
    accuracy = 100 - mape
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    
    predictions = model.predict(X)
    mape = mean_absolute_percentage_error(y, predictions)*100
    print('MAPE: '+ str(round(mape,2))+"%")
    print('Accuracy: '+ str(round(100-mape,2))+'%')

## 3.1 Split features (X) and regression target (y) into test/train components

### amazon.com

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_amzn, y_amzn, random_state = 13)

## 3.2 Fit & Evaluate Baseline Model

In [15]:
#Default parameters, except oob score
rfreg = RandomForestRegressor(oob_score=True, random_state=13)
rfreg.fit(X_train, y_train)

#Print default parameters
print('Parameters currently in use:\n')
print(rfreg.get_params())

# Compute feature importances.
pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort_values(by='importance')

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': True, 'random_state': 13, 'verbose': 0, 'warm_start': False}


Unnamed: 0,feature,importance
17,holiday,0.002172
15,country_of_origin,0.004162
13,children,0.005447
14,racial_background,0.011869
9,census_region,0.014509
8,hoh_most_education,0.016468
1,prod_qty,0.017518
10,household_size,0.018582
12,household_income,0.023299
11,hoh_oldest_age,0.0282


In [16]:
# Compute the out-of-bag R-squared score.
#oob score/error = the proportion of oob samples that were incorrectly classified  
print('Out-of-bag score: '+ str(rfreg.oob_score_))

evaluate(rfreg, X_test, y_test)

Out-of-bag score: 0.5643873263449855
Model Performance:
Average RMSE is: 138.68324404794015
Average Error: 45.6618 degrees.
MAPE: 246.01%
Accuracy: -146.01%


## 3.3 Hyperparamter Tuning 

In [18]:
#Random Hyperparameter Grid
# Number of trees in random forest
n_estimators = [400, 500, 550, 600, 650, 700, 750, 800]
# Number of features to consider at every split
max_features = [None, 'log2','auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 21)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [10, 25, 50, 100]
# Minimum number of samples required at each leaf node
#min_samples_leaf = [10, 25, 50, 100]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}
print(random_grid)

{'n_estimators': [400, 500, 550, 600, 650, 700, 750, 800], 'max_features': [None, 'log2', 'auto', 'sqrt'], 'max_depth': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, None], 'min_samples_split': [10, 25, 50, 100]}


## 3.4 Fit New Parameters 

In [19]:
#Random search training 
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfreg = RandomForestRegressor(random_state=13)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rfreg_random = RandomizedSearchCV(estimator = rfreg, param_distributions = random_grid, n_iter = 100,\
                                  verbose=2, cv = 3, random_state=13, n_jobs = -1)
# Fit the random search model
rfreg_random.fit(X_train, y_train)

#Best parameters
rfreg_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits




{'n_estimators': 550,
 'min_samples_split': 10,
 'max_features': 'log2',
 'max_depth': 85}

## 3.5 Evaluate predictions (test vs. train)

In [20]:
#train_eval(rfreg_random.best_estimator_,X_train, y_train)
evaluate(rfreg_random.best_estimator_,X_test, y_test)

Model Performance:
Average RMSE is: 132.17738263558266
Average Error: 44.5117 degrees.
MAPE: 249.26%
Accuracy: -149.26%


In [21]:
predictions = rfreg_random.best_estimator_.predict(X_test)
np.mean(predictions)

88.2851820667269

# Results from amazon.com model: 
- Average basket total is predicted to be: $85.8.

## 3.6 Repeat steps 3.1 to 3.5 for remaining subsets

### dominos.com

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_dom, y_dom, random_state = 13)

#Default parameters, except oob score
rfreg = RandomForestRegressor(oob_score=True, random_state=13)
rfreg.fit(X_train, y_train)

#Print default parameters
print('Parameters currently in use:\n')
print(rfreg.get_params())

# Compute feature importances.
pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort_values(by='importance')

In [None]:
# Compute the out-of-bag R-squared score.
print('Out-of-bag score: '+ str(rfreg.oob_score_))

evaluate(rfreg, X_test, y_test)

In [None]:
#Random Hyperparameter Grid
n_estimators = [100, 200, 300, 400, 500, 550, 600, 650, 700, 750, 800, 850]
max_features = [None, 'log2','auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(1, 120, num = 30)]
max_depth.append(None)
#min_samples_split = [10, 15, 20]
#min_samples_leaf = [5, 10, 15]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
print(random_grid)

In [None]:
#Random search training 
rfreg = RandomForestRegressor(oob_score=True, random_state=13)
rfreg_random = RandomizedSearchCV(estimator = rfreg, param_distributions = random_grid, n_iter = 250,\
                                  verbose=2, cv = 5, random_state=13, n_jobs = -1)
# Fit the random search model
rfreg_random.fit(X_train, y_train)

#Best parameters
rfreg_random.best_params_

In [None]:
#evaluate(rfreg_random.best_estimator_,X_train, y_train)
evaluate(rfreg_random.best_estimator_,X_test, y_test)

In [None]:
predictions = rfreg_random.best_estimator_.predict(X_test)
np.mean(predictions)

# Results from dominos.com model: 
- Average basket total is predicted to be: $30.3

### jetblue.com

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_jet, y_jet, random_state = 13)

#Default parameters, except oob score
rfreg = RandomForestRegressor(oob_score=True, random_state=13)
rfreg.fit(X_train, y_train)

#Print default parameters
print('Parameters currently in use:\n')
print(rfreg.get_params())

# Compute feature importances.
pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort_values(by='importance')

In [None]:
# Compute the out-of-bag R-squared score.
print('Out-of-bag score: '+ str(rfreg.oob_score_))

# Find the average RMSE.
#evaluate(rfreg_random.best_estimator_,X_train, y_train)
evaluate(rfreg,X_test, y_test)

In [None]:
#Random Hyperparameter Grid
n_estimators = [100, 200, 300, 400, 500, 550, 600, 650, 700, 750, 800, 850]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(1, 100, num = 20)]
max_depth.append(None)
min_samples_split = [100,200,300, 400, 500, 1000, 1500]
min_samples_leaf = [50,100,200, 300,400]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

In [None]:
#Random search training 
rfreg = RandomForestRegressor(oob_score=True, random_state=13)
rfreg_random = RandomizedSearchCV(estimator = rfreg, param_distributions = random_grid, n_iter = 200,\
                                  verbose=2, cv = 5, random_state=13, n_jobs = -1)
# Fit the random search model
rfreg_random.fit(X_train, y_train)

#Best parameters
rfreg_random.best_params_

In [None]:
evaluate(rfreg_random.best_estimator_,X_test, y_test)

In [None]:
predictions = rfreg_random.best_estimator_.predict(X_test)
np.mean(predictions)

# Results from jetblue.com model: 
- Average basket total is predicted to be: $664.7

In [None]:
len(y_test)

In [None]:
predictions = rfreg_random.best_estimator_.predict(X_test)
errors = abs(predictions - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Model Performance')
print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))

#Sources 
- https://medium.com/usf-msds/intuitive-interpretation-of-random-forest-2238687cae45
- https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
-  https://medium.com/@ofirdi/mice-is-nice-but-why-should-you-care-e66698f245a3
