# Predicting Brand Exposure

## 02_Build Models
* Load in data from previous steps
* Construct various predictive models
* Evaluate and select the best model for predictions

## Import and Load Data

In [1]:
import pandas as pd

# load the processed dataset produced by the previous step of the project
df = pd.read_excel(io='../data_archives/df_processed.xlsx')

In [2]:
# preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0,index,name,lat,long,google_id,venue_type,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,WeekTotals,mapped_venue_type
0,0,Ellis PC,42.348034,-71.041187,ChIJq6qaXoJ644kRoNxCzt2rYyw,"['accounting', 'finance', 'point_of_interest',...",0,0,0,0,0,0,0,0,finance
1,1,"WithumSmith+Brown, PC",42.349607,-71.042722,ChIJydm7oHhw44kRYeJauZOvntA,"['accounting', 'finance', 'point_of_interest',...",0,0,0,0,0,0,0,0,finance
2,2,Cantor Stefanie D,42.349819,-71.042828,ChIJ0VvNp3hw44kRxv4C86Zgsi8,"['lawyer', 'accounting', 'finance', 'point_of_...",0,0,0,0,0,0,0,0,professional_services
3,3,Goodwin Procter Boston,42.352191,-71.043785,ChIJzyWROoRw44kR__T4RIymGyw,"['accounting', 'lawyer', 'finance', 'point_of_...",690,947,788,1020,630,0,0,4075,finance
4,4,PwC,42.351162,-71.045188,ChIJ30VlLIJw44kRk4x9eEYUtbg,"['accounting', 'finance', 'point_of_interest',...",525,558,575,644,755,0,0,3057,finance


## Prepare Train, Test, and Validation Data

In [221]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.utils import to_categorical
from pandas import get_dummies

# Build the feature matrix X and target vector y.
# numpy is imported here because this cell calls np.* and must run on a fresh
# kernel without relying on an import that only appears in a later cell.

# keep only the rows with a positive weekly total; the mask is computed once so
# features and labels are guaranteed to come from the same rows
has_counts = df['WeekTotals'] > 0

#split data into features and labels
X = df.loc[has_counts, ['lat', 'long', 'mapped_venue_type']]
y = df.loc[has_counts, 'WeekTotals']

#encode categorical variable (one indicator column per venue type)
X = pd.get_dummies(X, columns=["mapped_venue_type"])

# force a plain float matrix: depending on the pandas version the indicator
# columns may be bool/uint8, and mixing them with the float coordinates could
# otherwise produce an object-dtype array
X = np.asarray(X, dtype=float)
y = np.array(y)

# sanity check: one label per feature row
assert len(X) == len(y)

# NOTE: a disabled MinMaxScaler snippet for lat/long used to sit here as a bare
# string (which was echoed as cell output). It was removed because it indexed X
# by column name after X had already been converted to an ndarray, so it could
# not run as written; tree-based models do not need feature scaling.

"#scale lat long\nScaler = MinMaxScaler()\nScaler.fit(X[['lat','long']])\nX[['lat','long']] = Scaler.transform(X[['lat','long']])"

In [133]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

## Build Random Forest

In [134]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [135]:
def evaluate(model, test_features, test_labels):
    """Score a fitted regressor on held-out data and print a short report.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(features)``.
    test_features : array-like
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values. Each absolute error is divided by its label, so
        the labels must be non-zero for the percentage error to be finite.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean absolute percentage error
        (in percent). The value can be negative when the model is far off.
    """
    # coerce to float arrays so plain lists work as well as ndarrays/Series
    predictions = np.asarray(model.predict(test_features), dtype=float)
    labels = np.asarray(test_labels, dtype=float)

    errors = np.abs(predictions - labels)
    mape = 100 * np.mean(errors / labels)
    accuracy = 100 - mape

    print('Model Performance')
    # the error is in the same units as the labels (no unit is printed)
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [215]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
# Exhaustive hyper-parameter search for a random forest regressor.
# The grid below has 6 * 7 * 5 * 5 = 1050 combinations; with cv=3 that is
# 3150 individual fits.
#define parameters to search
param_grid = {
    'n_estimators': [10,20,30,40,50,100],
    'max_features':[9,10,11,12,13,14,15],
    'max_depth':[40,45,50,75,100],
    'min_samples_split':[40,45,50,55,60]
}

#create regressor object
# The split criterion is left at its default (mean squared error). Spelling it
# out as criterion='mse' is rejected by newer scikit-learn releases, while the
# default means the same thing on both old and new versions.
regr = RandomForestRegressor(min_samples_leaf=2,random_state=2222)
#create grid search object (3-fold CV, all cores, progress logging on)
grid_search = GridSearchCV(estimator = regr, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 1)
#fit grid search object
grid_search.fit(X_train, y_train)
print("========================================")
print("Finished training")
print("========================================")
print("Parameters of best estimator...")
# best_estimator_ is the winning configuration refit by GridSearchCV
best_grid = grid_search.best_estimator_
print(grid_search.best_params_)
print("========================================")
print("Best estimator accuracy...")
# score the winner on the held-out test split (not used during the search)
grid_accuracy = evaluate(best_grid, X_test, y_test)
print(grid_accuracy)

Fitting 3 folds for each of 1050 candidates, totalling 3150 fits


[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1472 tasks      | elapsed:   13.1s


Finished training
Parameters of best estimator...
{'max_depth': 40, 'max_features': 12, 'min_samples_split': 40, 'n_estimators': 40}
Best estimator accuracy...
Model Performance
Average Error: 927.5984 degrees.
Accuracy = 64.92%.
64.9157590076


[Parallel(n_jobs=-1)]: Done 3150 out of 3150 | elapsed:   28.5s finished


In [226]:
# Compare the grid-searched forest against an untuned baseline on the test
# split, and show one example prediction from each.

# index of the example row (taken from the full X / y arrays) to predict on
i = 9

# NOTE(review): `base_regr` was referenced here without being defined anywhere
# in the visible notebook, so this cell failed with NameError on a fresh
# kernel. The baseline below is an assumption -- an untuned forest with 10
# trees (the historical scikit-learn default) and the same seed as the
# grid-search regressor. Confirm against the originally intended baseline.
base_regr = RandomForestRegressor(n_estimators=10, random_state=2222)
base_regr.fit(X_train, y_train)


def _report_model(title, model, sample_idx):
    """Print the evaluation report and one example prediction for ``model``.

    Reads the notebook-level ``X``, ``y``, ``X_test`` and ``y_test`` arrays and
    the ``evaluate`` helper. ``sample_idx`` selects the example row from the
    full ``X`` / ``y`` arrays.
    """
    heavy_rule = "==============================================================================="
    print(heavy_rule)
    print(heavy_rule)
    print(title)
    print(heavy_rule)
    print(heavy_rule)
    print(evaluate(model, X_test, y_test))
    print("---------------------Example Prediction on Existing Data-----------------------")
    print("-------------------------------------------------------------------------------")
    print("features:", X[sample_idx])
    print("count_actual:", y[sample_idx])
    # predict expects a 2-D array, so the single row is reshaped to (1, n_features)
    print("count_prediction:", model.predict(X[sample_idx].reshape(1, -1)))


_report_model("GridSearch Model", best_grid, i)
_report_model("Base Model", base_regr, i)

GridSearch Model
Model Performance
Average Error: 927.5984 degrees.
Accuracy = 64.92%.
64.9157590076
---------------------Example Prediction on Existing Data-----------------------
-------------------------------------------------------------------------------
features: [ 42.3515452 -71.0498813   0.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.          0.
   0.       ]
count_actual: 2454
count_prediction: [ 3385.74830623]
Base Model
Model Performance
Average Error: 1028.6302 degrees.
Accuracy = 61.88%.
61.8841530367
---------------------Example Prediction on Existing Data-----------------------
-------------------------------------------------------------------------------
features: [ 42.3515452 -71.0498813   0.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.          0.
   0.       ]
count_actual: 2454
count_prediction: [ 3831.8]
