# Predicting Brand Exposure

## 02_Build Models
* Load in data from previous steps
* Construct various predictive models
* Evaluate and select best model for predictions

## Import and Load Data

In [3]:
import pandas as pd

# Load the processed venue table produced by the previous step of the project.
df = pd.read_excel(io='../data_archives/df_processed.xlsx')

In [4]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,index,name,lat,long,google_id,venue_type,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,WeekTotals,mapped_venue_type
0,0,Ellis PC,42.348034,-71.041187,ChIJq6qaXoJ644kRoNxCzt2rYyw,"['accounting', 'finance', 'point_of_interest',...",0,0,0,0,0,0,0,0,finance
1,1,"WithumSmith+Brown, PC",42.349607,-71.042722,ChIJydm7oHhw44kRYeJauZOvntA,"['accounting', 'finance', 'point_of_interest',...",0,0,0,0,0,0,0,0,finance
2,2,Cantor Stefanie D,42.349819,-71.042828,ChIJ0VvNp3hw44kRxv4C86Zgsi8,"['lawyer', 'accounting', 'finance', 'point_of_...",0,0,0,0,0,0,0,0,professional_services
3,3,Goodwin Procter Boston,42.352191,-71.043785,ChIJzyWROoRw44kR__T4RIymGyw,"['accounting', 'lawyer', 'finance', 'point_of_...",690,947,788,1020,630,0,0,4075,finance
4,4,PwC,42.351162,-71.045188,ChIJ30VlLIJw44kRk4x9eEYUtbg,"['accounting', 'finance', 'point_of_interest',...",525,558,575,644,755,0,0,3057,finance


## Prepare Train, Test, and Validation Data

In [45]:
from sklearn.preprocessing import MinMaxScaler
from keras.utils import to_categorical
from pandas import get_dummies

# Target: the weekly total per venue.
y = df['WeekTotals']

# Features: venue coordinates plus the mapped venue category, with the
# category expanded into one indicator column per value.
X = pd.get_dummies(
    df[['lat', 'long', 'mapped_venue_type']],
    columns=['mapped_venue_type'],
)

# Min-max scale the two coordinate columns; `Scaler` is left fitted so the
# same transform can be reused on other data.
# NOTE(review): the scaler is fitted on every row before the train/test split
# below, so test rows influence the scaling range — consider fitting on the
# training rows only.
Scaler = MinMaxScaler()
X[['lat', 'long']] = Scaler.fit_transform(X[['lat', 'long']])

X.head()


Unnamed: 0,lat,long,mapped_venue_type_automotive,mapped_venue_type_bar,mapped_venue_type_culture_entertainment,mapped_venue_type_education,mapped_venue_type_finance,mapped_venue_type_food,mapped_venue_type_government,mapped_venue_type_grocery,mapped_venue_type_health_wellness,mapped_venue_type_other,mapped_venue_type_professional_services,mapped_venue_type_religous,mapped_venue_type_retail,mapped_venue_type_transportation
0,0.496209,0.586301,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0.507842,0.576653,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0.509412,0.57599,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0.526952,0.56997,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0.519344,0.561148,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [46]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; rows are shuffled with a fixed seed
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [51]:
# Check the training matrix dimensions as (rows, feature columns); the column
# count is the input width the network below must accept.
X_train.shape

(1526, 16)

## Build MLP

In [88]:
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import cross_val_score

In [89]:
# define base model
def baseline_model(input_dim=16):
    """Build and compile the baseline one-hidden-layer regression network.

    Parameters
    ----------
    input_dim : int, optional
        Number of feature columns fed to the first layer. Defaults to 16,
        the previously hard-coded width, so existing zero-argument calls
        (e.g. via ``KerasRegressor(build_fn=baseline_model)``) behave as
        before. Pass ``X_train.shape[1]`` if the encoded feature set changes.

    Returns
    -------
    Sequential
        A compiled model: one hidden ``Dense`` layer of 16 ReLU units followed
        by a single-unit ``Dense`` output with no activation, trained with
        mean squared error loss and the Adam optimizer.
    """
    model = Sequential()
    # Hidden layer; only the input width is parameterised, the unit count is
    # kept at the original 16.
    model.add(Dense(16, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    # Single output unit with no activation: the regression prediction.
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [90]:
# Fix NumPy's global random seed for reproducibility.
# NOTE(review): this seeds NumPy only; the Keras backend may use its own
# random generator — confirm and seed it as well if exact repeatability matters.
seed = 7
np.random.seed(seed)

# Wrap the model builder as a scikit-learn style regressor so it can be used
# with cross_val_score. Inputs are min-max scaled / one-hot encoded earlier in
# the notebook (not standardized).
# `epochs` is the Keras 2 name of the fit argument. The old Keras 1 spelling
# `nb_epoch` is not forwarded to `fit` by the scikit-learn wrapper, so the
# requested 100 epochs were silently replaced by Keras' default epoch count.
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [94]:
# 10-fold cross-validation of the baseline network.
# `shuffle=True` is required for `random_state` to have any effect; recent
# scikit-learn versions raise a ValueError when a seed is given without it.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Cross-validate on the training split so the held-out test rows stay unseen
# until the final evaluation of the selected model.
# n_jobs=1: running Keras estimators in parallel worker processes is a known
# source of hangs (NOTE(review): likely the cause of the interrupted run
# recorded below — confirm before raising it again).
results = cross_val_score(estimator, X_train.values, y_train.values, cv=kfold, n_jobs=1)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

KeyboardInterrupt: 