[View in Colaboratory](https://colab.research.google.com/github/wichersq/alarm-predict-ML/blob/master/timePredML.ipynb)

The model is a 5-layer fully connected network that takes in the features defined in **list_samp_x** below. It outputs how long before the event the user should start getting ready. It is implemented in TensorFlow. Values in the data are sometimes NaN, so the preprocessing code sets these to -1 and adds a companion indicator column for each such feature that records whether the value was originally present (1) or NaN (0).


In [0]:
import tensorflow as tf
import pandas as pd
from google.colab import files

In [63]:
# Location of the dataset CSV, hosted in the project's GitHub repository.
FILE_NAME_WITH_TOTAL_DURATION = 'https://raw.githubusercontent.com/wichersq/alarm-predict-ML/master/Data/converted_data_with_totalDuration.csv'

# Load the whole CSV; every later cell derives its frames from data_df.
data_df = pd.read_csv(filepath_or_buffer=FILE_NAME_WITH_TOTAL_DURATION)

# Preview the first ten rows as plain text.
print(data_df.head(n=10))

                         Business Name  \
0                   Maison D'Alexandre   
1                     Johnson Law Firm   
2                                 DROM   
3  Scrub Pro Uniforms & Gallo Clothing   
4                           DC Trekker   
5                Kohl's Fredericksburg   
6                       Investors Bank   
7                       Barnes & Noble   
8       Market Cafe & Brick Oven Pizza   
9                         Jos. A. Bank   

                                 Destination Address  \
0                   33 Lewis St, Greenwich, CT 06830   
1            302 W Broad St, Elizabethtown, NC 28337   
2                    85 Avenue A, New York, NY 10009   
3              78 Mountain Rd, Glen Burnie, MD 21060   
4           1422 Harvard St NW, Washington, DC 20009   
5  1571 Carl D. Silver Parkway, Fredericksburg, V...   
6  675 Hempstead Turnpike, Franklin Square, NY 11010   
7                601 E Pratt St, Baltimore, MD 21202   
8              425 Lexington Ave,

In [0]:
def get_open_close_time(df_input_hour, day_in_week):
  """Pick each row's closing and opening time for that row's weekday.

  Values are read from ``df_input_hour`` purely by column position: weekday
  ``d`` uses position ``2*d - 2`` for the closing time and ``2*d - 1`` for
  the opening time. For ``d == 0`` these positions are -2 and -1, i.e. the
  last two columns, so the frame is expected to be ordered Day1..Day6
  followed by Day0, with each day's close column before its open column.

  Args:
    df_input_hour: DataFrame of per-day hour columns in the order described
      above, one row per record.
    day_in_week: Sequence or Series of weekday numbers, one per row of
      ``df_input_hour`` (matched by position, not by index label).

  Returns:
    A ``(close_col, open_col)`` tuple of lists, each with one entry per
    element of ``day_in_week``.

  Raises:
    ValueError: If a weekday value cannot be converted to an integer
      (for example NaN).
    IndexError: If a weekday maps to a column position that does not exist.
  """
  close_col = []
  open_col = []
  # Walk both inputs positionally so the result does not depend on the index
  # labels of either object.
  for n, day in enumerate(list(day_in_week)):
    row = df_input_hour.iloc[n]
    day = int(day)
    # Use .iloc explicitly: plain row[int] on a string-labelled Series relies
    # on pandas' deprecated integer-position fallback.
    close_col.append(row.iloc[day * 2 - 2])
    open_col.append(row.iloc[day * 2 - 1])
  return close_col, open_col

In [65]:
# Work on a copy so the raw frame loaded from the CSV stays untouched.
input_df = data_df.copy()

# Single text label per row: "<business name>, <address>".
input_df['Destination'] = data_df['Business Name'] + ', ' + data_df["Destination Address"]

# Per-day hour columns, ordered Day1..Day6 then Day0 with Close before Open;
# get_open_close_time reads this frame by column position.
store_hour_df = data_df[[
    'Day1_Close', 'Day1_Open',
    'Day2_Close', 'Day2_Open',
    'Day3_Close', 'Day3_Open',
    'Day4_Close', 'Day4_Open',
    'Day5_Close', 'Day5_Open',
    'Day6_Close', 'Day6_Open',
    'Day0_Close', 'Day0_Open',
]]

# Closing/opening time of each row's own weekday.
closing_times, opening_times = get_open_close_time(store_hour_df, data_df['Day of the Week'])
input_df['Close_Time'] = closing_times
input_df['Open_Time'] = opening_times

# Indicator columns: 1 where the source value is available, 0 where it is NaN.
# The close/open indicator is derived from Open_Time only.
for flag_name, source_name in [
    ('Does_Reviews_Exist?', 'Reviews'),
    ('Does_Rating_Exist?', 'Rating'),
    ('Does_Price_Lv_Exist?', 'Price Level'),
    ('Does_Walking_Exist?', 'Walking_Duration'),
    ('Does_Transit_Exist?', 'Transit_Duration'),
    ('Does_Close/Open_Time_Exist?', 'Open_Time'),
]:
  input_df[flag_name] = input_df[source_name].notnull().astype(int)

# Replace every remaining NaN in the frame with the sentinel -1.
input_df = input_df.fillna(-1)

def linear_scale(series):
  """Divide every value of ``series`` by the series' range (max - min).

  The minimum is not subtracted first, so the result is not confined to
  [0, 1]; only the spread between the smallest and largest value becomes 1.

  Args:
    series: Numeric pandas Series.

  Returns:
    A new Series with the same index holding ``value / (max - min)``. When
    the range is zero or NaN (constant, empty or all-NaN input) there is
    nothing to scale by, so the values are returned unscaled as floats
    rather than as inf/NaN.
  """
  scale = series.max() - series.min()
  # A zero range would turn the whole column into inf/NaN; leave the values
  # as they are in that case.
  if pd.isnull(scale) or scale == 0:
    return series.astype(float)
  # Vectorised division gives the same values as a per-element apply.
  return series / scale

# Rescale the review counts by their range before they are used as a feature.
input_df['Reviews'] = linear_scale(input_df['Reviews'])

print(input_df.head(n=10))

# Fixed-seed 80/20 split: sample the training rows, then keep every row whose
# index label was not sampled as the test set.
train_df = input_df.sample(frac=0.8, random_state=0)
test_df = input_df.drop(labels=train_df.index, axis=0)

                         Business Name  \
0                   Maison D'Alexandre   
1                     Johnson Law Firm   
2                                 DROM   
3  Scrub Pro Uniforms & Gallo Clothing   
4                           DC Trekker   
5                Kohl's Fredericksburg   
6                       Investors Bank   
7                       Barnes & Noble   
8       Market Cafe & Brick Oven Pizza   
9                         Jos. A. Bank   

                                 Destination Address  \
0                   33 Lewis St, Greenwich, CT 06830   
1            302 W Broad St, Elizabethtown, NC 28337   
2                    85 Avenue A, New York, NY 10009   
3              78 Mountain Rd, Glen Burnie, MD 21060   
4           1422 Harvard St NW, Washington, DC 20009   
5  1571 Carl D. Silver Parkway, Fredericksburg, V...   
6  675 Hempstead Turnpike, Franklin Square, NY 11010   
7                601 E Pratt St, Baltimore, MD 21202   
8              425 Lexington Ave,

In [0]:
def dumb_model(x):
  """Baseline: the first feature column plus a single trainable offset.

  The original author's note for this baseline: gets about 3500 error.

  Args:
    x: 2-D input whose column 0 is used as the prediction base.

  Returns:
    Column 0 of ``x`` with the offset variable (initialised to 0.0) added.
  """
  first_feature = x[:, 0]
  offset = tf.Variable(0.0)
  return first_feature + offset

In [0]:
def NN_model(x):
  """Fully connected regression network built from five dense layers.

  Four ReLU hidden layers of 100, 50, 10 and 5 units are followed by a
  single-unit output layer with no activation.

  Args:
    x: Input tensor of features.

  Returns:
    The output of the final one-unit dense layer.
  """
  hidden = x
  # Hidden stack, widest layer first.
  for width in (100, 50, 10, 5):
    hidden = tf.layers.dense(inputs=hidden, units=width, activation=tf.nn.relu)
  # Linear output unit: the regression prediction.
  return tf.layers.dense(inputs=hidden, units=1)

In [0]:
# Start from an empty default graph before building the model in this cell.
tf.reset_default_graph()

# Feature columns fed to the network, in input order.
list_samp_x = [
    'Driving_Duration',
    'Price Level', 'Does_Price_Lv_Exist?',
    'Rating', 'Does_Rating_Exist?',
    'Reviews', 'Does_Reviews_Exist?',
]

# One row per example: features in x, the single target value in y_.
x = tf.placeholder(dtype=tf.float32, shape=(None, len(list_samp_x)))
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 1))
pred = NN_model(x)

# Mean squared error is what the optimiser minimises; mean absolute error is
# only reported alongside it.
loss = tf.reduce_mean(tf.square(y_ - pred))
error = tf.reduce_mean(tf.abs(y_ - pred))

opt = tf.train.AdamOptimizer(learning_rate=.0003).minimize(loss)

# The session is kept open because the training cell below reuses it.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
loss_vals = []

In [72]:
# Train for 60000 steps on random mini-batches of 100 training rows, and
# report train/test metrics every 6000 steps.
for i in range(60000):
  batch = train_df.sample(100)
  # DataFrame.as_matrix was removed in pandas 1.0; selecting the columns and
  # taking .values yields the same 2-D arrays on old and new pandas alike.
  samp_x = batch[list_samp_x].values
  samp_y = batch[['Total_Driving']].values
  _, loss_val, error_val = sess.run([opt, loss, error], feed_dict={x: samp_x, y_: samp_y})
  if i % 6000 == 0:
    print("loss", loss_val, "\n error_sec", error_val)

    # Evaluate on the whole test set. `opt` is not run here, so this step
    # does not update the weights.
    test_x = test_df[list_samp_x].values
    test_y = test_df[['Total_Driving']].values
    test_loss_val, test_error_val = sess.run([loss, error], feed_dict={x: test_x, y_: test_y})
    print("testing_loss", test_loss_val, "\n testing_error_sec", test_error_val)

# Errors noted by the author for different feature sets
# (NOTE(review): presumably the final test error of separate runs; confirm).
# With all features
# error 171.95436

# without price level
# error 187.73192

# without review
# error 258.79532


loss 92511304.0 
 error_sec 8409.994
testing_loss 79524010.0 
 testing_error_sec 7691.254
loss 2287769.0 
 error_sec 1279.7557
testing_loss 2312570.8 
 testing_error_sec 1300.7021
loss 322820.38 
 error_sec 378.00296
testing_loss 357379.78 
 testing_error_sec 379.03427
loss 439774.3 
 error_sec 422.62723
testing_loss 314812.03 
 testing_error_sec 340.98026
loss 293997.25 
 error_sec 341.13922
testing_loss 244057.45 
 testing_error_sec 306.90994
loss 112035.66 
 error_sec 231.18459
testing_loss 150017.12 
 testing_error_sec 241.9988
loss 148432.39 
 error_sec 254.43849
testing_loss 129793.51 
 testing_error_sec 232.23335
loss 101936.37 
 error_sec 200.57372
testing_loss 115803.086 
 testing_error_sec 235.00113
loss 105177.9 
 error_sec 190.57889
testing_loss 92675.24 
 testing_error_sec 173.53812
loss 156196.78 
 error_sec 210.00873
testing_loss 84812.45 
 testing_error_sec 153.33678
