#### Compute the moving average over 2-hour periods. There will be many NaNs because imputed dates do not have a "population" value. Generate several "date"-derived features.

In [1]:
import pandas as pd
import numpy  as np
from config import getClient
from dateutil import parser

In [2]:
# Look up the data location for the "OutFront" client through the project's
# config helper.
# NOTE(review): later cells concatenate this value directly with file names,
# so it is presumably a directory path string ending in a separator -- confirm
# against config.getClient.
datalib = getClient("OutFront")

In [3]:
# Load the hourly records produced by the previous step, then replace the raw
# "date" strings with parsed pandas datetimes (needed for the .dt accessors
# used in later cells).
df = pd.read_csv(datalib + "2c_updatedHours.csv")
df = df.assign(date=pd.to_datetime(df["date"]))

In [4]:
# Boolean mask: True for every row whose "population" value is missing.
nan = df["population"].isnull()

# Report the number of missing rows together with their share of the data.
# The previous message printed the *fraction* of rows but labelled it as a
# count of records, which made the output misleading (e.g. "0.0 records").
# `nan.mean()` is the fraction of True values in the mask.
print("{} of {} records ({:.2%}) have Null population".format(
    nan.sum(), df.shape[0], nan.mean()))

0.0 records have Null population


In [5]:
# Map each hour of the day to one of three 8-hour blocks (assuming integer
# hours in 0-23):
#   evening   -> hour > 19 or hour < 4   (20-23 and 0-3)
#   morning   -> 3 < hour < 12           (4-11)
#   afternoon -> everything else, supplied by the np.select default (12-19)
conditions = [
    (df['hour'] > 19) | (df['hour'] < 4),
    (df['hour'] > 3) & (df['hour'] < 12)]

blocks = ['evening', 'morning']

df['block']   = np.select(conditions, blocks, default='afternoon')
df["dow"]     = df['date'].dt.dayofweek
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `dt.isocalendar().week` is its documented replacement (same ISO week number).
df["weekNum"] = df['date'].dt.isocalendar().week
df["month"]   = df["date"].dt.month

# Sanity check: every derived feature must be populated for every row.
# (Equivalent to the value_counts().sum() == number-of-rows check, since
# value_counts ignores missing values.)
for column in ("block", "dow", "weekNum", "month"):
    assert df[column].notna().all(), \
        "derived column '{}' contains missing values".format(column)

# isocalendar() returns a nullable UInt32 column; now that it is known to have
# no missing values, store it as plain int64 like the other date features.
df["weekNum"] = df["weekNum"].astype("int64")

In [6]:
# Dates (all in 2017) that the "holiday" feature treats as Fed holidays.
# NOTE(review): Washington's Birthday (2017-02-20) and Columbus Day
# (2017-10-09) are not in this list -- confirm whether that is intentional.
dates = [
    "2017-01-01", "2017-01-16", "2017-01-20",
    "2017-05-29", "2017-07-04", "2017-09-04",
    "2017-11-10", "2017-11-23", "2017-12-25",
]

# Parse every date string once and keep the results in a set so that the
# holiday lookup is a constant-time membership test.
fedHolidays = {parser.parse(day) for day in dates}

In [7]:
def holiday(dt):
    """Return 1 if ``dt`` is one of the entries in ``fedHolidays``, else 0.

    The lookup is a plain set-membership test, so ``dt`` must compare equal
    to (and hash like) a stored entry to count as a holiday.
    NOTE(review): if "date" values carry a time-of-day component, only values
    exactly equal to a stored entry will match -- confirm the column holds
    dates only.
    """
    return 1 if dt in fedHolidays else 0

In [8]:
# Add the 0/1 "holiday" indicator by evaluating holiday() on each "date" value.
df = df.assign(holiday=df["date"].apply(holiday))

In [9]:
# Persist the feature-enriched frame for the next step; the row index is not
# written to the file.
_final_csv_path = datalib + "3_final.csv"
df.to_csv(_final_csv_path, index=False)