In [1]:
import pandas as pd
import numpy  as np
from config import getClient
from dateutil import parser
from os import listdir
from os.path import isfile, join

In [2]:
# Base location of the "OutFront" client's data, as resolved by the project's
# config helper. Later cells concatenate "panels/" directly onto this value,
# so it is presumably a path string ending in a separator -- confirm in config.
dataloc = getClient("OutFront")

In [3]:
# Gather the names of every regular file sitting directly inside the client's
# panels folder (sub-directories are skipped), then report how many were found.
panelFolder = dataloc + "panels/"
panels = [name for name in listdir(panelFolder) if isfile(join(panelFolder, name))]

print("{:,.0f} panels to process".format(len(panels)))

699 panels to process


In [4]:
# Lookup set behind the "holiday" feature: each ISO date string below is
# parsed into a datetime and stored in fedHolidays for membership tests.
# NOTE(review): this list does not match the official 2017 federal holiday
# list exactly (e.g. 2017-02-20 and 2017-10-09 are absent, 2017-01-20 is
# present) -- confirm the selection is intentional.
dates = [
    "2017-01-01", "2017-01-16", "2017-01-20",
    "2017-05-29", "2017-07-04", "2017-09-04",
    "2017-11-10", "2017-11-23", "2017-12-25",
]
fedHolidays = {parser.parse(day) for day in dates}

In [5]:
def holiday(dt):
    """Return 1 if ``dt`` is a member of the ``fedHolidays`` set, otherwise 0.

    Membership is an exact match against the stored values, so ``dt`` must
    compare equal to one of the parsed holiday datetimes to be flagged.
    """
    return 1 if dt in fedHolidays else 0

In [6]:
# Bucket each record's hour of day into one of three named blocks.
def mapBlock(df):
    """Label every row of ``df`` with a time-of-day block based on ``df['hour']``.

    The rules, checked in this order, are:
      * hour > 19 or hour < 4   -> 'evening'
      * hour > 3 and hour < 12  -> 'morning'
      * anything else           -> 'afternoon'
    For integer hours 0-23 each block therefore covers eight hours.

    The result is written to a new ``'block'`` column on the frame that was
    passed in (the input is modified in place) and that same frame is returned.
    """
    hours = df['hour']
    isEvening = (hours > 19) | (hours < 4)
    isMorning = (hours > 3) & (hours < 12)

    # Evening is tested first, then morning; every remaining row is afternoon.
    daytimeLabel = np.where(isMorning, 'morning', 'afternoon')
    df['block'] = np.where(isEvening, 'evening', daytimeLabel)
    return df

In [7]:
def processPanel(df):
    """Add time-block and calendar features to one panel's records.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel records. Must contain a "population" column, a "date" column
        that ``pd.to_datetime`` can parse, and the "hour" column that
        ``mapBlock`` reads.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``mapBlock`` with "date" converted to datetime
        and the columns "block", "dow", "weekNum", "month" and "holiday"
        added. Columns are assigned on the frame that was passed in.

    Raises
    ------
    AssertionError
        If any of "block", "dow", "weekNum" or "month" contains a missing
        value (e.g. because a date could not be resolved).
    """
    # Null populations are only reported, not dropped or filled.
    nan = df["population"].isnull().sum()
    if nan > 0:
        print("{} records have Null population".format(nan))

    # Convert their date format to a standard date
    df["date"] = pd.to_datetime(df["date"])

    df = mapBlock(df)

    dateParts = df["date"].dt
    df["dow"] = dateParts.dayofweek
    # Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0.
    # isocalendar().week yields the same ISO week number; fall back to the
    # old accessor only on pandas versions that predate isocalendar().
    if hasattr(dateParts, "isocalendar"):
        df["weekNum"] = dateParts.isocalendar().week
    else:
        df["weekNum"] = dateParts.week
    df["month"] = dateParts.month

    # Every derived feature must be populated for every row.
    for col in ("block", "dow", "weekNum", "month"):
        assert df[col].notna().all(), "missing values in derived column '{}'".format(col)

    df["holiday"] = df["date"].apply(holiday)
    return df

In [8]:
# Enrich every panel file in place: read it, add the derived features via
# processPanel, and write the result back over the same CSV.
panelDir = dataloc + "panels/"
columnTypes = {"hour": 'int8', "population": 'int32'}

count = 0
for count, panel in enumerate(panels, start=1):
    # Lightweight progress indicator: echo the running total every 100 files.
    if count % 100 == 0:
        print(count)
    panelPath = panelDir + panel
    df = processPanel(pd.read_csv(panelPath, dtype=columnTypes))
    df.to_csv(panelPath, index=False)

100
200
300
400
500
600
