In [None]:
import pandas as pd
import numpy  as np
import time
from config import getClient
from os import listdir
from os.path import isfile, join
import os

# Base data location for the "OutFront" client, as returned by the project's
# config.getClient helper. Other cells build paths by plain string
# concatenation (dataloc+"panels/", dataloc+"errors.csv"), so the value
# presumably ends with a path separator — TODO confirm against config.getClient.
dataloc = getClient("OutFront")

def getDuration(df):
    """Return the time span covered by ``df["date"]`` in average months.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"date"`` column of datetime values.

    Returns
    -------
    float
        ``(max date - min date)`` expressed in months of 30.436875 days
        (365.2425 / 12). NaN when the column holds no valid dates.
    """
    start = df["date"].min()
    end   = df["date"].max()
    # A calendar month is not a fixed-length duration, so dividing by
    # np.timedelta64(1, 'M') is ambiguous and is rejected by current pandas.
    # Spell out the average month length explicitly instead; this is the same
    # length older pandas releases used for the 'M' unit.
    averageMonth = pd.Timedelta(days=30.436875)
    return (end - start) / averageMonth

def getFillRate(df):
    """Return the fraction of expected hourly points present in the panel.

    The expected count is 24 points for every calendar day between the
    earliest and latest value of ``df["date"]``, both end days included.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"date"`` column of datetime values.

    Returns
    -------
    float
        ``number of rows / (days in span * 24)``; ``0.0`` for an empty frame.
    """
    dates = df["date"]
    numPoints = len(dates)
    if numPoints == 0:
        # Nothing recorded at all: report an empty panel rather than NaN.
        return 0.0
    # Both the first and the last day contribute points, so the span is
    # inclusive. Without the +1 a fully populated two-day panel would score
    # 2.0 and a single-day panel would divide by zero.
    spanDays = (dates.max() - dates.min()).days + 1
    expectedPoints = spanDays * 24
    return numPoints / expectedPoints

In [None]:
def processError(panel, msg):
    """Move a rejected panel file into the errors folder and log the reason.

    Relies on two module-level names: ``dataloc`` (base data path, used by
    plain concatenation) and ``errors`` (an open, writable text file that
    receives one ``panel,msg`` record per call).

    Parameters
    ----------
    panel : str
        File name of the panel inside ``dataloc + "panels/"``.
    msg : str
        Reason the panel was rejected.

    Raises
    ------
    OSError
        If the panel file cannot be moved.
    """
    import csv  # local import keeps this cell self-contained

    # Build both paths the same way the rest of the notebook does
    # (dataloc + "<folder>/"), without the stray leading slash.
    errorDir = dataloc + "errors/"
    # The rename fails if the destination folder is missing, so create it.
    os.makedirs(errorDir, exist_ok=True)
    os.rename(dataloc + "panels/" + panel, errorDir + panel)

    # csv.writer quotes fields that contain commas or quotes, so a comma in a
    # file name or message cannot break the record; plain values are written
    # exactly as "panel,msg\n".
    csv.writer(errors, lineterminator="\n").writerow([panel, msg])

In [None]:
def processPanel(df, panelName=None, minMonths=13, minFillRate=.90):
    """Validate one panel and route it to the error folder if it fails.

    The ``"date"`` column of ``df`` is converted in place from ``YYYYMMDD``
    values to datetimes. The panel is then rejected via ``processError`` when
    it does not cover more than ``minMonths`` months, or when it does but its
    fill rate is below ``minFillRate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel data with a ``"date"`` column in ``%Y%m%d`` format.
    panelName : str, optional
        File name of the panel. When omitted, the module-level ``panel``
        variable is used, which is how existing callers pass it.
    minMonths : float, optional
        Duration (in months) the panel must exceed. Defaults to 13.
    minFillRate : float, optional
        Minimum acceptable fill rate. Defaults to 0.90.
    """
    if panelName is None:
        # Backward compatibility: existing callers pass the name through the
        # module-level loop variable `panel`.
        panelName = panel

    # Convert their date format to a standard date
    df["date"] = pd.to_datetime(df["date"], format='%Y%m%d')

    months = getDuration(df)
    # Written as "not >" rather than "<=" so that a NaN duration (no valid
    # dates) is still reported as too short.
    if not months > minMonths:
        processError(panelName, "Duration too short")
        return

    if getFillRate(df) < minFillRate:
        processError(panelName, "Fill Rate too low")

In [1]:
# Collect the panel files (regular files only) waiting to be validated.
panelDir = dataloc+"panels/"
panels = [f for f in listdir(panelDir) if isfile(join(panelDir, f))]

print("{:,.0f} panels to process".format(len(panels)))

count = 0
start = time.time()

# The context manager closes errors.csv even if a panel raises part-way
# through the run. `errors` and the loop variable `panel` must keep these
# exact module-level names: processError and processPanel read them as globals.
with open(dataloc+"errors.csv", "w") as errors:
    for panel in panels:
        count += 1
        # Lightweight progress indicator every 100 panels.
        if count % 100 == 0:
            print(count)
        df = pd.read_csv(panelDir+panel,
                         dtype={"hour": 'int8',
                                "population": 'int32'})
        processPanel(df)

end = time.time()
print("Completed after {:.0f} minutes".format((end-start)/60))