##### Filter out low-fill rate panels and sort

In [1]:
import pandas as pd
import datetime
from config import getClient
import numpy as np

In [2]:
datalib = getClient("OutFront")

In [3]:
cols = ["panel", "date", "hour", "observed", "population"]
df = pd.read_csv(datalib+"0panels_original.csv", header=0,names=cols)

In [4]:
del df["observed"]  # we're using "populated"

In [5]:
start = df["date"].min()
end   = df["date"].max()
count = len(df["panel"].unique())
print("Full dataset runs from {} - {}".format(start, end))
print("Number of rows: {:,.0f}".format(df.shape[0]))
print("Number of unique panels: {}".format(count))

Full dataset runs from 20170525 - 20180710
Number of rows: 1,711,017
Number of unique panels: 200


In [6]:
# Convert their date format to a standard date
df["date"] = pd.to_datetime(df["date"], format='%Y%m%d')

In [7]:
# Analyze the fill rate and remove any panels with low rate
df = df.set_index("panel")
df = df.sort_values(["date", "hour"])

grp = df.groupby(level=0)
l = []
count = 0
before = df.shape[0]

for panel, val in grp:
    dates = val["date"]
    diff  = (dates.max() - dates.min()).days
    durations = diff*24
    numPoints = len(dates)
    fillRate = numPoints/durations
    if fillRate < .92:
        df = df.drop(panel)
        count += 1
    else:
        l.append(fillRate)

after = df.shape[0]
print("{} panels removed ({:,.0f} records) due to low fill rate".\
      format(count, (before-after)))
print("{}Remaining panels fill rate is {:.1%}".format("\n", sum(l)/len(l)))

48 panels removed (297,769 records) due to low fill rate

Remaining panels fill rate is 96.6%


In [8]:
df = df.reset_index()
df = df.set_index(["panel", "date", "hour"])
df = df.sort_index()
df.to_csv(datalib+"1fillRateFilter.csv", sep=",")