##### Each panel record should have a date and an hour. Some records have a null `population`: impute those values.

In [1]:
import pandas as pd
import numpy  as np
from config import getClient
from datetime import datetime, timedelta
import time
from itertools import groupby
import math

In [5]:
# Location for the "OutFront" client's data, as returned by the project's
# config.getClient. It is used below as a string prefix for file names
# (e.g. dataloc + "notFound.csv").
# NOTE(review): presumably a directory path ending in a separator - confirm.
dataloc = getClient("OutFront")

In [6]:
# Load the hourly panel records. The original read from `datalib`, a name that
# is never defined in this notebook; every other path here is built from
# `dataloc`, so the input is read from the same location.
df = pd.read_csv(dataloc + "2b_hours.csv")

# Parse "date" so that timedelta arithmetic on it works in the imputation helpers.
df["date"] = pd.to_datetime(df["date"])

In [7]:
# Log file that writeNF() appends "panel,date" lines to. It stays open across
# cells and is closed once the imputation loop has finished.
notFound = open(dataloc+"notFound.csv", "w")

In [8]:
def getMissing(grp):
    """Return the keys of all groups that contain a null population.

    `grp` is iterated as (key, frame) pairs, e.g. a pandas groupby over
    panel/date. Each frame must provide "population" and "hour" columns.
    A key is reported once if at least one of its rows has a null
    population.
    """
    return [
        key
        for key, frame in grp
        if not frame.loc[frame["population"].isnull()]["hour"].empty
    ]

In [9]:
def writeNF(panel, dt):
    """Append one "panel,dt" line to the module-level notFound log file."""
    line = ",".join((panel, str(dt))) + "\n"
    notFound.write(line)

In [10]:
def copyWeek(df, panel, date, hour, numNaN):
    """Fill a run of missing population values from the same hours a week away.

    The week before `date` is tried first and the week after second. A week
    qualifies as donor when df has a row for (panel, donor date, hour). If
    neither week qualifies, the panel and the later date are logged through
    writeNF and df is left untouched.

    Args:
        df: frame indexed by (panel, date, hour) with a "population" column.
        panel: panel identifier (first index level).
        date: date of the gap (second index level).
        hour: first hour of the gap.
        numNaN: number of consecutive missing hours.

    Returns:
        df, which is modified in place.
    """
    for dt in (date - timedelta(days=7), date + timedelta(days=7)):
        if (panel, dt, hour) in df.index:
            break
    else:
        # Neither week has the starting hour; dt is the later candidate here.
        writeNF(panel, dt)
        return df

    incomplete = False
    for hr in range(hour, hour + numNaN):
        try:
            value = df.loc[(panel, dt, hr), "population"]
        except KeyError:
            # The donor day lacks this hour: leave the gap instead of aborting.
            incomplete = True
            continue
        # Write the population cell only, so other columns keep their values.
        df.loc[(panel, date, hr), "population"] = value
    if incomplete:
        writeNF(panel, dt)
    return df

In [11]:
def _fillPopulation(df, panel, date, hours, value):
    # Write `value` into the population cell of each listed hour.
    for hr in hours:
        df.loc[(panel, date, hr), "population"] = value


def sameDay(df, panel, date, hour, numNaN):
    """Fill a short gap with the average of the values on either side of it.

    Three cases, chosen by where the gap sits in the day:
      * gap starts at hour 0: the left neighbour is the 24th row of the
        previous day; the fill is applied only when the average exceeds 1.
      * gap runs to the end of the day: the right neighbour is the first
        non-null value among the first 24 rows of the next day; it must
        exceed 1, otherwise the panel/date is logged through writeNF.
      * otherwise: both neighbours come from the same day; the fill is
        applied only when the average exceeds 1.
    A missing neighbouring day is logged through writeNF and nothing is
    filled.

    Args:
        df: frame indexed by (panel, date, hour) with a "population" column.
        panel: panel identifier (first index level).
        date: date of the gap (second index level).
        hour: first hour of the gap.
        numNaN: number of consecutive missing hours.

    Returns:
        df, which is modified in place.
    """
    pops = df.loc[(panel, date), "population"]
    gapEnd = hour + numNaN  # first hour after the gap

    if hour == 0:
        dt = date - timedelta(days=1)
        try:
            # Positional on purpose: row 24 of the previous day.
            before = df.loc[(panel, dt), "population"].iloc[23]
            after = pops.loc[gapEnd]
        except (KeyError, IndexError):
            writeNF(panel, dt)
            return df
        avg = (before + after) / 2
        if avg > 1:  # False for NaN, so an unknown neighbour fills nothing
            _fillPopulation(df, panel, date, range(hour, gapEnd), avg)
    elif gapEnd > 23:
        dt = date + timedelta(days=1)
        try:
            nextDay = df.loc[(panel, dt), "population"]
        except KeyError:
            writeNF(panel, dt)
            return df
        known = nextDay.iloc[:24].dropna()
        if not known.empty and known.iloc[0] > 1:
            avg = (known.iloc[0] + pops.loc[hour - 1]) / 2
            _fillPopulation(df, panel, date, range(hour, 24), avg)
        else:
            writeNF(panel, dt)
    else:
        avg = (pops.loc[hour - 1] + pops.loc[gapEnd]) / 2
        if avg > 1:
            _fillPopulation(df, panel, date, range(hour, gapEnd), avg)
    return df

In [12]:
def processClusters(df, panel, date, clusters):
    """Impute every gap of one panel/date, choosing the method by gap length.

    `clusters` holds one gap length per entry. As the caller builds it (null
    counts grouped by the running count of non-null values), its first label
    is 0 when the day opens with a gap and 1 otherwise, so that label is the
    starting point for locating gaps. The start hour of each gap is its
    position in `clusters` plus that first label plus all gap lengths seen
    before it.

    Gaps of 1-3 hours go to sameDay, longer gaps to copyWeek, and entries of
    length 0 are skipped.
    """
    consumed = clusters.index[0]  # first label (0 or 1) + gap hours passed so far

    for position, gap in enumerate(clusters):
        hour = position + consumed
        if gap > 3:
            df = copyWeek(df, panel, date, hour, gap)
        elif gap in (1, 2, 3):
            df = sameDay(df, panel, date, hour, gap)
        consumed += gap
    return df

In [13]:
# Index by panel/date/hour and sort: the helpers address rows by that key and
# processClusters derives gap start hours from the row order within a day.
df  = df.set_index(["panel", "date", "hour"]).sort_index()

# Per panel/date groups with "hour" kept as a column, as getMissing expects.
grp = df.reset_index("hour").groupby(level=[0, 1])

# Get a list of missing values for each panel/date
missingList = getMissing(grp)

before = df["population"].isnull().sum()
print("{:,.0f} null values in 'population'".format(before))
start = time.time()

# For each panel/date combination, process the NaN values
try:
    for panel, date in missingList:
        tmp = df.loc[panel, date]["population"]
        # One entry per non-null value (plus one for a leading gap) holding the
        # number of nulls that follow it.
        clusters = tmp.isnull().astype(int).groupby(tmp.notnull().astype(int).cumsum()).sum()
        df = processClusters(df, panel, date, clusters)
finally:
    # Close the log even if imputation fails part-way through.
    notFound.close()

end = time.time()
diff = end-start
after = df["population"].isnull().sum()
updates = before-after
# Avoid dividing by zero when there was nothing to impute.
share = updates/before if before else 0
print("{:,.0f} updates ({:.0%}) after {:,.0f} seconds".\
      format(updates, share, diff))
print("{:,.0f} remaining".format(after))

72,810 null values in 'population'
72,803 updates (100%) after 80 seconds
7 remaining


In [14]:
# Rebuild the panel/date grouping from the imputed data, then restore the
# panel/date/hour index, sort it and write the result next to the input.
grp = df.reset_index().set_index(["panel", "date"]).groupby(level=[0, 1])
df = df.reset_index().set_index(["panel", "date", "hour"]).sort_index()

df.to_csv(dataloc + "2c_updatedHours.csv")