##### Each panel should have a date and hour. Some records have a null population: impute those records.

In [1]:
import pandas as pd
import numpy  as np
from config import getClient
from datetime import datetime, timedelta
import time
from itertools import groupby
import math
import random

In [2]:
# Look up the "OutFront" client through the project-local config helper.
# NOTE(review): the result is concatenated with CSV file names further down,
# so it is presumably a directory/path prefix string -- confirm in config.
datalib = getClient("OutFront")

In [3]:
# Load the hourly records, then convert the "date" column to datetimes so
# that date arithmetic (timedelta offsets) works on it later.
hours_path = datalib + "2b_hours.csv"
df = pd.read_csv(hours_path)
df["date"] = pd.to_datetime(df["date"])

In [4]:
# Create list of panel/date with missing hours
def getMissing(grp):
    """Return the keys of the groups that contain a null population.

    grp is iterated as (key, frame) pairs (e.g. a pandas GroupBy); every
    frame needs "population" and "hour" columns. Keys come back in
    iteration order.
    """
    def has_null_hours(rows):
        # Hours of the rows whose population is null; non-empty means the
        # group still needs imputation.
        null_hours = rows.loc[rows["population"].isnull(), "hour"]
        return not null_hours.empty

    return [key for key, rows in grp if has_null_hours(rows)]

In [5]:
def copyWeek(df, panel, date, hour, numNaN):
    """Fill a run of missing populations with the values from a week away.

    The run covers hours ``hour`` .. ``hour + numNaN - 1`` of ``panel`` on
    ``date``. The source is the same panel seven days earlier; if that day
    has no row for ``hour``, seven days later is used instead. When neither
    exists a "not found" line is printed and nothing is changed.

    df must be indexed by (panel, date, hour) and have a "population"
    column. It is modified in place and also returned.

    Only the "population" column is written, and lookups are label based,
    so the result does not depend on row order or on the source week
    having every hour present (hours missing from the source are skipped).
    """
    source_date = None
    for offset_days in (-7, 7):
        dt = date + timedelta(days=offset_days)
        if (panel, dt, hour) in df.index:
            source_date = dt
            break

    if source_date is None:
        # dt is the following-week date here, i.e. the last one tried.
        print(panel, dt, hour, " not found")
        return df

    for hr in range(hour, hour + numNaN):
        source_key = (panel, source_date, hr)
        if source_key not in df.index:
            # The source week has no record for this hour; leave the gap.
            continue
        df.loc[(panel, date, hr), "population"] = df.loc[source_key, "population"]
    return df

In [6]:
def _fill_population(df, panel, date, hours, value):
    """Write value into the "population" column for each hour of panel/date."""
    for hr in hours:
        df.loc[(panel, date, hr), "population"] = value


def sameDay(df, panel, date, hour, numNaN):
    """Fill a short run of missing populations with the mean of its neighbours.

    The run covers hours ``hour`` .. ``hour + numNaN - 1`` of ``panel`` on
    ``date``. The fill value is the average of the population just before
    and just after the run:

    * run starts at hour 0: hour 23 of the previous day and the first hour
      after the run;
    * run reaches the end of the day: the hour before the run and the first
      non-null population among the next day's first 24 rows;
    * otherwise: the hours immediately before and after the run.

    Nothing is written unless the reference value is greater than 1 (the
    average in the first and third cases, the next-day population in the
    second). A "Not found" line is printed when the neighbouring day or a
    usable next-day value is unavailable.

    df must be indexed by (panel, date, hour) and have a "population"
    column. It is modified in place and also returned. Only the
    "population" column is read and written, so other columns are left
    untouched.
    """
    pops = df.loc[(panel, date), "population"]
    gap_hours = range(hour, hour + numNaN)

    if hour == 0:
        # The run opens the day: borrow the last hour of the previous day.
        dt = date - timedelta(days=1)
        try:
            avg = (df.loc[(panel, dt, 23), "population"] + pops[hour + numNaN]) / 2
        except (KeyError, IndexError):
            print("Not found: ", panel, dt)
            return df
        if avg > 1:
            _fill_population(df, panel, date, gap_hours, avg)
    elif (hour + numNaN) > 23:
        # The run closes the day: borrow the first known value of the next day.
        dt = date + timedelta(days=1)
        try:
            next_day = df.loc[(panel, dt), "population"]
        except KeyError:
            print("Not found: ", panel, dt)
            return df
        known = next_day.iloc[:24].dropna()
        nextDayPop = known.iloc[0] if not known.empty else np.nan
        if nextDayPop > 1:
            avg = (nextDayPop + pops[hour - 1]) / 2
            _fill_population(df, panel, date, range(hour, 24), avg)
        else:
            # Also reached when the next day holds no non-null value (NaN).
            print("Not found: ", panel, date, hour)
    else:
        # Sampled progress output: roughly one call in 200 prints its key.
        if random.randint(1, 200) == 51:
            print(panel, date)
        avg = (pops[hour - 1] + pops[hour + numNaN]) / 2
        if avg > 1:
            _fill_population(df, panel, date, gap_hours, avg)
    return df

In [7]:
def processClusters(df, panel, date, clusters):
    """Impute every run of missing hours for one panel/date.

    clusters holds, per group, the number of consecutive NaN hours. Runs
    of 1-3 hours go to sameDay, longer runs go to copyWeek, and groups
    with no NaNs are skipped. Returns the (updated) df.

    The first index label of clusters is 0 or 1. As built by the caller in
    this file, groups are keyed by the running count of non-null values,
    so label 0 only exists when the day starts with NaNs. Seeding the
    running total with that label makes ``position + skipped`` the hour at
    which each run starts in both cases.
    """
    skipped = clusters.index[0]   # 0 or 1, then grows by each run length

    for position, numNaN in enumerate(clusters):
        hour = position + skipped
        if numNaN > 3:
            df = copyWeek(df, panel, date, hour, numNaN)
        elif numNaN in (1, 2, 3):
            df = sameDay(df, panel, date, hour, numNaN)
        skipped += numNaN
    return df

In [8]:
# Group the rows per panel/date for the missing-value scan, and index the
# working frame by panel/date/hour for the label-based imputation.
grp = df.set_index(["panel", "date"]).groupby(level=[0, 1])
df = df.set_index(["panel", "date", "hour"])

# Panel/date combinations holding at least one null population
missingList = getMissing(grp)

before = df["population"].isnull().sum()
print("{:,.0f} null values in 'population'".format(before))
start = time.time()

# Impute each affected panel/date in turn
for panel, date in missingList:
    tmp = df.loc[panel, date]["population"]
    # Each non-null value opens a new group; the NaNs that follow it (or
    # that lead the day, for group 0) are counted as one cluster.
    null_flags = tmp.isnull().astype(int)
    group_ids = tmp.notnull().astype(int).cumsum()
    clusters = null_flags.groupby(group_ids).sum()
    df = processClusters(df, panel, date, clusters)

end = time.time()
diff = end - start
after = df["population"].isnull().sum()
updates = before - after
print("{:,.0f} updates ({:.0%}) after {:,.0f} seconds".\
      format(updates, updates/before, diff))
print("{:,.0f} remaining".format(after))

31,120 null values in 'population'
AT0001000 2017-06-05 00:00:00
AT0001000 2017-11-27 00:00:00
AT00B8646 2017-07-13 00:00:00
AT00B8646 2017-10-05 00:00:00
BK0001163 2018-03-07 00:00:00
DT0001110 2017-11-26 00:00:00
DT0001110 2018-03-20 00:00:00
FF0003944 2017-11-03 00:00:00
FF0003944 2017-11-27 00:00:00
FF0003944 2018-03-15 00:00:00
FF0003944 2018-05-23 00:00:00
FF0007038 2018-03-08 00:00:00
FF000N130 2017-07-08 00:00:00
FF0081618 2017-09-24 00:00:00
FF0082460 2017-11-29 00:00:00
GR0005927 2017-10-26 00:00:00
Not found:  GR0006072 2017-05-24 00:00:00
GR0006072 2017-08-17 00:00:00
GR0006072 2018-01-31 00:00:00
GR0006072 2018-05-22 00:00:00
GR0006146 2018-05-08 00:00:00
HB060580A 2018-02-16 00:00:00
HB060580A 2018-06-19 00:00:00
KC002243A 2018-06-21 00:00:00
LO0004312 2017-12-24 00:00:00
LO0005573 2017-11-16 00:00:00
LO0005573 2017-12-31 00:00:00
LV0001109 2017-09-05 00:00:00
MN0000222 2018-05-07 00:00:00
OT0001161 2017-10-06 00:00:00
OTGCD0001 2017-08-09 00:00:00
OTGCD0001 2017-08-21 00

In [22]:
# Re-scan after imputation: flatten the frame, rebuild the panel/date
# groups, then restore the panel/date/hour index.
df = df.reset_index()
grp = df.set_index(["panel", "date"]).groupby(level=[0, 1])
df = df.set_index(["panel", "date", "hour"])

# Any panel/date that still has a null population shows up here
missingList = getMissing(grp)
print(missingList)

[]


In [21]:
# Manual corrections for panel GR0006072 on 2017-05-27 (hours 0 and 4).
# The row key and the column are passed in a single .loc call: the chained
# form df.loc[...]["population"] = v may assign to a temporary copy and
# leave df unchanged.
df.loc[("GR0006072", pd.Timestamp("2017-05-27"), 0), "population"] = 15
df.loc[("GR0006072", pd.Timestamp("2017-05-27"), 4), "population"] = 41

In [23]:
# Order the rows by the index and write the result next to the input file.
df = df.sort_index()
updated_path = datalib + "2c_updatedHours.csv"
df.to_csv(updated_path)