In [62]:
import pandas as pd
import numpy as np
import json
import math
import urllib.request
import dateutil.parser
import dateutil.rrule
import dateutil.tz
import datetime
import re
import gc
import time
import sqlite3

In [106]:
# Time zones used by the notebook. All timestamps below are tagged with tzUTC;
# tzLocal is defined here but not referenced in the cells that follow.
tzUTC = dateutil.tz.gettz('UTC')
tzLocal = dateutil.tz.gettz('Europe/London')

# Lower bound for the download window when no cached data exists.
earliestData = datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=tzUTC)

# Midnight at the start of today's calendar date (as reported by the local
# clock via date.today()), tagged as UTC.
dateToday = datetime.datetime.combine(
    datetime.date.today(),
    datetime.time.min,
    tzinfo=tzUTC
)

In [107]:
# Fetch any previous data
#
# Loads the cached camera metadata and one cached table per sampling interval,
# then works out where the download should resume:
#   previousDataEnd      -- midnight (UTC) of the last day present in the cache,
#                           or earliestData when nothing usable is cached
#   pointTsByIntervalOld -- cached tables, trimmed to rows before previousDataEnd
#                           so the (possibly partial) last day is fetched again
#   pdSourcesOld         -- cached metadata, or None when nothing was loaded
previousDataEnd = None
pointTsByIntervalOld = {}
pdSourcesOld = None

try:
    # NOTE(review): read_pickle can execute arbitrary code; only point this at
    # cache files written by this notebook.
    pdSources = pd.read_pickle('../cache/recent-feature-counts-point-metadata.pkl')

    # Cameras that have never produced data are stored with a NaN interval.
    # NaN cannot be formatted with %u, so it must be skipped here; otherwise
    # the whole cache would be rejected by the handler below.
    for interval in pdSources['interval'].dropna().unique():
        interval = int(interval)
        print('Loading data with %u second interval...' % interval)
        pointTsByIntervalOld[interval] = pd.read_pickle('../cache/recent-feature-counts-pd-%usec.pkl' % interval)

        # Index.max() yields NaT for an empty table instead of raising.
        intervalDataEnd = pointTsByIntervalOld[interval].index.max()
        if pd.isna(intervalDataEnd):
            continue

        if previousDataEnd is None or intervalDataEnd > previousDataEnd:
            previousDataEnd = intervalDataEnd

    print('Loaded previous data.')
    pdSourcesOld = pdSources

except Exception as loadError:
    # Discard anything that was partially loaded: keeping untrimmed tables here
    # would make the later merge step append rows that are downloaded again.
    pdSources = None
    pdSourcesOld = None
    pointTsByIntervalOld = {}
    previousDataEnd = None
    print('No existing data could be loaded.')
    print('  (%s: %s)' % (type(loadError).__name__, loadError))

if previousDataEnd is None:
    previousDataEnd = earliestData
else:
    # Resume from the start of the last cached day.
    previousDataEnd = datetime.datetime.combine(previousDataEnd.date(), datetime.datetime.min.time()).replace(tzinfo=tzUTC)

# Drop cached rows from the resume day onwards; they are fetched again.
for interval in list(pointTsByIntervalOld):
    cachedTable = pointTsByIntervalOld[interval]
    pointTsByIntervalOld[interval] = cachedTable[cachedTable.index < previousDataEnd]

print('  Start reading from %s' % previousDataEnd)

No existing data could be loaded.
  Start reading from 2020-01-01 00:00:00+00:00


In [108]:
# Download the camera dictionary. The cell that builds pdSources expects a
# mapping of camera id -> record with 'min_date' and 'max_date' strings.
visionApiBase = 'https://uo-vision.dev.urbanobservatory.ac.uk/stills/dict'

# The context manager closes the HTTP response once it has been read, and the
# timeout stops a stalled connection from hanging the notebook indefinitely.
with urllib.request.urlopen(visionApiBase, timeout=60) as visionHttpResponse:
    visionResponse = json.loads(visionHttpResponse.read().decode('utf-8'))

In [110]:
# Build the camera table from the API response; the rendered output shows one
# row per camera id with its first and last data dates.
pdSources = pd.DataFrame.from_records(visionResponse).transpose()

# Both date columns arrive as 'YYYY-MM-DD' strings; parse them as midnight UTC.
for dateColumn in ('min_date', 'max_date'):
    pdSources[dateColumn] = pdSources[dateColumn].apply(
        lambda text: datetime.datetime.strptime(text, '%Y-%m-%d').replace(tzinfo=tzUTC)
    )

# Carry over the sampling interval recorded for each camera by a previous run.
if pdSourcesOld is not None:
    pdSources = pdSources.join(pdSourcesOld['interval'])

# Accumulators filled by the download loop.
camerasByInterval, pointTsByInterval = {}, {}

pdSources

Unnamed: 0,min_date,max_date
CM_A69A1,2017-10-31 00:00:00+00:00,2020-04-29 00:00:00+00:00
GH_A1114A1,2017-10-31 00:00:00+00:00,2020-04-29 00:00:00+00:00
GH_A167F1,2017-10-31 00:00:00+00:00,2020-04-29 00:00:00+00:00
GH_A167G1,2017-10-31 00:00:00+00:00,2020-04-29 00:00:00+00:00
GH_A167H1,2017-10-31 00:00:00+00:00,2020-04-29 00:00:00+00:00
...,...,...
VAISALACCTV56,2020-01-01 00:00:00+00:00,2020-04-29 00:00:00+00:00
VAISALACCTV57,2020-01-01 00:00:00+00:00,2020-04-29 00:00:00+00:00
VAISALACCTV58,2020-01-01 00:00:00+00:00,2020-04-29 00:00:00+00:00
VAISALACCTV59,2020-01-01 00:00:00+00:00,2020-04-29 00:00:00+00:00


In [None]:
# Download the per-day counts for every camera, resample each camera onto a
# regular grid and collect the results into one wide table per sampling
# interval (pointTsByInterval). camerasByInterval records the interval chosen
# for each camera.
#
# Progress characters: '#' day loaded, '-' day without data, 'x' failed request.
for cameraId in pdSources.index:
    source = pdSources[pdSources.index == cameraId].to_dict(orient='records')[0]

    # After joining the cached metadata every record has an 'interval' key, but
    # it is NaN for cameras that had no interval stored. Only a real number is
    # treated as known; NaN falls through to the estimate further down.
    knownInterval = source.get('interval')
    if knownInterval is not None and pd.notna(knownInterval):
        knownInterval = int(knownInterval)
        # Record it now so the camera keeps its interval in the saved metadata
        # even when this run finds no new data for it.
        camerasByInterval[cameraId] = knownInterval
    else:
        knownInterval = None

    print(cameraId)
    print('  [', end='')

    # One frame per day, concatenated once after the loop. DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole table on every call.
    dayFrames = []

    for date in dateutil.rrule.rrule(
            dateutil.rrule.DAILY,
            interval=1,
            dtstart=max(source['min_date'], previousDataEnd),
            until=source['max_date']
        ):

        windowUrl = 'https://uo-vision.dev.urbanobservatory.ac.uk/stills/counts?location=%s&date=%s' % (
            cameraId, date.isoformat()[0:10]
        )

        # Retry until the request succeeds. Catching Exception (not a bare
        # except) keeps the loop interruptible with Ctrl-C / kernel interrupt.
        windowResponse = None
        windowAttempts = 0
        while True:
            windowAttempts = windowAttempts + 1
            try:
                with urllib.request.urlopen(windowUrl, timeout=60) as windowHttpResponse:
                    windowResponse = json.loads(windowHttpResponse.read().decode('utf-8'))
                break
            except Exception:
                print('x', end='')
                # Back off by 2 s per failed attempt, capped at 10 s.
                time.sleep(min(2 * windowAttempts, 10))

        # An empty (or JSON null) response means there is nothing for this day.
        if not windowResponse:
            print('-', end='')
            continue

        # Counts are either plain integers or objects carrying a 'count' field;
        # flatten the latter so every class maps to a number.
        for record in windowResponse:
            counts = record['counts']
            for label in counts:
                if not isinstance(counts[label], int):
                    counts[label] = counts[label]['count']

        sourceOnDay = pd.json_normalize(windowResponse).set_index('ts')
        if not sourceOnDay.empty:
            sourceOnDay.index = sourceOnDay.index.to_series().apply(lambda t: datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S').replace(tzinfo=tzUTC))

        if len(sourceOnDay.columns) == 0:
            print('-', end='')
            continue

        dayFrames.append(sourceOnDay)
        print('#', end='')

    if not dayFrames:
        print('] No new data')
        continue

    sourceTs = pd.concat(dayFrames)

    sourceTs = sourceTs.drop(columns=['camera'], errors='ignore').rename(errors='ignore', columns={
        'url': 'Source image',
        'counts.bus': 'Bus',
        'counts.car': 'Car',
        'counts.cyclist': 'Cyclist',
        'counts.motorcyclist': 'Motorcyclist',
        'counts.person': 'Person',
        'counts.truck': 'Truck',
        'counts.van': 'Van'
    })

    sourceTs = sourceTs.loc[~sourceTs.index.duplicated(keep='first')]

    if knownInterval is not None:
        sourceInterval = knownInterval
    else:
        # Estimate the interval from the median gap between samples. With a
        # single sample there is no gap and the median is NaT (seconds -> NaN).
        tsInterval = sourceTs.index.to_series().diff().median().seconds
        if np.isnan(tsInterval):
            print('] Interval unknown')
            continue

        # Round to whole minutes and cap at 10 minutes. The lower bound of one
        # minute avoids an invalid '0s' resample rule for sub-30-second gaps
        # (NOTE(review): 60 s is assumed to be the finest grid -- confirm).
        sourceInterval = max(60, min(600, round(tsInterval / 60) * 60))
        if sourceInterval == 360:
            sourceInterval = 300
    camerasByInterval[cameraId] = sourceInterval

    # Snap onto a regular grid. Numeric columns get missing values filled with
    # 0; the image URL column is resampled separately and joined back on.
    resampleRule = '%us' % sourceInterval
    sourceTsNumeric = sourceTs.resample(resampleRule).nearest().drop(columns=['Source image'], errors='ignore').fillna(0)
    sourceTs = sourceTsNumeric.join(sourceTs['Source image'].resample(resampleRule).nearest())

    sourceTs = sourceTs.add_prefix('%s: ' % cameraId)

    print('] Interval %u seconds' % sourceInterval)

    existingTs = pointTsByInterval.get(sourceInterval)

    if existingTs is None:
        pointTsByInterval[sourceInterval] = sourceTs
    elif ('%s: Source image' % cameraId) in existingTs.columns:
        # The camera is already in the table (e.g. this cell was run twice):
        # refresh its columns with the new values and keep everything else.
        # DataFrame has no .concat method, and stacking rows with pd.concat
        # would duplicate timestamps, so combine_first is used instead.
        pointTsByInterval[sourceInterval] = sourceTs.combine_first(existingTs)
    else:
        # Outer join keeps the timestamps of both sides. The default left join
        # silently dropped every row outside the first camera's index.
        pointTsByInterval[sourceInterval] = existingTs.join(sourceTs, how='outer')

CM_A69A1
  [..........] Interval 300 seconds
GH_A1114A1
  [......................................................................................................................] Interval 300 seconds
GH_A167F1
  [......................................................................................................................] Interval 300 seconds
GH_A167G1
  [......................................................................................................................] Interval 300 seconds
GH_A167H1
  [.....................................................................................................................] Interval 300 seconds
GH_A167I1
  [......................................................................................................................] Interval 300 seconds
GH_A167J1
  [......................................................................................................................] Interval 300 seconds
GH_A167K1
  [..................

In [104]:
# Prepend the cached rows to the freshly downloaded ones, per interval.
for interval in pointTsByInterval:
    if interval in pointTsByIntervalOld:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        mergedTs = pd.concat([pointTsByIntervalOld[interval], pointTsByInterval[interval]])

        # Dropping repeated timestamps makes the cell safe to run more than
        # once: a second run would otherwise stack the cached rows again.
        pointTsByInterval[interval] = mergedTs.loc[~mergedTs.index.duplicated(keep='last')]

In [105]:
# Preview the merged table for the cameras sampled every 300 seconds.
previewInterval = 300
pointTsByInterval[previewInterval]

Unnamed: 0_level_0,GH_A184C1: Van,GH_A184C1: Car,GH_A184C1: Truck,GH_A184C1: Person,GH_A184C1: Bus,GH_A184C1: Motorcyclist,GH_A184C1: Cyclist,GH_A184C1: Source image
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-30 00:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-03-30 00:05:00+00:00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-03-30 00:10:00+00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-03-30 00:15:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-03-30 00:20:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
...,...,...,...,...,...,...,...,...
2020-04-29 10:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-29 10:05:00+00:00,1.0,1.0,1.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-29 10:10:00+00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-29 10:15:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...


In [9]:
# Persist the camera metadata (with the sampling interval per camera) and the
# per-interval tables: pickles for the next run, CSV copies as output.

# Build the metadata without mutating pdSources, so earlier cells can be
# re-run and still see the intervals loaded from the cache.
pdSourcesWithInterval = pdSources.drop(columns=['interval'], errors='ignore').join(
    pd.DataFrame.from_dict(camerasByInterval, orient='index', columns=['interval'])
)

# camerasByInterval may only cover cameras processed in this run. Fall back to
# the interval already known for any camera it does not cover, so that value
# is not overwritten with NaN in the saved metadata.
intervalColumn = pd.to_numeric(pdSourcesWithInterval['interval'])
if 'interval' in pdSources.columns:
    intervalColumn = intervalColumn.fillna(pd.to_numeric(pdSources['interval']))
pdSourcesWithInterval['interval'] = intervalColumn

pdSourcesWithInterval.to_pickle('../cache/recent-feature-counts-point-metadata.pkl')
pdSourcesWithInterval.to_csv('../output/recent-feature-counts-point-metadata.csv')

for interval in pointTsByInterval:
    pointTsByInterval[interval].to_pickle('../cache/recent-feature-counts-pd-%usec.pkl' % interval)
    pointTsByInterval[interval].to_csv('../output/recent-feature-counts-pd-%usec.csv' % interval)

# Last expression of the cell, so the saved metadata is actually displayed.
pdSourcesWithInterval

In [10]:
# Use for testing only...

#pdSourcesAll = None

#for interval in [60, 120, 300]:
#    # Per hour...
#    pdResampled = (pointTsByInterval[interval].resample('1800s').sum() * (1800 / interval) / 30)
#    if pdSourcesAll is None:
#        pdSourcesAll = pdResampled
#    else:
#        pdSourcesAll = pdSourcesAll.join(pdResampled)

#pdSourcesAll.groupby(axis='columns', by=lambda x: x[x.find(':') + 1:]).sum().plot(figsize=(35, 15), stacked=False, legend=True)


#pdSourcesAll[list(filter(lambda cn: 'Car' in cn, pdSourcesAll.columns))] #.plot(figsize=(35, 6.5), stacked=True, legend=False)
        
#ax = (dfPointTs[list(filter(lambda cn: 'Person' in cn, dfPointTs.columns))].resample('900s').mean() / 15).sum(axis=1).plot(figsize=(35, 6.5))
#ax = (dfPointTs[list(filter(lambda cn: 'Person' in cn, dfPointTs.columns))].resample('900s').mean() / 15).plot(figsize=(35, 6.5), stacked=True, legend=False)
#ax.set_ylabel('Pedestrians per minute')
#ax = (dfPointTs[list(filter(lambda cn: 'Car' in cn, dfPointTs.columns))].resample('900s').mean() / 15).plot(figsize=(35, 6.5), stacked=True, legend=False)
#ax.set_ylabel('Cars per minute')
#ax = (dfPointTs[list(filter(lambda cn: 'Bus' in cn, dfPointTs.columns))].resample('3600s').mean()).plot(figsize=(35, 6.5), stacked=True, legend=False)
#ax.set_ylabel('Buses per hour')