In [11]:
import pandas as pd
import numpy as np
import json
import math
import urllib.request
import dateutil.parser
import dateutil.rrule
import dateutil.tz
import datetime
import re
import gc
import time
import sqlite3

In [2]:
tzUTC = dateutil.tz.gettz('UTC')
tzLocal = dateutil.tz.gettz('Europe/London')

earliestData = datetime.datetime.strptime('2020-04-20T00:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=tzUTC)
dateToday = datetime.datetime.combine(datetime.date.today(), datetime.datetime.min.time()).replace(tzinfo=tzUTC)

In [18]:
# Fetch baseline from SQL
conn = sqlite3.connect('../cache/baseline-feature-counts.db')
conn.row_factory = sqlite3.Row
c = conn.cursor()

[dict(row) for row in c.execute('''SELECT * FROM stills_counts LIMIT 10;''').fetchall()]

[{'location': 'PS193',
  'url': 'https://file.newcastle.urbanobservatory.ac.uk/camera-feeds/PS193/20200102/000509.jpg',
  'datetime': '2020-01-02 00:05:09',
  'counts': '{"counts": {"truck": 1, "bus": 1}}'},
 {'location': 'PS193',
  'url': 'https://file.newcastle.urbanobservatory.ac.uk/camera-feeds/PS193/20200102/003108.jpg',
  'datetime': '2020-01-02 00:31:08',
  'counts': '{"counts": {"truck": 1, "bus": 1}}'},
 {'location': 'PS193',
  'url': 'https://file.newcastle.urbanobservatory.ac.uk/camera-feeds/PS193/20200102/003308.jpg',
  'datetime': '2020-01-02 00:33:08',
  'counts': '{"counts": {"truck": 1, "bus": 1}}'},
 {'location': 'PS193',
  'url': 'https://file.newcastle.urbanobservatory.ac.uk/camera-feeds/PS193/20200102/003908.jpg',
  'datetime': '2020-01-02 00:39:08',
  'counts': '{"counts": {"truck": 1, "bus": 1}}'},
 {'location': 'PS193',
  'url': 'https://file.newcastle.urbanobservatory.ac.uk/camera-feeds/PS193/20200102/004708.jpg',
  'datetime': '2020-01-02 00:47:08',
  'counts':

In [3]:
# Fetch any previous data
previousDataEnd = None
pointTsByIntervalOld = {}
pdSourcesOld = None

try:
    pdSources = pd.read_pickle('../cache/recent-feature-counts-point-metadata.pkl')

    for interval in pdSources['interval'].unique():
        print('Loading data with %u second interval...' % interval)
        pointTsByIntervalOld[interval] = pd.read_pickle('../cache/recent-feature-counts-pd-%usec.pkl' % interval)
        intervalDataEnd = np.max(pointTsByIntervalOld[interval].index)

        if previousDataEnd is None or intervalDataEnd > previousDataEnd:
            previousDataEnd = intervalDataEnd
            
    print('Loaded previous data.')
    pdSourcesOld = pdSources

except:
    pdSources = None
    print('No existing data could be loaded.')
    
if previousDataEnd is None:
    previousDataEnd = earliestData
else:
    previousDataEnd = datetime.datetime.combine(previousDataEnd.date(), datetime.datetime.min.time()).replace(tzinfo=tzUTC)

if pdSources is not None:
    for interval in pdSources['interval'].unique():
        pointTsByIntervalOld[interval] = pointTsByIntervalOld[interval][pointTsByIntervalOld[interval].index < previousDataEnd]
    
print('  Start reading from %s' % previousDataEnd)

Loading data with 300 second interval...
Loading data with 60 second interval...
Loading data with 120 second interval...
Loading data with 600 second interval...
Loading data with 660 second interval...
No existing data could be loaded.
  Start reading from 2020-04-27 00:00:00+00:00


In [4]:
visionApiBase = 'https://uo-vision.dev.urbanobservatory.ac.uk/stills/dict'
visionResponse = json.loads(
    urllib.request.urlopen(visionApiBase).read().decode('utf-8')
)

In [5]:
pdSources = pd.DataFrame.from_records(visionResponse).transpose()
pdSources['min_date'] = pdSources['min_date'].apply(lambda d: datetime.datetime.strptime(d, '%Y-%m-%d').replace(tzinfo=tzUTC))
pdSources['max_date'] = pdSources['max_date'].apply(lambda d: datetime.datetime.strptime(d, '%Y-%m-%d').replace(tzinfo=tzUTC))
if pdSourcesOld is not None:
    pdSources = pdSources.join(pdSourcesOld['interval'])

camerasByInterval = {}
pointTsByInterval = {}

pdSources

Unnamed: 0,min_date,max_date
CM_A69A1,2017-10-31 00:00:00+00:00,2020-04-27 00:00:00+00:00
GH_A1114A1,2017-10-31 00:00:00+00:00,2020-04-27 00:00:00+00:00
GH_A167F1,2017-10-31 00:00:00+00:00,2020-04-27 00:00:00+00:00
GH_A167G1,2017-10-31 00:00:00+00:00,2020-04-27 00:00:00+00:00
GH_A167H1,2017-10-31 00:00:00+00:00,2020-04-27 00:00:00+00:00
...,...,...
VAISALACCTV54,2020-04-20 00:00:00+00:00,2020-04-27 00:00:00+00:00
VAISALACCTV56,2020-04-20 00:00:00+00:00,2020-04-27 00:00:00+00:00
VAISALACCTV57,2020-04-20 00:00:00+00:00,2020-04-27 00:00:00+00:00
VAISALACCTV58,2020-04-20 00:00:00+00:00,2020-04-27 00:00:00+00:00


In [6]:
for cameraId in pdSources.index:
    #alreadyExists = False
    #for testInterval in pointTsByInterval:
    #    if pointTsByInterval[testInterval] is not None and (('%s: Source image' % cameraId) in pointTsByInterval[testInterval].columns):
    #        alreadyExists = True
            
    #if alreadyExists:
    #    continue
    
    source = pdSources[pdSources.index == cameraId].to_dict(orient='records')[0]
    
    print(cameraId)
    print('  [', end='')
    
    sourceTs = None
    
    for date in dateutil.rrule.rrule(
            dateutil.rrule.DAILY,
            interval=1,
            dtstart=source['min_date'] if source['min_date'] > previousDataEnd else previousDataEnd,
            until=source['max_date']
        ):
        
        windowResponse = None
        windowAttempts = 0
        while windowResponse is None:
            try:
                windowAttempts = windowAttempts + 1
                windowResponse = json.loads(
                    urllib.request.urlopen(
                      'https://uo-vision.dev.urbanobservatory.ac.uk/stills/counts?location=%s&date=%s' % (cameraId, date.isoformat()[0:10])
                    ).read().decode('utf-8')
                )
            except:
                windowAttempts = windowAttempts + 1
                print('x', end='')
                time.sleep(min(windowAttempts, 10))

        sourceOnDay = pd.DataFrame.from_records(pd.json_normalize(windowResponse), index=['ts'])
        if not sourceOnDay.empty:
            sourceOnDay.index = sourceOnDay.index.to_series().apply(lambda t: datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S').replace(tzinfo=tzUTC))
        
        if len(sourceOnDay.columns) == 0:
            continue
        
        if sourceTs is None:
            sourceTs = sourceOnDay
        else:
            sourceTs = sourceTs.append(sourceOnDay)
        
        print('.', end='')
    
    if sourceTs is None:
        continue
    
    sourceTs.drop(columns=['camera'], inplace=True)
    sourceTs.rename(inplace=True, errors='ignore', columns={
        'url': 'Source image',
        'counts.bus': 'Bus',
        'counts.car': 'Car',
        'counts.cyclist': 'Cyclist',
        'counts.motorcyclist': 'Motorcyclist',
        'counts.person': 'Person',
        'counts.truck': 'Truck',
        'counts.van': 'Van'
    })
    
    sourceTs = sourceTs.loc[~sourceTs.index.duplicated(keep='first')]
    tsInterval = sourceTs.index.to_series().diff().median().seconds
    
    if 'interval' in source:
        sourceInterval = source['interval']
    elif np.isnan(tsInterval) == True:
        continue
    else:
        sourceInterval = min(600, round(tsInterval / 60) * 60)
    camerasByInterval[cameraId] = sourceInterval

    sourceTsNumeric = sourceTs.resample('%us' % sourceInterval).nearest().drop(columns=['Source image'], errors='ignore').fillna(0)
    sourceTs = sourceTsNumeric.join(sourceTs['Source image'].resample('%us' % sourceInterval).nearest())
    
    sourceTs = sourceTs.add_prefix('%s: ' % cameraId)
    
    print('] Interval %u seconds' % sourceInterval)
    
    if sourceInterval not in pointTsByInterval:
        pointTsByInterval[sourceInterval] = None
    
    if pointTsByInterval[sourceInterval] is None:
        pointTsByInterval[sourceInterval] = sourceTs
    else:
        if ('%s: Source image' % cameraId) in pointTsByInterval[sourceInterval].columns:
            pointTsByInterval[sourceInterval] = pointTsByInterval[sourceInterval].concat(sourceTs, sort=True, axis='index')
        else:        
            pointTsByInterval[sourceInterval] = pointTsByInterval[sourceInterval].join(sourceTs)

CM_A69A1
  [.] Interval 300 seconds
GH_A1114A1
  [.] Interval 300 seconds
GH_A167F1
  [.] Interval 60 seconds
GH_A167G1
  [.] Interval 120 seconds
GH_A167H1
  [.] Interval 300 seconds
GH_A167I1
  [.] Interval 120 seconds
GH_A167J1
  [.] Interval 300 seconds
GH_A167K1
  [.] Interval 300 seconds
GH_A167L1
  [.] Interval 300 seconds
GH_A167M1
  [.] Interval 300 seconds
GH_A167O1
  [.] Interval 300 seconds
GH_A167P1
  [.] Interval 300 seconds
GH_A184A1
  [.] Interval 120 seconds
GH_A184A2
  [.] Interval 300 seconds
GH_A184B1
  [.] Interval 120 seconds
GH_A184C1
  [.] Interval 300 seconds
GH_A184E1
  [.] Interval 120 seconds
GH_A184F1
  [.] Interval 300 seconds
GH_A692A1
  [.] Interval 300 seconds
GH_A692B1
  [.] Interval 300 seconds
GH_A694A1
  [.] Interval 300 seconds
GH_A695A1
  [.] Interval 300 seconds
GH_A695C1
  [.] Interval 300 seconds
GH_A695D1
  [.] Interval 300 seconds
GH_B1288A1
  [.] Interval 300 seconds
GH_B1296A1
  [.] Interval 300 seconds
GH_B1296B1
  [.] Interval 300 seconds

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


.] Interval 120 seconds
NT_A191D1
  [.] Interval 300 seconds
NT_A191E1
  [.] Interval 300 seconds
NT_A191E2
  [.] Interval 300 seconds
NT_A191G1
  [.] Interval 300 seconds
NT_A193C1
  [.] Interval 300 seconds
NT_A193D1
  [.] Interval 300 seconds
NT_A193E1
  [.] Interval 300 seconds
NT_A193F1
  [.] Interval 300 seconds
NT_A193G1
  [.] Interval 300 seconds
NT_A193H1
  [.] Interval 120 seconds
NT_A193I1
  [.] Interval 120 seconds
NT_SVLA1
  [.] Interval 120 seconds
PS190
  [.] Interval 120 seconds
PS191
  [.] Interval 120 seconds
PS192
  [.] Interval 120 seconds
PS196
  [.] Interval 120 seconds
PS303
  [.] Interval 120 seconds
SL_A1018C1
  [.SL_A1018D1
  [.] Interval 120 seconds
SL_A1018E1
  [.] Interval 60 seconds
SL_A1018F1
  [.] Interval 300 seconds
SL_A1231C1
  [.] Interval 300 seconds
SL_A1231D1
  [.] Interval 300 seconds
SL_A1231D2
  [.] Interval 120 seconds
SL_A1231D3
  [.] Interval 300 seconds
SL_A1231G1
  [.] Interval 300 seconds
SL_A1290A1
  [.] Interval 300 seconds
SL_A1290B1
 

In [7]:
for interval in pointTsByInterval:
    if interval in pointTsByIntervalOld:
        pointTsByInterval[interval] = pointTsByIntervalOld[interval].append(pointTsByInterval[interval])

In [8]:
pointTsByInterval[60]

Unnamed: 0_level_0,GH_A167F1: Truck,GH_A167F1: Car,GH_A167F1: Bus,GH_A167F1: Van,GH_A167F1: Person,GH_A167F1: Cyclist,GH_A167F1: Motorcyclist,GH_A167F1: Source image,MCA1Onslip: Car,MCA1Onslip: Truck,...,TT2South4: Bus,TT2South4: Source image,UO_NGATE02: Bus,UO_NGATE02: Car,UO_NGATE02: Person,UO_NGATE02: Van,UO_NGATE02: Motorcyclist,UO_NGATE02: Truck,UO_NGATE02: Cyclist,UO_NGATE02: Source image
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-20 21:16:00+00:00,1.0,2.0,1.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,1.0,0.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-20 21:17:00+00:00,1.0,2.0,1.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-20 21:18:00+00:00,0.0,3.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,1.0,0.0,...,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-20 21:19:00+00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,...,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-20 21:20:00+00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,...,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-27 15:27:00+00:00,0.0,4.0,0.0,1.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,...,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-27 15:28:00+00:00,0.0,7.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,1.0,...,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-27 15:29:00+00:00,0.0,2.0,0.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,...,1.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...
2020-04-27 15:30:00+00:00,0.0,2.0,0.0,1.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,0.0,...,0.0,https://file.newcastle.urbanobservatory.ac.uk/...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,https://file.newcastle.urbanobservatory.ac.uk/...


In [9]:
pdSources.drop(columns=['interval'], inplace=True, errors='ignore')
pdSourcesWithInterval = pdSources.join(pd.DataFrame.from_dict(camerasByInterval, orient='index', columns=['interval']))
pdSourcesWithInterval.to_pickle('../cache/recent-feature-counts-point-metadata.pkl')
pdSourcesWithInterval.to_csv('../output/recent-feature-counts-point-metadata.csv')
pdSourcesWithInterval

for interval in pointTsByInterval:
    pointTsByInterval[interval].to_pickle('../cache/recent-feature-counts-pd-%usec.pkl' % interval)
    pointTsByInterval[interval].to_csv('../output/recent-feature-counts-pd-%usec.csv' % interval)

In [10]:
# Use for testing only...

#pdSourcesAll = None

#for interval in [60, 120, 300]:
#    # Per hour...
#    pdResampled = (pointTsByInterval[interval].resample('1800s').sum() * (1800 / interval) / 30)
#    if pdSourcesAll is None:
#        pdSourcesAll = pdResampled
#    else:
#        pdSourcesAll = pdSourcesAll.join(pdResampled)

#pdSourcesAll.groupby(axis='columns', by=lambda x: x[x.find(':') + 1:]).sum().plot(figsize=(35, 15), stacked=False, legend=True)


#pdSourcesAll[list(filter(lambda cn: 'Car' in cn, pdSourcesAll.columns))] #.plot(figsize=(35, 6.5), stacked=True, legend=False)
        
#ax = (dfPointTs[list(filter(lambda cn: 'Person' in cn, dfPointTs.columns))].resample('900s').mean() / 15).sum(axis=1).plot(figsize=(35, 6.5))
#ax = (dfPointTs[list(filter(lambda cn: 'Person' in cn, dfPointTs.columns))].resample('900s').mean() / 15).plot(figsize=(35, 6.5), stacked=True, legend=False)
#ax.set_ylabel('Pedestrians per minute')
#ax = (dfPointTs[list(filter(lambda cn: 'Car' in cn, dfPointTs.columns))].resample('900s').mean() / 15).plot(figsize=(35, 6.5), stacked=True, legend=False)
#ax.set_ylabel('Cars per minute')
#ax = (dfPointTs[list(filter(lambda cn: 'Bus' in cn, dfPointTs.columns))].resample('3600s').mean()).plot(figsize=(35, 6.5), stacked=True, legend=False)
#ax.set_ylabel('Buses per hour')