In [1]:
import json
import requests
import statistics
import numpy as np
import pandas as pd
import os
import warnings
from pathlib import Path
from IPython.display import Image, display

from cogo import plotting


warnings.filterwarnings('ignore')

In [2]:
# Project root: the parent of the directory the kernel was started in.
APP_ROOT = Path(os.path.realpath(os.path.expanduser(os.getcwd()))).parents[0]

# Create the ./data and ./output directories if they don't exist and
# download the COGO trip dataset
DATA_DIR = APP_ROOT / 'data'
OUTPUT_DIR = APP_ROOT / 'output'
DATA_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

TRIP_DATA_URL = (
    'https://data.smartcolumbusos.com/api/v1/dataset/' +
    '4053a9a2-fc8f-437c-af56-d104d4b5d63c/download?_format=csv')
TRIP_DATA_PATH = DATA_DIR / 'cogo_trip_data.csv'

# The download is cached on disk: it only happens when the file is absent.
if not TRIP_DATA_PATH.is_file():
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(TRIP_DATA_URL, timeout=60)
    # Fail loudly on 4xx/5xx instead of caching an error page as the "CSV".
    response.raise_for_status()
    # Write the raw bytes (no charset guessing, no newline translation) to a
    # temporary file first, then move it into place, so an interrupted write
    # never leaves a truncated file that the is_file() guard would accept.
    partial_path = TRIP_DATA_PATH.with_name(TRIP_DATA_PATH.name + '.part')
    partial_path.write_bytes(response.content)
    partial_path.replace(TRIP_DATA_PATH)

# The header row in the downloaded file comes through mangled. That has been
# reproducible across a couple of separate downloads, so it is treated as
# deterministic: header=0 consumes the file's own header row and the names
# below replace it. If the upstream column order ever changes, these labels
# will silently be applied to the wrong columns.
cogo_columns = [
    'bike_id',
    'user_gender',
    'start_station_id',
    'start_station_lat',
    'start_station_long',
    'start_station_name',
    'start_timestamp',
    'stop_station_id',
    'stop_station_lat',
    'stop_station_long',
    'stop_station_name',
    'stop_timestamp',
    'user_type',
    'user_birth_year',
]
cogo_data = pd.read_csv(
    APP_ROOT / 'data' / 'cogo_trip_data.csv',
    names=cogo_columns,
    header=0,
)
# It turns out 248 records have 'bad' start timestamps. At a 0.11% error
# rate they are probably not worth the effort of correcting, so
# errors='coerce' nulls them out here rather than raising.
start_times = pd.to_datetime(cogo_data['start_timestamp'], errors='coerce')
# Keep only the calendar day, stored as a 'YYYY-MM-DD' string.
cogo_data['date'] = start_times.dt.strftime('%Y-%m-%d')


In [3]:
# Preview the first rows to sanity-check the renamed columns and the
# derived 'date' column.
cogo_data.head()

Unnamed: 0,bike_id,user_gender,start_station_id,start_station_lat,start_station_long,start_station_name,start_timestamp,stop_station_id,stop_station_lat,stop_station_long,stop_station_name,stop_timestamp,user_type,user_birth_year,date
0,110,0,1,39.955864,-83.003106,Bicentennial Park,3/14/2015 17:07:39,19,39.961234,-83.006558,COSI,3/14/2015 17:15:13,Customer,,2015-03-14
1,277,0,18,39.963964,-82.998665,3rd St & Gay St,3/15/2015 00:09:05,10,39.949348,-82.995285,3rd St & Sycamore St,3/15/2015 00:15:23,Customer,,2015-03-15
2,174,0,6,39.942026,-82.995019,Schiller Park - Stewart Ave,3/15/2015 14:16:55,9,39.957591,-82.998002,Columbus Commons - Rich St,3/15/2015 14:30:12,Customer,,2015-03-15
3,204,1,54,39.990309,-83.013626,Neil Ave & King Ave,8/6/2016 15:22:07,12,39.970512,-83.002354,Convention Center,8/6/2016 15:38:01,Subscriber,1974.0,2016-08-06
4,270,0,10,39.949348,-82.995285,3rd St & Sycamore St,8/6/2016 11:49:55,1,39.955864,-83.003106,Bicentennial Park,8/6/2016 12:59:12,Customer,,2016-08-06


In [11]:
# FIXME: "OOP" (presumably the order of operations) is wrong right now.
# Trips should be aggregated to the appropriate grain first and only then
# joined out to the hexagons; otherwise things are likely to get weird when
# re-aggregating (e.g., from daily to lifetime grains).
hex_counts = plotting.counts_by_hexagon(df=cogo_data, resolution=9)
# Largest counts first.
df_agg = hex_counts.sort_values(by='value', ascending=False)

In [13]:
# Render the per-hexagon counts as a choropleth map with a legend.
choropleth_path = APP_ROOT / 'output' / 'choropleth_counts.html'
m_hex = plotting.choropleth_map(df_agg=df_agg, with_legend=True)
# Save a copy under ./output as HTML; the path is passed as a plain string.
m_hex.save(str(choropleth_path))
# Leaving the map as the last expression displays it inline in the notebook.
m_hex