In [2]:
import requests
import pandas as pd
import numpy as np
import datetime as dt
import glob
import os
import argparse

# Helper functions

def transitland_request(url):
    """Yield every item from a (possibly multi-page) Transitland API response.

    Starting at ``url``, each page is fetched with ``requests.get`` and decoded
    as JSON. Items are read from the one top-level key that is not ``"meta"``;
    the address of the following page is taken from ``meta["next"]`` and the
    generator stops once that value is missing or empty.

    Args:
        url: Address of the first page to request.

    Yields:
        The entries of each page's main list, in page order.

    Raises:
        requests.exceptions.HTTPError: If a page comes back with an HTTP error
            status.
        KeyError: If a page has no top-level key other than ``"meta"``.
    """
    next_url = url
    while next_url:
        response = requests.get(next_url)
        # Surface HTTP errors here; otherwise an error payload would be
        # decoded and iterated below as if it were a page of results.
        response.raise_for_status()
        data = response.json()
        # The link to the next page lives under 'meta'.
        next_url = data.get('meta', {}).get('next')
        # Transitland responses have one main key that isn't "meta".
        main_key = (set(data) - {'meta'}).pop()
        for item in data[main_key]:
            yield item

def get_vehicle_types(operator_onestop_id):
    """Map each route of an operator to its vehicle type.

    Queries the routes endpoint for ``operator_onestop_id`` and returns a dict
    keyed by the route's ``onestop_id`` whose values are the route's
    ``vehicle_type``, so a trip can later be colour coded by its route.
    """
    url = "http://transit.land/api/v1/routes?operated_by={}&per_page=1000&api_key={}".format(operator_onestop_id, MAPZEN_APIKEY)
    vehicle_type_by_route = {}
    for route in transitland_request(url):
        vehicle_type_by_route[route['onestop_id']] = route['vehicle_type']
    return vehicle_type_by_route

# Get stops

def get_stop_lats(operator_onestop_id):
    """Return a ``{stop onestop_id: lat}`` dict for the stops an operator serves.

    The value stored for each stop is the second element of its
    ``geometry.coordinates`` list.
    """
    url = "http://transit.land/api/v1/stops?served_by={}&per_page=1000&api_key={}".format(operator_onestop_id, MAPZEN_APIKEY)
    lat_by_stop = {}
    for stop in transitland_request(url):
        coordinates = stop['geometry']['coordinates']
        lat_by_stop[stop['onestop_id']] = coordinates[1]
    return lat_by_stop

def get_stop_lons(operator_onestop_id):
    """Return a ``{stop onestop_id: lon}`` dict for the stops an operator serves.

    The value stored for each stop is the first element of its
    ``geometry.coordinates`` list.
    """
    url = "http://transit.land/api/v1/stops?served_by={}&per_page=1000&api_key={}".format(operator_onestop_id, MAPZEN_APIKEY)
    lon_by_stop = {}
    for stop in transitland_request(url):
        coordinates = stop['geometry']['coordinates']
        lon_by_stop[stop['onestop_id']] = coordinates[0]
    return lon_by_stop

# Get Schedule data

def get_schedule_stop_pairs(operator_onestop_id, date):
    """Download one operator's schedule stop pairs for a single date.

    This is the largest request of the pipeline. The request URL is printed
    first, and a running total is printed after every 10000 pairs received.

    Returns:
        A 5-tuple of parallel lists, one entry per stop pair:
        ``(origin_times, destination_times, origin_stops, destination_stops,
        route_ids)``.
    """
    url = "http://transit.land/api/v1/schedule_stop_pairs?date={}&operator_onestop_id={}&per_page=1000&api_key={}".format(date, operator_onestop_id, MAPZEN_APIKEY)
    print(url)

    # Response fields, listed in the same order as the returned lists.
    fields = (
        'origin_departure_time',
        'destination_arrival_time',
        'origin_onestop_id',
        'destination_onestop_id',
        'route_onestop_id',
    )
    columns = tuple([] for _ in fields)

    for received, pair in enumerate(transitland_request(url), 1):
        if received % 10000 == 0:
            print(received)
        for column, field in zip(columns, fields):
            column.append(pair[field])

    return columns

def calculate_durations(origin_times, destination_times):
    """Return the travel time, in seconds, of each origin/destination pair.

    Both arguments are parallel lists of ``"H:M:S"`` strings. Hours are used
    as given (they are not wrapped at 24), so a pair such as ``"23:50:00"`` ->
    ``"26:00:00"`` yields a positive duration.
    """
    def seconds_of(clock):
        parts = clock.split(':')
        return int(parts[0]) * 60 * 60 + int(parts[1]) * 60 + int(parts[2])

    departures = [seconds_of(clock) for clock in origin_times]
    arrivals = [seconds_of(clock) for clock in destination_times]
    return [arrival - departure for departure, arrival in zip(departures, arrivals)]

def clean_times(origin_times, destination_times):
    """Wrap schedule times onto a 24-hour clock.

    Operators often publish non-real times such as ``"26:00:00"`` to mean 2am
    on the following day. The hour of every ``"H:M:S"`` string is reduced
    modulo 24; the minute and second fields are passed through untouched.

    The hour is written zero-padded to two digits (``"02:00:00"``, not
    ``"2:00:00"``) so the resulting strings are fixed-width and sort
    chronologically even when compared as plain text.

    Args:
        origin_times: List of departure time strings.
        destination_times: List of arrival time strings.

    Returns:
        ``(origin_times_clean, destination_times_clean)``, two new lists in
        the same order as the inputs.
    """
    def wrap_hour(clock):
        parts = clock.split(':')
        hour = "{:02d}".format(int(parts[0]) % 24)
        return ":".join([hour, parts[1], parts[2]])

    origin_times_clean = [wrap_hour(clock) for clock in origin_times]
    destination_times_clean = [wrap_hour(clock) for clock in destination_times]
    return origin_times_clean, destination_times_clean

def add_dates(date, origin_times_clean, destination_times_clean):
    """Prefix cleaned times with a calendar date, producing datetime strings.

    Every origin time is stamped with ``date`` (``YYYY-MM-DD``). A destination
    time is stamped with the following day when its hour is smaller than the
    origin's hour, and with ``date`` otherwise.

    Returns:
        ``(origin_datetimes, destination_datetimes)`` as lists of
        ``"<date> <time>"`` strings.
    """
    same_day = dt.datetime.strptime(date, "%Y-%m-%d").date()
    next_day = same_day + dt.timedelta(days=1)

    origin_datetimes = []
    destination_datetimes = []

    for position, departure in enumerate(origin_times_clean):
        arrival = destination_times_clean[position]
        # Only the hour fields are compared to detect a trip that crosses midnight.
        crosses_midnight = int(arrival.split(":")[0]) < int(departure.split(":")[0])
        arrival_day = next_day if crosses_midnight else same_day

        origin_datetimes.append(str(same_day) + " " + departure)
        destination_datetimes.append(str(arrival_day) + " " + arrival)

    return origin_datetimes, destination_datetimes

# Output

def generate_output(operator_onestop_id, origin_datetimes, destination_datetimes, durations, origin_stops, destination_stops, route_ids, lookup_stop_lats, lookup_stop_lons, lookup_vehicle_types):
    """Assemble the per-trip output table, to be saved later as a csv.

    Args:
        operator_onestop_id: Not used by this function; kept so existing
            callers keep working.
        origin_datetimes: Start datetime of every stop pair.
        destination_datetimes: End datetime of every stop pair.
        durations: Duration of every stop pair.
        origin_stops: Origin stop id of every stop pair.
        destination_stops: Destination stop id of every stop pair.
        route_ids: Route id of every stop pair.
        lookup_stop_lats: Dict from stop id to latitude.
        lookup_stop_lons: Dict from stop id to longitude.
        lookup_vehicle_types: Dict from route id to vehicle type.

    Returns:
        A ``pandas.DataFrame`` with one row per stop pair and the columns
        ``start_time, start_lat, start_lon, end_time, end_lat, end_lon,
        duration, route_type`` in that order.

    Raises:
        KeyError: If a stop id is missing from either stop lookup.
    """
    origin_stop_lats = [lookup_stop_lats[stop] for stop in origin_stops]
    origin_stop_lons = [lookup_stop_lons[stop] for stop in origin_stops]
    destination_stop_lats = [lookup_stop_lats[stop] for stop in destination_stops]
    destination_stop_lons = [lookup_stop_lons[stop] for stop in destination_stops]

    # Routes missing from the lookup are labelled "NA". A plain dict lookup
    # with a default is used rather than a catch-all except, so unrelated
    # errors (or a Ctrl-C) are never swallowed here.
    vehicle_types = [lookup_vehicle_types.get(route_id, "NA") for route_id in route_ids]

    column_order = ['start_time', 'start_lat', 'start_lon', 'end_time', 'end_lat', 'end_lon', 'duration', 'route_type']
    output = pd.DataFrame({
        'route_type': vehicle_types,
        'start_time': origin_datetimes,
        'start_lat': origin_stop_lats,
        'start_lon': origin_stop_lons,
        'end_time': destination_datetimes,
        'end_lat': destination_stop_lats,
        'end_lon': destination_stop_lons,
        'duration': durations,
    })
    # Fix the column order explicitly rather than relying on dict ordering.
    return output[column_order]

# Combine data

def concatenate_csvs(path):
    """Combine every ``*.csv`` file in ``path`` into one time-sorted DataFrame.

    The files are read in sorted filename order and stacked. The unnamed
    index column that ``DataFrame.to_csv`` writes by default is dropped when
    present, ``start_time`` and ``end_time`` are parsed into datetimes, and
    the rows are ordered by ``start_time`` with a fresh 0..n-1 index.

    Args:
        path: Directory containing the per-operator csv files.

    Returns:
        The concatenated ``pandas.DataFrame``.

    Raises:
        ValueError: If ``path`` contains no csv files.
    """
    # os.path.join keeps the pattern OS independent; sorting makes the read
    # order (and therefore the output) reproducible between runs.
    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_files:
        raise ValueError("No csv files found in {!r}".format(path))

    frames = (pd.read_csv(csv_file) for csv_file in all_files)
    concatenated_df = pd.concat(frames, ignore_index=True)

    # Files saved with index=False have no such column, hence errors='ignore'.
    concatenated_df = concatenated_df.drop('Unnamed: 0', axis=1, errors='ignore')

    for column in ('start_time', 'end_time'):
        concatenated_df[column] = pd.to_datetime(concatenated_df[column])

    # A stable sort keeps rows with equal start times in file order.
    concatenated_df = concatenated_df.sort_values(by="start_time", kind="mergesort")
    return concatenated_df.reset_index(drop=True)

def animate_one_day(operator_onestop_id, date):
    """Build the trip table for one operator on one date.

    Runs the whole pipeline in order: route and stop lookups, the schedule
    stop pair download, duration calculation, time cleaning and dating, and
    finally the output table, which is returned ordered by ``start_time``
    with a fresh index.

    NOTE: ``start_time`` holds strings at this point, so that ordering is a
    text comparison.
    """
    vehicle_type_by_route = get_vehicle_types(operator_onestop_id)
    lat_by_stop = get_stop_lats(operator_onestop_id)
    lon_by_stop = get_stop_lons(operator_onestop_id)

    pairs = get_schedule_stop_pairs(operator_onestop_id, date)
    departures, arrivals, from_stops, to_stops, routes = pairs

    # Durations are taken from the raw times, before hours are wrapped at 24.
    seconds_per_pair = calculate_durations(departures, arrivals)
    departures_clean, arrivals_clean = clean_times(departures, arrivals)
    start_stamps, end_stamps = add_dates(date, departures_clean, arrivals_clean)

    table = generate_output(
        operator_onestop_id,
        start_stamps,
        end_stamps,
        seconds_per_pair,
        from_stops,
        to_stops,
        routes,
        lat_by_stop,
        lon_by_stop,
        vehicle_type_by_route,
    )
    return table.sort_values(by='start_time').reset_index(drop=True)

def animate_many_operators(operators, date):
    """Run ``animate_one_day`` for every operator and save one csv per operator.

    Each operator's table is written to
    ``data/indiv_operators/<OUTPUT_NAME>/<operator>.csv``. An operator counts
    as a success only once its csv has been written; if either the download
    or the write raises, the operator is recorded as a failure, the reason is
    printed, and processing continues with the next operator.

    Progress, the start and end timestamps and the elapsed time are printed.

    Args:
        operators: Sized iterable of operator onestop ids.
        date: Service date string passed through to ``animate_one_day``.

    Returns:
        ``(results, failures)``: the list of DataFrames for the operators
        that succeeded, and the list of operator ids that failed.
    """
    starttime = dt.datetime.now()
    print(starttime)

    results = []
    failures = []

    length = len(operators)

    for count, operator in enumerate(operators, 1):
        print("{} {} / {}".format(operator, count, length))
        try:
            output = animate_one_day(operator, date)
            output.to_csv("data/indiv_operators/{}/{}.csv".format(OUTPUT_NAME, operator))
        except Exception as error:
            # One bad operator must not abort the batch, but say why it failed.
            failures.append(operator)
            print("failed: {!r}".format(error))
        else:
            results.append(output)
            print("success!")

    endtime = dt.datetime.now()
    print(endtime)
    print("Time taken: {}".format(endtime - starttime))
    return results, failures

# Stacked bar chart functions

def _count_trips_in_progress(trips, ticks):
    """Return, for each tick, how many trips satisfy start_time <= tick < end_time."""
    starts = pd.to_datetime(trips['start_time']).values.astype('datetime64[ns]')
    ends = pd.to_datetime(trips['end_time']).values.astype('datetime64[ns]')

    # A trip that does not end after it starts (or has a missing time) is
    # never in progress. Dropping those rows makes the subtraction below
    # exact: every remaining trip that has ended has also started.
    in_motion = ends > starts
    started = np.searchsorted(np.sort(starts[in_motion]), ticks, side='right')
    finished = np.searchsorted(np.sort(ends[in_motion]), ticks, side='right')
    return started - finished


def count_vehicles_on_screen(concatenated_df, date):
    """Count the trips in progress at every 15-second tick of one day.

    The day starting at midnight of ``date`` (``YYYY-MM-DD``) is sampled every
    15 seconds (5760 ticks). A trip is in progress at a tick when
    ``start_time <= tick < end_time``. Counts are produced for all trips
    together and separately for the route types ``bus``, ``tram``,
    ``cablecar``, ``metro``, ``rail`` and ``ferry``.

    The counts come from sorting the start and end times once and binary
    searching the ticks, instead of filtering the whole table at every tick,
    so no per-hour progress output is printed.

    Args:
        concatenated_df: DataFrame with ``start_time``, ``end_time`` and
            ``route_type`` columns.
        date: Day to sample, as a ``YYYY-MM-DD`` string.

    Returns:
        Seven DataFrames, each with the columns ``time`` and ``count``, in the
        order ``(vehicles, buses, trams, metros, cablecars, trains, ferries)``.

    Raises:
        ValueError: If ``date`` does not match ``YYYY-MM-DD``.
    """
    step_seconds = 15
    steps_per_day = 24 * 60 * 60 // step_seconds

    midnight = pd.to_datetime(dt.datetime.strptime(date, "%Y-%m-%d"))
    # Every 15 seconds in the day.
    the_day = [midnight + dt.timedelta(seconds=step_seconds * i) for i in range(steps_per_day)]
    ticks = pd.DatetimeIndex(the_day).values.astype('datetime64[ns]')

    def counts_frame(trips):
        counts = _count_trips_in_progress(trips, ticks)
        return pd.DataFrame({'time': the_day, 'count': counts}, columns=['time', 'count'])

    vehicles = counts_frame(concatenated_df)

    route_types = concatenated_df['route_type']
    per_mode = {}
    for route_type in ['bus', 'tram', 'cablecar', 'metro', 'rail', 'ferry']:
        per_mode[route_type] = counts_frame(concatenated_df[route_types == route_type])

    # The return order (metros before cablecars) is part of the interface.
    return (
        vehicles,
        per_mode['bus'],
        per_mode['tram'],
        per_mode['metro'],
        per_mode['cablecar'],
        per_mode['rail'],
        per_mode['ferry'],
    )

In [5]:
# API key interpolated into every Transitland request URL built in this file.
# NOTE(review): the key is hard-coded in source and is also printed by the
# run cell -- consider loading it from the environment instead.
MAPZEN_APIKEY = "mapzen-ai1duha"
# Label for this run: names the per-operator csv directory and the combined csv.
OUTPUT_NAME = "sacramento"
# Service date (YYYY-MM-DD) to download schedules for.
date = "2017-07-17"
# Bounding box as "south,west,north,east" (the order it is unpacked in below).
bbox = "38.466493,-121.640625,38.683366,-121.297302"
# The four parts stay strings; they are only interpolated into the operators URL.
south, west, north, east = bbox.split(",")

In [6]:
# Echo the run configuration.
print("date:  {}".format(date))
print("name:  {}".format(OUTPUT_NAME))
print("apikey:  {}".format(MAPZEN_APIKEY))
print("south, west, north, east {} {} {} {}".format(south, west, north, east))

# First, let's get a list of the onestop id's for every operator in our bounding box.
# The operators endpoint takes the box as west,south,east,north.
operators_url = "http://transit.land/api/v1/operators?bbox={},{},{},{}&per_page=1000&api_key={}".format(west, south, east, north, MAPZEN_APIKEY)
operators_in_bbox = {i['onestop_id'] for i in transitland_request(operators_url)}
print("{} operators in bounding box.".format(len(operators_in_bbox)))

# We will exclude national Amtrak trips from the visualization and vehicle counts.
operators_in_bbox -= {'o-9-amtrak'}
print("{} operators to be downloaded.".format(len(operators_in_bbox)))

# ### Run script on every operator and save each operator's results to a separate csv
# Both output directories are created up front so that neither the
# per-operator csv writes nor the final combined csv write fails on a
# missing directory.
indiv_operators_dir = "data/indiv_operators/{}".format(OUTPUT_NAME)
for directory in (indiv_operators_dir, "data/output"):
    if not os.path.exists(directory):
        os.makedirs(directory)
results, failures = animate_many_operators(operators_in_bbox, date)
print("{} operators successfully downloaded.".format(len(results)))
print("{} operators failed.".format(len(failures)))
if failures:
    print("failed operators: {}".format(failures))

# ### Concatenate all individual operator csv files into one big dataframe
concatenated_df = concatenate_csvs(indiv_operators_dir)
concatenated_df.to_csv("data/output/{}.csv".format(OUTPUT_NAME))

print(concatenated_df.shape)

date:  2017-07-17
name:  sacramento
apikey:  mapzen-ai1duha
south, west, north, east 38.466493 -121.640625 38.683366 -121.297302
14 operators in bounding box.
13 operators to be downloaded.
2017-08-15 11:07:46.478504
o-9qc-yuba~suttertransit 1 / 13
http://transit.land/api/v1/schedule_stop_pairs?date=2017-07-17&operator_onestop_id=o-9qc-yuba~suttertransit&per_page=1000&api_key=mapzen-ai1duha
success!
o-9qc-eldoradotransit 2 / 13
http://transit.land/api/v1/schedule_stop_pairs?date=2017-07-17&operator_onestop_id=o-9qc-eldoradotransit&per_page=1000&api_key=mapzen-ai1duha
success!
o-9qc-placercountytransit 3 / 13
http://transit.land/api/v1/schedule_stop_pairs?date=2017-07-17&operator_onestop_id=o-9qc-placercountytransit&per_page=1000&api_key=mapzen-ai1duha
success!
o-9q-amtrakcalifornia 4 / 13
http://transit.land/api/v1/schedule_stop_pairs?date=2017-07-17&operator_onestop_id=o-9q-amtrakcalifornia&per_page=1000&api_key=mapzen-ai1duha
success!
o-9qce-sacramentoregionaltransit 5 / 13
http://tr

KeyboardInterrupt: 