In [3]:
import json
import math
import multiprocessing
import os
import re

import numpy as np
import pandas as pd
import psycopg2
import requests
from sodapy import Socrata

In [24]:
def quick_dist(coord1, coord2):
    """
    Approximate the distance between two coordinate pairs without trigonometry,
    using fixed per-degree scale factors chosen for New York City.

    Each argument is an indexable pair. The difference at index 1 is scaled by
    12430/180, the difference at index 0 by 24901/360 times a fixed
    0.16133111759 factor, and the Euclidean norm of the two scaled differences
    is returned.

    NOTE(review): the constants look like Earth circumferences in miles, which
    would make index 1 the latitude and index 0 the longitude -- confirm, and
    confirm the 0.16133111759 shrink factor while at it.
    """
    delta_idx1 = coord1[1] - coord2[1]
    delta_idx0 = coord1[0] - coord2[0]
    scaled_idx1 = 12430 * (delta_idx1 / 180)
    scaled_idx0 = 24901 * (delta_idx0 / 360) * 0.16133111759
    return math.sqrt(math.pow(scaled_idx1, 2) + math.pow(scaled_idx0, 2))

In [11]:
def retrieve_data(limit=100000000, app_token=None):
    """
    Pulls data from NYC Open Data using their SODA APIs.

    Parameters:
        limit: maximum number of rows requested from each dataset. The default
            is large enough to mean "everything" for these datasets.
        app_token: optional Socrata application token; None makes anonymous
            requests.

    Returns:
        A dict with the raw records of three datasets, keyed 'crashes'
        (h9gi-nx95), 'centerlines' (yfsx-e2j3) and 'lanes' (cc5c-sm6z).
    """
    client = Socrata("data.cityofnewyork.us", app_token)
    try:
        return {'crashes': client.get("h9gi-nx95", limit=limit),
                'centerlines': client.get("yfsx-e2j3", limit=limit),
                'lanes': client.get("cc5c-sm6z", limit=limit)}
    finally:
        # Release the underlying HTTP session even if one of the downloads fails.
        client.close()

In [56]:
# Fetch the crashes, centerlines and lanes datasets into one dict keyed by name.
data = retrieve_data()



In [57]:
def filter_crashes(data):
    """
    Replace data['crashes'] with a trimmed list of bike crashes in which at
    least one cyclist was injured or killed.

    A crash is kept when any of its ``vehicle_type_code<N>`` fields equals
    'Bike' and the injured and killed cyclist counts sum to more than zero.
    Each kept crash is reduced to its date, time, float latitude/longitude and
    the two cyclist counts.

    The two counts are converted to integers separately before being added, and
    a missing count is treated as zero (and stored as '0'). Records whose
    coordinates, date/time or counts are missing or malformed are skipped.

    Parameters:
        data: dict with a 'crashes' list of raw crash records (dicts).

    Returns:
        None. ``data['crashes']`` is replaced in place.
    """
    vehicle_key = re.compile(r"vehicle_type_code\d+")
    bike_accidents = []
    for crash in data['crashes']:
        involves_bike = any(
            vehicle_key.match(key) and value == 'Bike'
            for key, value in crash.items()
        )
        if not involves_bike:
            continue
        try:
            injured = int(crash.get('number_of_cyclist_injured') or 0)
            killed = int(crash.get('number_of_cyclist_killed') or 0)
            if injured + killed <= 0:
                continue
            bike_accidents.append({
                'date': crash['crash_date'],
                'time': crash['crash_time'],
                'latitude': float(crash['latitude']),
                'longitude': float(crash['longitude']),
                'number_of_cyclist_injured': crash.get('number_of_cyclist_injured', '0'),
                'number_of_cyclist_killed': crash.get('number_of_cyclist_killed', '0'),
            })
        except (KeyError, TypeError, ValueError):
            # Best-effort filter: a record without usable fields is dropped
            # rather than aborting the whole pass.
            continue
    data['crashes'] = bike_accidents

In [58]:
def clean_lanes(data):
    """
    Replace data['lanes'] with simplified lane records.

    Each output record holds:
      - 'on_street': True when the raw 'onoffst' field equals 'ON'
      - 'directional_width': the raw 'lanecount' value
      - 'infrastructure': 'ft_facilit' if present, else 'tf_facilit',
        else the string 'unknown'
      - 'first_coord' / 'second_coord': the first two points of the first
        line in the geometry, swapped when 'bikedir' is 'L'
      - 'direction': only set for 'bikedir' values '2' ('two-way') and
        'X' ('construction')

    Returns None; ``data['lanes']`` is replaced in place.
    """
    direction_labels = (('2', 'two-way'), ('X', 'construction'))
    cleaned = []
    for raw in data['lanes']:
        record = {
            'on_street': raw['onoffst'] == 'ON',
            'directional_width': raw['lanecount'],
        }
        if 'ft_facilit' in raw:
            record['infrastructure'] = raw['ft_facilit']
        else:
            record['infrastructure'] = raw.get('tf_facilit', 'unknown')

        bike_direction = raw['bikedir']
        if bike_direction == 'L':
            record['first_coord'] = raw['the_geom']['coordinates'][0][1]
            record['second_coord'] = raw['the_geom']['coordinates'][0][0]
        else:
            record['first_coord'] = raw['the_geom']['coordinates'][0][0]
            record['second_coord'] = raw['the_geom']['coordinates'][0][1]
            for code, label in direction_labels:
                if bike_direction == code:
                    record['direction'] = label
        cleaned.append(record)
    data['lanes'] = cleaned

In [59]:
# Both helpers overwrite their entry in `data` (data['crashes'], data['lanes'])
# with the cleaned records, so re-running this cell would feed them
# already-cleaned input.
filter_crashes(data)
clean_lanes(data)

In [90]:
# Collect the centerline segments whose street label mentions the Manhattan
# Bridge and report how many there are.
counter = [
    segment
    for segment in data['centerlines']
    if "MANHATTAN BRIDGE" in segment['st_label']
]
print(len(counter))

64


In [92]:
# Display one raw centerline record to inspect the fields it carries.
data['centerlines'][15]

{'the_geom': {'type': 'MultiLineString',
  'coordinates': [[[-74.13609026778178, 40.60413086111318],
    [-74.13532248219795, 40.604189662118465]]]},
 'l_high_hn': '69',
 'l_low_hn': '43',
 'physicalid': '94303',
 'r_low_hn': '44',
 'r_high_hn': '60',
 'l_zip': '10314',
 'r_zip': '10314',
 'l_blkfc_id': '1622612036',
 'r_blkfc_id': '1622609781',
 'st_label': 'PRESIDENT ST',
 'status': '2',
 'borocode': '5',
 'st_width': '38',
 'created': '2007-11-29T00:00:00.000Z',
 'modified': '2017-03-17T00:00:00.000Z',
 'trafdir': 'TW',
 'rw_type': '1',
 'frm_lvl_co': '13',
 'to_lvl_co': '13',
 'snow_pri': 'S',
 'post_type': 'ST',
 'full_stree': 'PRESIDENT ST',
 'st_name': 'PRESIDENT',
 'shape_leng': '214.271871219'}

In [None]:
def combine_data(data, datapoint):
    """
    Attribute cyclist injuries and deaths from nearby crashes to one lane.

    A crash counts towards the lane when its distance to either lane endpoint
    is smaller than the lane's own endpoint-to-endpoint distance, as measured
    by ``quick_dist`` (defined elsewhere in this notebook).

    Parameters:
        data: dict with a 'crashes' list; each crash needs 'latitude',
            'longitude', 'number_of_cyclist_injured' and
            'number_of_cyclist_killed'.
        datapoint: the lane record to score; needs 'first_coord' and
            'second_coord'.

    Returns:
        The same lane dict, with 'injuries' and 'deaths' set to the totals
        found. Both keys are always set (0 when no crash is close enough) and
        are recomputed from scratch, so scoring a lane twice does not
        double-count.
    """
    lane = datapoint
    first_coord = lane['first_coord']
    second_coord = lane['second_coord']
    # The lane's own length is the search radius around each endpoint.
    street_dist = quick_dist(first_coord, second_coord)

    injuries = 0
    deaths = 0
    for crash in data['crashes']:
        # Lane coordinates come straight from the geometry as [longitude, latitude],
        # so the crash point is built in that same order before measuring.
        crash_coord = (crash['longitude'], crash['latitude'])
        first_dist = quick_dist(first_coord, crash_coord)
        second_dist = quick_dist(second_coord, crash_coord)
        if street_dist > first_dist or street_dist > second_dist:
            injuries += int(crash['number_of_cyclist_injured'])
            deaths += int(crash['number_of_cyclist_killed'])

    lane['injuries'] = injuries
    lane['deaths'] = deaths
    return lane

In [65]:
def call_scoring_func(lane):
    """
    Single-argument adapter around ``score_lane``.

    Passes the given lane together with the notebook-level ``data`` dict to
    ``score_lane`` and returns whatever it returns.

    NOTE(review): ``score_lane`` is not defined in the visible cells -- confirm
    where it comes from before running on a fresh kernel.
    """
    scored_lane = score_lane(data, lane)
    return scored_lane

In [66]:
def pool(data):
    """
    Score every lane in ``data['lanes']`` using a multiprocessing worker pool.

    Maps ``call_scoring_func`` over the lanes and returns the list of results
    in the same order as the input lanes.
    """
    with multiprocessing.Pool() as workers:
        return workers.map(call_scoring_func, data['lanes'])

In [67]:
# Run the pooled scoring only when this is the main module; the results are
# kept in `threaded` for the later cells.
if __name__ == '__main__':
    threaded = pool(data)

In [None]:
def cds(data):
    """
    Attach to every lane a list of nearby alternative lanes under 'options'.

    For each lane, every other entry in ``data`` is considered. An entry
    becomes an option when ``quick_dist`` between the two 'first_coord' values
    is below 0.5 and both its 'first_coord' and its 'second_coord' differ from
    the lane's. Each option is a small dict with the entry's coordinates and
    'type', plus 'injuries' / 'deaths' when the entry has them.

    The lanes are modified in place (they gain the 'options' key) and returned
    in a new list in the original order. A percentage is printed every 5000
    lanes.
    """
    enriched = []
    for position, lane in enumerate(data, start=1):
        nearby = []
        for candidate in data:
            if not quick_dist(lane['first_coord'], candidate['first_coord']) < 0.5:
                continue
            is_distinct = (lane['first_coord'] != candidate['first_coord']
                           and lane['second_coord'] != candidate['second_coord'])
            if not is_distinct:
                continue
            summary = {
                'first_coord': candidate['first_coord'],
                'second_coord': candidate['second_coord'],
                'type': candidate['type'],
            }
            for stat in ('injuries', 'deaths'):
                if stat in candidate:
                    summary[stat] = candidate[stat]
            nearby.append(summary)
        lane['options'] = nearby
        if position % 5000 == 0:
            print(f"{int(position / len(data) * 100)}%")
        enriched.append(lane)
    return enriched

In [14]:
# Attach the nearby-lane 'options' list to every scored lane.
lane_data = cds(threaded)

26%
53%
80%


In [37]:
# NOTE(review): the connection URL below embeds a username and password in the
# notebook; consider supplying it from the environment instead.
%load_ext sql
%sql postgresql://alex:password@localhost:5432/alex

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [38]:
# Load the scored lanes into Postgres and print the first five stored rows.
# Connection settings are read from the standard PG* environment variables.
# The fallbacks preserve the notebook's original local settings.
# NOTE(review): the fallback password is still a credential in source --
# remove it once PGPASSWORD is set wherever this notebook runs.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", 5432)),
    database=os.environ.get("PGDATABASE", "alex"),
    user=os.environ.get("PGUSER", "alex"),
    password=os.environ.get("PGPASSWORD", "password"),
)
try:
    # `with conn` commits on success and rolls back on error; it does not close
    # the connection, hence the surrounding try/finally.
    with conn:
        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE IF NOT EXISTS lanes (first_coord_1 FLOAT, first_coord_2 FLOAT, second_coord_1 FLOAT, second_coord_2 FLOAT, type VARCHAR(50), injuries INT, deaths INT, id INT GENERATED ALWAYS AS IDENTITY);""")
            # NOTE: re-running this cell appends the same lanes again.
            cur.executemany(
                """INSERT INTO lanes (first_coord_1, first_coord_2, second_coord_1, second_coord_2, type, injuries, deaths) VALUES (%s, %s, %s, %s, %s, %s, %s);""",
                [
                    (lane['first_coord'][0], lane['first_coord'][1],
                     lane['second_coord'][0], lane['second_coord'][1],
                     lane['type'], lane['injuries'], lane['deaths'])
                    for lane in lane_data
                ],
            )
            cur.execute("""SELECT * FROM lanes LIMIT 5;""")
            query_results = cur.fetchall()
finally:
    conn.close()
print(query_results)

[(40.72315861141582, -73.87218201068114, 40.72352286351853, -73.87137759976227, 'Sharrows', 0, 0, 1), (40.57717211991796, -74.00066694563638, 40.577121471058895, -74.00110488656607, 'Standard', 0, 0, 2), (40.662347802483495, -73.84937839467118, 40.66217490965365, -73.84931944600652, 'Unknown', 0, 0, 3), (40.661046003203786, -73.97950974891293, 40.661099666912, -73.97926926441511, 'Greenway', 0, 0, 4), (40.72529744997435, -74.00921397183593, 40.72541007010525, -74.0091943858655, 'Unknown', 0, 0, 5)]
