In [32]:
import requests
import math
import json
import pandas as pd
import re
from sodapy import Socrata
import multiprocessing
import psycopg2
from sqlalchemy import create_engine

In [2]:
def cd():
    data = requests.get('https://data.cityofnewyork.us/api/views/cc5c-sm6z/rows.json')
    raw_data = data.json()['data']
    cleaner_data = []
    for lane in raw_data:
        coords = lane[8].split(',')
        if lane[-1] != None:
            cleaner_data.append({'first_coord': (float(coords[0][18:].split(' ')[1]), float(coords[0][18:].split(' ')[0])), 'second_coord': (float(coords[1][1:-2].split(' ')[1]), float(coords[1][1:-2].split(' ')[0])), 'type': lane[-1]})
        else:
            cleaner_data.append({'first_coord': (float(coords[0][18:].split(' ')[1]), float(coords[0][18:].split(' ')[0])), 'second_coord': (float(coords[1][1:-2].split(' ')[1]), float(coords[1][1:-2].split(' ')[0])), 'type': 'Unknown'})
    return cleaner_data

In [3]:
lanes_from_api = cd()

In [4]:
def quick_dist(coord1, coord2):
    return math.sqrt(math.pow(12430*((coord1[1]-coord2[1])/180),2)+math.pow(24901*((coord1[0]-coord2[0])/360)*0.16133111759,2))

In [27]:
def cds(data):
    actionable_data = []
    assembly_array = []
    count = 0
    for lane in data:
        newlane = lane
        count += 1
        for option in data:
            if quick_dist(lane['first_coord'], option['first_coord']) < 0.5 and lane['first_coord'] != option['first_coord'] and lane['second_coord'] != option['second_coord']:
                obj = {'first_coord': option['first_coord'], 'second_coord': option['second_coord'], 'type': option['type']}
                if 'injuries' in option.keys():
                    obj['injuries'] = option['injuries']
                if 'deaths' in option.keys():
                    obj['deaths'] = option['deaths']
                assembly_array.append(obj)
        newlane['options'] = assembly_array
        assembly_array = []
        if count % 5000 == 0:
            print(str(int(count/len(data)*100))+'%')
        actionable_data.append(newlane)
    return actionable_data

In [6]:
def retrieve_crashes():
    client = Socrata("data.cityofnewyork.us", None)
    results = client.get("h9gi-nx95", limit="100000000")
    return results

In [7]:
crash_data = retrieve_crashes()



In [8]:
def parse_crashes(data):
    regex = r"vehicle_type_code\d+"
    bike_accidents = []
    errors = 0
    for crash in data:
        for item in crash.keys():
            if re.match(regex, item) and crash[item] == 'Bike' and int(crash['number_of_cyclist_injured'] + crash['number_of_cyclist_killed']) > 0:
                try:
                    bike_accidents.append({
                        'date': crash['crash_date'],
                        'time': crash['crash_time'],
                        'latitude': float(crash['latitude']),
                        'longitude': float(crash['longitude']),
                        'number_of_cyclist_injured': crash['number_of_cyclist_injured'],
                        'number_of_cyclist_killed': crash['number_of_cyclist_killed'],
                    })
                except:
                    errors += 1
                break
    print("The percentage of bike crashes which are invalid is:", errors/len(bike_accidents))
    return bike_accidents

In [9]:
bike_crashes = parse_crashes(crash_data)

The percentage of bike crashes which are invalid is: 0.08198757763975155


In [10]:
def score_lane(crash_data, lane):
    lane_dist = quick_dist(lane['first_coord'], lane['second_coord']) / 2
    for crash in crash_data:
        first_dist = quick_dist(lane['first_coord'], (crash['latitude'], crash['longitude']))
        second_dist = quick_dist(lane['second_coord'], (crash['latitude'], crash['longitude']))
        if lane_dist > first_dist or lane_dist > second_dist:
            try:
                lane['injuries'] += int(crash['number_of_cyclist_injured'])
                lane['deaths'] += int(crash['number_of_cyclist_killed'])
            except:
                lane['injuries'] = int(crash['number_of_cyclist_injured'])
                lane['deaths'] = int(crash['number_of_cyclist_killed'])
    return lane

In [11]:
def call_scoring_func(lane):
    return score_lane(bike_crashes, lane)

In [12]:
def pool(lanes):
    with multiprocessing.Pool() as pool:
        multiproc = pool.map(call_scoring_func, lanes)
    return multiproc

In [13]:
if __name__ == '__main__':
    threaded = pool(lanes_from_api)

In [14]:
count = 0
for x in threaded:
    if 'injuries' in x.keys() or 'deaths' in x.keys():
        count += 1
count / len(threaded)

0.4492258408969568

In [28]:
lane_data = cds(threaded)

26%
53%
80%


In [29]:
lane_data[100]

{'first_coord': (40.670002667989024, -73.92034889591703),
 'second_coord': (40.669977233332105, -73.91987744194444),
 'type': 'Standard',
 'injuries': 5,
 'deaths': 0,
 'options': [{'first_coord': (40.670036359198015, -73.9209733705167),
   'second_coord': (40.670029756223435, -73.9208509751104),
   'type': 'Standard'},
  {'first_coord': (40.69216530176907, -73.91475403047755),
   'second_coord': (40.691737609817096, -73.91400199049562),
   'type': 'Unknown'},
  {'first_coord': (40.71331347525393, -73.9199811023929),
   'second_coord': (40.712448189187704, -73.91852593338456),
   'type': 'Signed Route',
   'injuries': 4,
   'deaths': 0},
  {'first_coord': (40.62973036353593, -73.92221536892792),
   'second_coord': (40.628527667961826, -73.9220865670328),
   'type': 'Standard',
   'injuries': 2,
   'deaths': 0},
  {'first_coord': (40.69623968472667, -73.92191460440849),
   'second_coord': (40.69621988765262, -73.92187980985112),
   'type': 'Unknown'},
  {'first_coord': (40.6894086959289

In [43]:
%load_ext sql
%sql postgresql://alex:password@localhost:5432/alex
engine = create_engine('postgresql://alex:password@localhost:5432/alex')

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
