In [32]:
import requests
import math
import json
import pandas as pd
import numpy as np
import re
from sodapy import Socrata
import multiprocessing
import psycopg2

In [35]:
def cd():
    data = requests.get('https://data.cityofnewyork.us/api/views/cc5c-sm6z/rows.json')
    raw_data = data.json()['data']
    cleaner_data = []
    for lane in raw_data:
        coords = lane[8].split(',')
        if lane[-1] != None:
            cleaner_data.append({'first_coord': (float(coords[0][18:].split(' ')[1]), float(coords[0][18:].split(' ')[0])), 'second_coord': (float(coords[1][1:-2].split(' ')[1]), float(coords[1][1:-2].split(' ')[0])), 'type': lane[-1], 'deaths': 0, 'injuries': 0})
        else:
            cleaner_data.append({'first_coord': (float(coords[0][18:].split(' ')[1]), float(coords[0][18:].split(' ')[0])), 'second_coord': (float(coords[1][1:-2].split(' ')[1]), float(coords[1][1:-2].split(' ')[0])), 'type': 'Unknown', 'deaths': 0, 'injuries': 0})
    return cleaner_data

In [36]:
lanes_from_api = cd()

In [4]:
def quick_dist(coord1, coord2):
    return math.sqrt(math.pow(12430*((coord1[1]-coord2[1])/180),2)+math.pow(24901*((coord1[0]-coord2[0])/360)*0.16133111759,2))

In [5]:
def cds(data):
    actionable_data = []
    assembly_array = []
    count = 0
    for lane in data:
        newlane = lane
        count += 1
        for option in data:
            if quick_dist(lane['first_coord'], option['first_coord']) < 0.5 and lane['first_coord'] != option['first_coord'] and lane['second_coord'] != option['second_coord']:
                obj = {'first_coord': option['first_coord'], 'second_coord': option['second_coord'], 'type': option['type']}
                if 'injuries' in option.keys():
                    obj['injuries'] = option['injuries']
                if 'deaths' in option.keys():
                    obj['deaths'] = option['deaths']
                assembly_array.append(obj)
        newlane['options'] = assembly_array
        assembly_array = []
        if count % 5000 == 0:
            print(str(int(count/len(data)*100))+'%')
        actionable_data.append(newlane)
    return actionable_data

In [6]:
def retrieve_crashes():
    client = Socrata("data.cityofnewyork.us", None)
    results = client.get("h9gi-nx95", limit="100000000")
    return results

In [7]:
crash_data = retrieve_crashes()



In [8]:
def parse_crashes(data):
    regex = r"vehicle_type_code\d+"
    bike_accidents = []
    errors = 0
    for crash in data:
        for item in crash.keys():
            if re.match(regex, item) and crash[item] == 'Bike' and int(crash['number_of_cyclist_injured'] + crash['number_of_cyclist_killed']) > 0:
                try:
                    bike_accidents.append({
                        'date': crash['crash_date'],
                        'time': crash['crash_time'],
                        'latitude': float(crash['latitude']),
                        'longitude': float(crash['longitude']),
                        'number_of_cyclist_injured': crash['number_of_cyclist_injured'],
                        'number_of_cyclist_killed': crash['number_of_cyclist_killed'],
                    })
                except:
                    errors += 1
                break
    print("The percentage of bike crashes which are invalid is:", errors/len(bike_accidents))
    return bike_accidents

In [9]:
bike_crashes = parse_crashes(crash_data)

The percentage of bike crashes which are invalid is: 0.08206096574049096


In [10]:
def score_lane(crash_data, lane):
    lane_dist = quick_dist(lane['first_coord'], lane['second_coord']) / 2
    for crash in crash_data:
        first_dist = quick_dist(lane['first_coord'], (crash['latitude'], crash['longitude']))
        second_dist = quick_dist(lane['second_coord'], (crash['latitude'], crash['longitude']))
        if lane_dist > first_dist or lane_dist > second_dist:
            try:
                lane['injuries'] += int(crash['number_of_cyclist_injured'])
                lane['deaths'] += int(crash['number_of_cyclist_killed'])
            except:
                lane['injuries'] = int(crash['number_of_cyclist_injured'])
                lane['deaths'] = int(crash['number_of_cyclist_killed'])
    return lane

In [11]:
def call_scoring_func(lane):
    return score_lane(bike_crashes, lane)

In [12]:
def pool(lanes):
    with multiprocessing.Pool() as pool:
        multiproc = pool.map(call_scoring_func, lanes)
    return multiproc

In [13]:
if __name__ == '__main__':
    threaded = pool(lanes_from_api)

In [14]:
lane_data = cds(threaded)

26%
53%
80%


In [37]:
%load_ext sql
%sql postgresql://alex:password@localhost:5432/alex

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [38]:
conn = psycopg2.connect(host="localhost", port = 5432, database="alex", user="alex", password="password")
cur = conn.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS lanes (first_coord_1 FLOAT, first_coord_2 FLOAT, second_coord_1 FLOAT, second_coord_2 FLOAT, type VARCHAR(50), injuries INT, deaths INT, id INT GENERATED ALWAYS AS IDENTITY);""")
for lane in lane_data:
    cur.execute("""INSERT INTO lanes (first_coord_1, first_coord_2, second_coord_1, second_coord_2, type, injuries, deaths) VALUES (%s, %s, %s, %s, %s, %s, %s);""", (lane['first_coord'][0], lane['first_coord'][1], lane['second_coord'][0], lane['second_coord'][1], lane['type'], lane['injuries'], lane['deaths']))
cur.execute("""SELECT * FROM lanes LIMIT 5;""")
query_results = cur.fetchall()
print(query_results)
conn.commit()
cur.close()
conn.close()

[(40.72315861141582, -73.87218201068114, 40.72352286351853, -73.87137759976227, 'Sharrows', 0, 0, 1), (40.57717211991796, -74.00066694563638, 40.577121471058895, -74.00110488656607, 'Standard', 0, 0, 2), (40.662347802483495, -73.84937839467118, 40.66217490965365, -73.84931944600652, 'Unknown', 0, 0, 3), (40.661046003203786, -73.97950974891293, 40.661099666912, -73.97926926441511, 'Greenway', 0, 0, 4), (40.72529744997435, -74.00921397183593, 40.72541007010525, -74.0091943858655, 'Unknown', 0, 0, 5)]
