In [1]:
import os
import timestring
import pandas as pd
import numpy as np
from geopy.distance import vincenty

In [2]:
def collision_clean(file_path):
    """
    Load the raw collisions CSV and reduce it to the fields used downstream.

    Steps, in order:
      1. keep only the nine source columns listed in ``column_map``;
      2. drop every row that has a missing value in any of those columns;
      3. rename the columns to their ``c_``-prefixed names;
      4. add ``c_veh``: 1 when both ``c_cyc`` and ``c_ped`` equal 0,
         otherwise 0;
      5. write the result to 'collisions_clean.csv' in the current working
         directory.

    Args:
        file_path: path of the raw collisions CSV file.

    Returns:
        The cleaned pandas DataFrame (same content as the written CSV).

    Raises:
        ValueError: if ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError('The file path is not valid')

    # (source column, cleaned column) pairs; list order fixes column order.
    column_map = [
        ("objectid", "c_id"),
        ("X", "c_long"),
        ("Y", "c_lat"),
        ("incdate", "c_date"),
        ("incdttm", "c_datetime"),
        ("pedcount", "c_ped"),
        ("pedcylcount", "c_cyc"),
        ("severitycode", "c_severity_code"),
        ("severitydesc", "c_severity_desc"),
    ]
    source_columns = [source for source, _ in column_map]

    raw = pd.read_csv(file_path, sep=',', header=0)
    cleaned = raw[source_columns].dropna().rename(columns=dict(column_map))

    # Flag collisions that involved neither a cyclist nor a pedestrian.
    no_cyclist = cleaned['c_cyc'] == 0
    no_pedestrian = cleaned['c_ped'] == 0
    cleaned['c_veh'] = np.where(no_cyclist & no_pedestrian, 1, 0)

    cleaned.to_csv('collisions_clean.csv')
    return cleaned


In [3]:
def buildings_clean(file_path):
    """
    Load the Building Permits CSV and keep only the permits of interest.

    The first CSV column is read as the index. After narrowing the frame to
    the twelve columns listed in ``column_map``, the following filters are
    applied one after another:
      * "Action Type" equals "NEW";
      * "Value" is strictly greater than 250000;
      * "Issue Date" is present;
      * "Final Date" or "Expiration Date" is present;
      * "Status" is not "CANCELLED".
    Columns are then renamed to their ``b_``-prefixed names and the result is
    written to 'buildings_clean.csv' in the current working directory.

    Args:
        file_path: path of the permits CSV file.

    Returns:
        The filtered, renamed pandas DataFrame.

    Raises:
        ValueError: if ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError('The file path is not valid')

    # (source column, cleaned column) pairs; list order fixes column order.
    column_map = [
        ("Application/Permit Number", "b_id"),
        ("Permit Type", "b_permit_type"),
        ("Category", "b_category"),
        ("Action Type", "b_action_type"),
        ("Work Type", "b_work_type"),
        ("Value", "b_value"),
        ("Issue Date", "b_issue_date"),
        ("Final Date", "b_final_date"),
        ("Expiration Date", "b_expiration_date"),
        ("Status", "b_status"),
        ("Latitude", "b_latitude"),
        ("Longitude", "b_longitude"),
    ]

    permits = pd.read_csv(file_path, sep=',', header=0, index_col=0)
    permits = permits[[source for source, _ in column_map]]

    # Each predicate is evaluated on the rows that survived the previous one,
    # so the filters run in exactly this order.
    row_filters = [
        lambda frame: frame["Action Type"] == "NEW",
        lambda frame: frame["Value"] > 250000,
        lambda frame: frame["Issue Date"].notnull(),
        lambda frame: (frame["Final Date"].notnull() |
                       frame["Expiration Date"].notnull()),
        lambda frame: frame["Status"] != "CANCELLED",
    ]
    for keep_rows in row_filters:
        permits = permits[keep_rows(permits)]

    cleaned = permits.rename(columns=dict(column_map))
    cleaned.to_csv('buildings_clean.csv')
    return cleaned


In [4]:
def radius_table(collisions, buildings, max_radius_ft=1500):
    """
    Pairs every building site with the collisions that happened near it.

    Uses geopy's vincenty distance function to measure the distance, in feet,
    between each building site and each collision. Every (building, collision)
    pair whose distance is at most ``max_radius_ft`` becomes one row of the
    radius table. The table is also written to "radius_clean.csv" in the
    current working directory.

    Args:
        collisions: DataFrame with the columns c_id, c_lat, c_long and
            c_datetime (the layout returned by collision_clean).
        buildings: DataFrame with the columns b_id, b_latitude, b_longitude,
            b_issue_date and b_final_date, and optionally b_expiration_date
            (the layout returned by buildings_clean).
        max_radius_ft: largest building-to-collision distance, in feet, for a
            pair to be recorded. Defaults to 1500.

    Returns:
        DataFrame with one row per recorded pair and the columns:
            build_id: the building's b_id
            coll_id: the collision's c_id
            build_loc: (latitude, longitude) tuple of the building
            build_start_dt: b_issue_date parsed with timestring
            build_end_dt: b_final_date parsed with timestring; when the final
                date is missing, b_expiration_date is used instead
            coll_dt: c_datetime parsed with timestring
            coll_loc: (latitude, longitude) tuple of the collision
            radius: distance in feet between building and collision
        When no pair qualifies the frame is empty but still has these columns.
    """
    # NOTE(review): newer geopy releases replace vincenty with geodesic --
    # confirm the pinned geopy version still provides vincenty.
    columns = ['build_id', 'coll_id', 'build_loc', 'build_start_dt',
               'build_end_dt', 'coll_dt', 'coll_loc', 'radius']
    rad_data = []

    # The same raw date value recurs for every pair a row takes part in, so
    # each distinct value is parsed only once.
    parsed_dates = {}

    def parse_date(raw):
        if raw not in parsed_dates:
            parsed_dates[raw] = timestring.Date(raw).date
        return parsed_dates[raw]

    # Materialise the collision fields once instead of re-running iterrows()
    # over the whole collision table for every building.
    if collisions.empty or buildings.empty:
        coll_rows = []
    else:
        coll_rows = list(
            collisions[["c_id", "c_lat", "c_long", "c_datetime"]]
            .itertuples(index=False, name=None))

    for _, b in buildings.iterrows():
        b_loc = (b["b_latitude"], b["b_longitude"])

        # buildings_clean keeps permits that have an expiration date but no
        # final date, so fall back to the expiration date for those rows.
        # NOTE(review): if both are missing the null value is still handed to
        # timestring, as before -- its behaviour for that input is unverified.
        end_raw = b["b_final_date"]
        if pd.isnull(end_raw):
            end_raw = b.get("b_expiration_date", end_raw)

        for c_id, c_lat, c_long, c_datetime in coll_rows:
            c_loc = (c_lat, c_long)
            d = vincenty(b_loc, c_loc).ft
            if d <= max_radius_ft:
                rad_data.append({
                    'build_id': b["b_id"],
                    'coll_id': c_id,
                    'build_loc': b_loc,
                    'build_start_dt': parse_date(b["b_issue_date"]),
                    'build_end_dt': parse_date(end_raw),
                    'coll_dt': parse_date(c_datetime),
                    'coll_loc': c_loc,
                    'radius': d
                })

    # An empty record list would otherwise give a frame with no columns at
    # all, which breaks any caller that selects columns by name.
    if rad_data:
        rads = pd.DataFrame(rad_data)
    else:
        rads = pd.DataFrame(columns=columns)

    # Export dataframe to csv
    rads.to_csv("radius_clean.csv")
    return rads
        

In [5]:
# Build both cleaned tables. Each call also writes its result to a CSV
# (collisions_clean.csv / buildings_clean.csv) in the current working
# directory. The input paths are relative, so the notebook has to be run from
# the directory that contains the seattlecollision/ folder.
colls = collision_clean("seattlecollision/data/Collisions.csv")
builds = buildings_clean('seattlecollision/data/clean_permits.csv')

In [6]:
%%time

# Only the first few permits (builds.head()) are matched against the full
# collision table here; the wall time recorded for this cell covers that
# subset alone, not the whole permits table.
rads = radius_table(colls, builds.head())
rads.shape

Wall time: 3min 24s


In [7]:
# Preview the first building/collision pairs produced by radius_table.
rads.head()

Unnamed: 0,build_end_dt,build_id,build_loc,build_start_dt,coll_dt,coll_id,coll_loc,radius
0,2018-05-13,6552481,"(47.58900409, -122.304421)",2017-06-06,2016-12-31 11:54:00,162244258,"(47.591928735782034, -122.30851154024509)",1468.678126
1,2018-05-13,6552481,"(47.58900409, -122.304421)",2017-06-06,2016-12-20 10:45:00,162244267,"(47.58965910896437, -122.30677910424879)",629.053819
2,2018-05-13,6552481,"(47.58900409, -122.304421)",2017-06-06,2016-12-04 08:10:00,162244296,"(47.585020154303706, -122.30292070084373)",1499.648586
3,2018-05-13,6552481,"(47.58900409, -122.304421)",2017-06-06,2017-06-17 02:38:00,162244575,"(47.58849200541599, -122.3058896008914)",407.718078
4,2018-05-13,6552481,"(47.58900409, -122.304421)",2017-06-06,2004-02-03 00:00:00,162244984,"(47.588992204086274, -122.30516473950665)",183.585395


In [8]:
# (rows, columns) of the full cleaned permits table, for comparison with the
# builds.head() subset that was passed to radius_table.
builds.shape

(4833, 12)