In [None]:
import pandas as pd # version '0.18.1'
import numpy as np # version '1.10.4'
import matplotlib.pyplot as plt # version '2.1.0'
import ast
import re
import time

# Goal of this notebook

- For each 311 point, identify if there is an AHV near it. Add a column that says AHV = 1 or 0 in the 311 dataset

# Methodology

- This is the equivalent of running the buffer and intersect tools in ArcGIS
- The goal now is to get both AHV and 311 dataset to the same time period so that the spatial intersection can be conducted
- The test run for now focuses on a single year: 2016 (the complaints file loaded below is `2016_dep_noise.csv`)

# AHV

After much trial and error, we finally figured out that the AHV dataset is updated with a delay, i.e. records that were uploaded recently actually belong to a period a few months earlier.

In [None]:
# Read the AHV csv and convert its `timestamp` column to datetimes in one chain.
ahv = (
    pd.read_csv('ahv_lat_long.csv')
      .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)
# Preview the first rows.
ahv.head()

## Dealing with the range column of AHV

In [None]:
#code credit: the genius duke
def correct_json(s):
    '''
    Clean the raw permitted-times ("range") string so that its start/end
    values become quoted string literals that are easy to read back.

    Steps, in order:
      1. every "-HH:MM" fragment (dash, two digits, colon, two digits) is
         replaced by a closing single quote;
      2. an opening single quote is inserted right after "start': " and
         "end': ";
      3. every "u'" is replaced by "'" (e.g. u'start' becomes 'start').

    Input : s, the raw range string
    Output : the cleaned string
    '''
    # Raw string: \d must reach the regex engine as-is rather than being
    # parsed as an (unrecognised) string escape sequence.
    s = re.sub(r"-\d\d:\d\d", "'", s)
    s = s.replace("start': ", "start': '")
    s = s.replace("end': ", "end': '")
    # NOTE: this touches every "u'" in the string, not only key prefixes.
    s = s.replace("u'", "'")
    return s

# Store a cleaned version of every `range` string in a new `range_json` column.
ahv['range_json'] = ahv['range'].map(correct_json)
# Show the timestamp next to the cleaned and raw range values for a visual check.
ahv.loc[:, ['timestamp', 'range_json', 'range']].head()

### Datetime adjustments

The timestamp is NOT a good ballpark for the start dates in the range

In [None]:
# Parse the timestamp as UTC, then split out its calendar year and month so
# the permits can be selected by (year, month).
ahv['ballpark_time'] = pd.to_datetime(ahv['timestamp'], utc=True)
ahv['ballpark_year'] = ahv.ballpark_time.dt.year
ahv['ballpark_month'] = ahv.ballpark_time.dt.month


In [None]:
# Slice the permits by ballpark (year, month): all of 2016, plus the last two
# months of 2015 and the first two months of 2017.
ahv_2016 = ahv.loc[ahv.ballpark_year.eq(2016)]
ahv_2015_11 = ahv.loc[ahv.ballpark_year.eq(2015) & ahv.ballpark_month.eq(11)]
ahv_2015_12 = ahv.loc[ahv.ballpark_year.eq(2015) & ahv.ballpark_month.eq(12)]
ahv_2017_01 = ahv.loc[ahv.ballpark_year.eq(2017) & ahv.ballpark_month.eq(1)]
ahv_2017_02 = ahv.loc[ahv.ballpark_year.eq(2017) & ahv.ballpark_month.eq(2)]

In [None]:
# Stack the slices in chronological order, keeping every row's original index
# label. pd.concat replaces the chained DataFrame.append calls, which newer
# pandas releases no longer provide; the resulting frame is the same.
ahv_2016_final = pd.concat(
    [ahv_2015_11, ahv_2015_12, ahv_2016, ahv_2017_01, ahv_2017_02]
)

In [None]:
# Report (rows, columns) of the combined permit table, then preview it.
print(ahv_2016_final.shape)
ahv_2016_final.head(5)

# 311 Data

# New Methodology

### 1) Check if construction before or after hour

In [None]:
# Load the complaints csv into a DataFrame.
complaints16 = pd.read_csv(filepath_or_buffer='2016_dep_noise.csv')

In [None]:
# Parse the 'Created Date' values into datetimes, stored in a new `c_date` column.
complaints16['c_date'] = complaints16['Created Date'].pipe(pd.to_datetime)

In [None]:
# Report (rows, columns) of the complaints table, then preview it.
print(complaints16.shape)
complaints16.head(5)

In [None]:
# Keep only the complaints whose Descriptor is one of the four noise
# categories of interest. The explicit .copy() makes the result an independent
# frame, so adding a column to it later does not raise pandas'
# SettingWithCopyWarning.
complaints16 = complaints16[complaints16.Descriptor.isin([
    'Noise: Construction Before/After Hours (NM1)',
    'Noise: Jack Hammering (NC2)',
    'Noise: Construction Equipment (NC1)',
    'Noise: Manufacturing Noise (NK1)',
])].copy()

Because it takes too long to run the whole thing in one go, we will parallelize the work by splitting it up

### 2) For each complaint, if it is within 200m of ahv, save it

In [None]:
def return_dist(complaint_lat,complaint_lon,ahv_lat,ahv_lon):
    '''
    Haversine (great-circle) distance between a complaint and an AHV location.
    Input : complaint latitude, complaint longitude, AHV latitude, AHV longitude
            (degrees; they are converted to radians internally)
    Output : distance in metres
    '''
    import math

    earth_radius_km = 6373.0  # approximate radius of earth in km

    phi_c = math.radians(complaint_lat)
    lam_c = math.radians(complaint_lon)
    phi_a = math.radians(ahv_lat)
    lam_a = math.radians(ahv_lon)

    delta_lam = lam_a - lam_c
    delta_phi = phi_a - phi_c

    # Haversine term, then the central angle between the two points.
    hav = math.sin(delta_phi / 2)**2 + math.cos(phi_c) * math.cos(phi_a) * math.sin(delta_lam / 2)**2
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    # Kilometres along the sphere, scaled to metres.
    return earth_radius_km * central_angle * 1000

In [None]:
def filterdistance(complaint, ahv_df):
    '''
    Return the index labels of the AHV permits that lie within 200 m of a
    complaint.

    Input : complaint - a single complaint exposing `Latitude` and `Longitude`
            ahv_df    - dataframe of AHV permits exposing `lat` and `lon`
    Output : a new list of index labels from ahv_df (empty if none are close)

    The list is created fresh on every call. Matches used to be appended to a
    module-level list that was never cleared, so once any complaint had a
    nearby permit every later call also returned a non-empty list.
    '''
    complaint_lat = float(complaint.Latitude)
    complaint_lon = float(complaint.Longitude)

    nearby = []
    for index, permit in ahv_df.iterrows():  # for each ahv record
        # NOTE(review): return_dist expects (lat, lon, lat, lon) but the permit
        # is passed as (lon, lat). Left as in the original because the meaning
        # of the csv's `lat`/`lon` columns is not visible here - confirm
        # against the data and swap the arguments if the columns are named
        # correctly.
        if return_dist(complaint_lat, complaint_lon, permit.lon, permit.lat) < 200:  # closer than 200m
            nearby.append(index)

    return nearby

In [None]:
# Sanity check: the distance from a point to itself should be 0.
return_dist(-73.946165, 40.77410, -73.946165, 40.77410)

In [None]:
# Show the first complaint row.
complaints16.head(1)

In [None]:
# Show the first permit row.
ahv_2016_final.head(1)

In [None]:
def filtered_time(complaint, ahv):
    '''
    Find the AHV permits whose permitted time windows contain the moment a
    complaint was created.
    Input : complaint - a single complaint exposing `c_date`
            ahv       - dataframe of AHV permits with a `range_json` column
                        holding a literal list of {'start': ..., 'end': ...}
    Output : list of ahv index labels; a label appears once for every window
             of that permit that strictly contains c_date
    '''
    created = complaint.c_date
    matches = []
    for permit_idx, permit in ahv.iterrows():
        windows = ast.literal_eval(permit['range_json'])
        for window in windows:
            begins = pd.to_datetime(window['start'])
            ends = pd.to_datetime(window['end'])
            # Strict on both ends: a complaint exactly at start/end is excluded.
            if created > begins and created < ends:
                matches.append(permit_idx)
    return matches

In [None]:
# Flag each complaint with AHV = 1 when at least one permit both covers the
# complaint's creation time and passes the distance check; otherwise leave 0.
complaints16['AHV'] = 0
start = time.time()
for index, i in complaints16.iterrows():
    # Permits whose permitted time windows contain this complaint's c_date.
    time_indices = filtered_time(i, ahv_2016_final)
    # Label-based lookup of those permits. .loc replaces .ix, which current
    # pandas no longer has; the labels come from ahv_2016_final itself.
    ahv_final = ahv_2016_final.loc[time_indices]
    # Of the time-matched permits, keep the ones near the complaint.
    space_indices = filterdistance(i, ahv_final)
    if len(space_indices) > 0:
        # One .loc call on the frame itself; the chained form
        # complaints16.AHV.loc[index] = 1 can end up writing to a temporary.
        complaints16.loc[index, 'AHV'] = 1

print("Time to run: {} seconds".format(time.time()-start))

Next steps: After getting AHV final, need to do another filterdistance with 311 dataset, this time using index,i of complaints so that we can get the 311 points of interest and record AHV=1 for these points

In [None]:
# Write the flagged complaints (index included) to disk.
complaints16.to_csv('subset_files/ahv_1.csv', index=True)