In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

This notebook is used to preprocess the data for the 311 Data Analysis Project.

In [2]:
df = pd.read_csv('311_NYPD_6month.csv')

#keep relevant cols
keep_cols = ['Created Date', 'Closed Date', 'Complaint Type', 'Descriptor', 'Location Type',
             'Incident Zip', 'Due Date', 'Resolution Description', 'Resolution Action Updated Date',
             'Latitude', 'Longitude','Borough','City']

df = df[keep_cols]

#export for visna visualization
df.to_csv('relevant_unprocessed_311_data.csv')

#remove all rows with missing values.
df = df.dropna()

  interactivity=interactivity, compiler=compiler, result=result)


Check if we have any missing value left:

In [3]:
np.sum(df.isnull())

Created Date                      0
Closed Date                       0
Complaint Type                    0
Descriptor                        0
Location Type                     0
Incident Zip                      0
Due Date                          0
Resolution Description            0
Resolution Action Updated Date    0
Latitude                          0
Longitude                         0
Borough                           0
City                              0
dtype: int64

In [4]:
set(df['Resolution Description'])

{'The Police Department issued a summons in response to the complaint.',
 'The Police Department made an arrest in response to the complaint.',
 'The Police Department responded and upon arrival those responsible for the condition were gone.',
 'The Police Department responded to the complaint and a report was prepared.',
 'The Police Department responded to the complaint and determined that police action was not necessary.',
 'The Police Department responded to the complaint and took action to fix the condition.',
 'The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.',
 'The Police Department responded to the complaint but officers were unable to gain entry into the premises.',
 'The Police Department reviewed your complaint and provided additional information below.',
 "This complaint does not fall under the Police Department's jurisdiction.",
 'Your request can not be processed at this time because o

Check if police acted upon it, unable to act, or if the problem is a false alarm. Subgroup the aforementioned group.

In [5]:
comp_type = {'Noise':['Noise - Commercial','Noise - House of Worship','Noise - Park','Noise - Residential','Noise - Street/Sidewalk','Noise - Vehicle'], 
             'Street condition related':['Derelict Vehicle','Graffiti','Homeless Encampment','Panhandling','Vending', 'Posting Advertisement','Sidewalk Condition','Street Condition',],
             'Disturbance':['Bike/Roller/Skate Chronic','Disorderly Youth','Drinking','Drug Activity','Illegal Fireworks','Urinating in Public'],
             'Traffic related': ['Traffic','Illegal Parking','Blocked Driveway'],
             'Miscellaneous': ['Non-Emergency Police Matter','Animal Abuse']}
reso_type = {'Police Department Acted':['The Police Department issued a summons in response to the complaint.',
                                       'The Police Department made an arrest in response to the complaint.',
                                       'The Police Department responded to the complaint and took action to fix the condition.',
                                        'The Police Department responded to the complaint and a report was prepared.',
                                       'The Police Department reviewed your complaint and provided additional information below.'],
            'Police Department Unable to Act': ['The Police Department responded and upon arrival those responsible for the condition were gone.',
                                       'The Police Department responded to the complaint but officers were unable to gain entry into the premises.',
                                       "This complaint does not fall under the Police Department's jurisdiction.",
                                        'Your request can not be processed at this time because of insufficient contact information. Please create a new Service Request on NYC.gov and provide more detailed contact information.'],
            'False Alarm': ['The Police Department responded to the complaint and determined that police action was not necessary.',
                           'The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.']}

def subgroup_type(col, subgroups):
    """
    Helper method to help subgroup the data into mentioned subgroups. 
    
    Args: 
    - col: column to subgroup 
    - subgroups: dictionary of subgroups
    
    Returns: 
    - array of supgrouped items
    """
    array = np.zeros(len(col),dtype='object')
    for i,entry in enumerate(col):
        for key,comp_list in subgroups.items():
            if entry in comp_list:
                array[i] = key
    
    return(array)

df['Complaint Subgroups'] = subgroup_type(df['Complaint Type'],comp_type)
df['Resolution Subgroups'] = subgroup_type(df['Resolution Description'],reso_type)

### Data Enrichment: Add Police Station Distance
Now we enrich the data, and find the distance to the closest police station.

In [7]:
df_nypp = pd.read_csv('nypd_precincts.csv')

In [8]:
def get_centroid(col):
    """
    The centroid estimation for multi-polygon
    """
    centroid_list = []
    for i,val in enumerate(col):
        v_str = val[val.find('('):].strip('(').strip(')').split(',')
        v_coord = []
        for j,string in enumerate(v_str):
            if string[0] == ' ':
                string = string[1:]
            if string[-1] == ')':
                string = string[:-2]
            if string[0] == '(':
                string = string[2:]
            c0 = float(string[:string.find(' ')].strip(' '))
            c1 = float(string[string.find(' ')+1:].strip(' '))
            if abs(c0+70) >20 or abs(c1-40)>20:
                print('lmao something went south')
            v_coord.append((c0,c1)) 
            
        centroid_list.append(list(np.mean(v_coord,axis = 0).astype('float')))
    return(centroid_list)
def naive_dist(data,prec):
    """
    Naive Euclidean distance
    """
    R = 6371e3
    sqdt = ((np.radians(data)[:,np.newaxis,:]-np.radians(prec)[np.newaxis,:,:])**2)
    naive_geo = np.sqrt(np.sum(sqdt,axis=2))
    return(naive_geo*R)
def calculate_geodesic(data_lonlat,precinct_lonlat):
    """
    lonlat_0 should be the data one, 
    lonlat_1 should be the precinct one
    
    formulas given in the report
      
    """
    R = 6371e3
    data_rad = np.radians(data_lonlat)[:,np.newaxis,:]
    prec_rad = np.radians(precinct_lonlat)[np.newaxis,:,:]
    diff = data_rad-prec_rad
    a = np.sin(diff[:,:,0]/2)**2+np.cos(data_rad[:,:,0])*np.cos(prec_rad[:,:,0])*np.sin(diff[:,:,1]/2)**2
    c = 2*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return c*R

Test how will naive distance worked against true distance

In [9]:
calculate_geodesic(np.array([44,-77],dtype='float').reshape((1,-1)),np.array([45,-78],dtype='float').reshape((1,-1)))

array([[136578.40369675]])

In [10]:
naive_dist(np.array([44,-77],dtype='float').reshape((1,-1)),np.array([45,-78],dtype='float').reshape((1,-1)))

array([[157253.37332782]])

relative error

In [11]:
(157253.37332782-136578.40369675)/136578.40369675

0.15137802955272048

Now we use our functions to compute the things we want

In [12]:
#%debug
result = get_centroid(df_nypp['the_geom'])
result_arr = np.array(result,dtype='float')

In [13]:
data_arr = df[['Longitude','Latitude']].values

In [14]:
#%debug
dist_to_closest_station = np.min(calculate_geodesic(data_arr,result_arr),axis = 1)

In [15]:
df['Distance To Closest Station'] = dist_to_closest_station

In [16]:
df.to_csv('cleaned_311_data.csv')