# San Francisco Crime Classification
## Mapping Neighborhoods

In [1]:
import pandas as pd
# Load the training data; the 'Dates' column is converted to datetimes while reading.
df = pd.read_csv(
    "./train1.csv",
    parse_dates=['Dates'],
    infer_datetime_format=True,
)

In [206]:
# One frame per coarse crime type, each selected from df with a boolean mask
# on the 'Crime_Type' column.
df_violent = df[df['Crime_Type'] == 'VIOLENT CRIMES']
df_nocrime = df[df['Crime_Type'] == 'NO CRIMES']
df_sub = df[df['Crime_Type'] == 'SUBSTANCE-BASED CRIMES']
df_property = df[df['Crime_Type'] == 'PROPERTY CRIMES']
df_other = df[df['Crime_Type'] == 'OTHER CRIMES']

# Row counts in the order: violent, no-crime, substance, property, other.
tuple(map(len, (df_violent, df_nocrime, df_sub, df_property, df_other)))

(96878, 88956, 61489, 304847, 298754)

## A lot of the data preprocessing went into mapping the coordinates into actual neighborhoods...

In [3]:
import urllib.request
import json
def read_geo_url(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The body is decoded as UTF-8 before parsing. The response is opened in a
    ``with`` block so the underlying connection is closed even when reading
    or parsing fails.

    Raises whatever ``urllib.request.urlopen`` raises for an unreachable URL,
    and ``ValueError`` (``json.JSONDecodeError``) for a body that is not JSON.
    """
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return json.loads(payload.decode('UTF-8'))

def make_geojson(dataframe):
    """Build a GeoJSON FeatureCollection of points from a data frame.

    ``dataframe`` must have the columns 'Category', 'X' and 'Y'. Every row
    becomes one ``Feature`` whose Point coordinates are ``[X, Y]`` and whose
    ``properties['category']`` is the row's 'Category' value.

    Rows are read positionally rather than through a dict keyed by index
    label, so rows that share an index label each produce their own feature
    instead of overwriting one another.

    Returns a dict: ``{"type": "FeatureCollection", "features": [...]}``.
    """
    rows = zip(dataframe['Category'].tolist(),
               dataframe['X'].tolist(),
               dataframe['Y'].tolist())
    features = [
        {'type': 'Feature',
         'geometry': {'coordinates': [x_value, y_value], 'type': 'Point'},
         'properties': {'category': category}}
        for category, x_value, y_value in rows
    ]
    return {"type": "FeatureCollection", "features": features}

In [4]:
# NOTE(review): the map-drawing code in this cell is disabled. It sits inside the
# string literal below, so none of it executes. Before re-enabling it:
#   * `geojson_violent` and `geojson_larceny` are not defined in any visible cell
#     (presumably they were built with make_geojson(...) -- confirm);
#   * the "500 observations" comment disagrees with the `[:100]` slice.
'''
import folium
SF_COORDINATES = [37.79086, -122.40147]

# We should only use 500 observations as folium will stall ipython with any more points
geojson_violent['features'] = geojson_violent['features'][:100] 

# create empty map zoomed in on San Francisco
crime_map = folium.Map(location=SF_COORDINATES, zoom_start=12)
crime_map.choropleth(geo_str=geojson_larceny)
crime_map
# add a marker for every record in the filtered data, use a clustered view
'''
# Presumably echoes the string above: IPython appears to bind a cell's leading
# string literal to __doc__ -- TODO confirm.
print(__doc__)


import folium
SF_COORDINATES = [37.79086, -122.40147]

# We should only use 500 observations as folium will stall ipython with any more points
geojson_violent['features'] = geojson_violent['features'][:100] 

# create empty map zoomed in on San Francisco
crime_map = folium.Map(location=SF_COORDINATES, zoom_start=12)
crime_map.choropleth(geo_str=geojson_larceny)
crime_map
# add a marker for every record in the filtered data, use a clustered view



In [189]:
# Let's overlay the incidents with neighborhood boundaries.
# The boundary data comes from Code for America -- thanks, Code for America!
# https://github.com/codeforamerica/click_that_hood/blob/master/public/data/san-francisco.geojson
# The copy actually downloaded is the raw GeoJSON file at the URL below.
sf_path = 'https://raw.githubusercontent.com/joshuacano/DS-SF-24/master/folium_lecture/san-francisco.geojson'
hood_json = read_geo_url(sf_path)

In [190]:
import shapely.geometry
# Turn every GeoJSON feature into a shapely geometry paired with the
# neighborhood name taken from the feature's properties.
hood_shapes = [
    {"shape": shapely.geometry.shape(feature["geometry"]),
     "name": feature['properties']['name']}
    for feature in hood_json["features"]
]

print("hood_shapes created.")
print(len(hood_shapes))

hood_shapes created.
37


In [7]:
"""
Map each coordinate to a neighborhood.
This has to be done in small pieces because of memory constraints.
"""
# Silence pandas' chained-assignment warning for the column added to each chunk below.
pd.options.mode.chained_assignment = None
from datetime import datetime

# Rows handled per pass. Each pass is written to its own CSV file so that only
# one chunk is held in memory at a time.
CHUNK_SIZE = 15000


def _first_containing_hood(point, hood_shapes):
    """Return the name of the first entry in ``hood_shapes`` whose shape contains ``point``.

    Returns None when no shape contains the point.
    """
    for hood_shape in hood_shapes:
        # This checks whether the incident lies inside the neighborhood.
        if hood_shape['shape'].contains(point):
            return hood_shape['name']
    return None


for file_index, file in enumerate([df_violent, df_nocrime, df_sub, df_property, df_other]):
    print(datetime.now())
    print("file", file_index)
    for row_index in range(0, len(file), CHUNK_SIZE):
        print(row_index)
        # We do this CHUNK_SIZE rows at a time; otherwise the program will eat
        # all your memory!
        df_xy = file[row_index:(row_index + CHUNK_SIZE)][['X', 'Y']]
        # Each Point is built exactly once and matching stops at the first
        # containing polygon. Rows that fall in no polygon keep None.
        df_xy['shape_neighborhood'] = [
            _first_containing_hood(shapely.geometry.Point([x_value, y_value]), hood_shapes)
            for x_value, y_value in zip(df_xy['X'], df_xy['Y'])
        ]
        df_xy.to_csv("df_%s_%s.csv" % (file_index, row_index))
        print("write to file", file_index, row_index)
    print(datetime.now())

In [None]:
"""
Concatenate the data frames back.
"""
# Imported here so this cell does not depend on an earlier cell having run.
from datetime import datetime

# Read every per-chunk CSV file back in, category by category.
chunk_frames = []
for file_index, file in enumerate([df_violent, df_nocrime, df_sub, df_property, df_other]):
    print(datetime.now())
    print("file", file_index)
    # 15000 must match the chunk size used when the CSV files were written.
    for row_index in range(0, len(file), 15000):
        chunk_frames.append(
            pd.read_csv("df_%s_%s.csv" % (file_index, row_index), index_col=0))

df_final = pd.concat(chunk_frames, axis=0)

# Attach the neighborhood column to df, aligned on the index. Any
# 'shape_neighborhood' column already present is dropped first, so re-running
# this cell does not leave df with two columns of the same name.
df = pd.concat(
    [df_final['shape_neighborhood'],
     df.drop('shape_neighborhood', axis=1, errors='ignore')],
    axis=1)

# Re-derive the per-category frames from the augmented df so they carry the
# 'shape_neighborhood' column as well.
df_violent = df[df.Crime_Type == 'VIOLENT CRIMES']
df_nocrime = df[df.Crime_Type == 'NO CRIMES']
df_sub = df[df.Crime_Type == 'SUBSTANCE-BASED CRIMES']
df_property = df[df.Crime_Type == 'PROPERTY CRIMES']
df_other = df[df.Crime_Type == 'OTHER CRIMES']

In [194]:
# In case we have some points not in neighborhoods lets setup a default case
def fill_with_default(hood_shapes, frame):
    """Return one row per neighborhood, with a count of 0 where ``frame`` has none.

    A frame indexed by neighborhood name (index name 'name') is built from
    ``hood_shapes`` with ``X`` set to 0 and ``shape_neighborhood`` set to the
    name. It is then updated in place from ``frame``, aligned on index and
    column labels, and ``X`` is cast to int.
    """
    defaults = pd.DataFrame(hood_shapes)
    defaults = defaults.drop('shape', axis=1)
    defaults['X'] = 0
    defaults['shape_neighborhood'] = defaults['name']
    defaults = defaults.set_index(['name'])
    # Overwrite the zero defaults with whatever values `frame` supplies.
    defaults.update(frame)
    defaults['X'] = defaults['X'].astype(int)
    return defaults

In [204]:
def get_threshold_scale(frame, column):
    """Return a customized threshold scale for ``frame[column]``.

    Only strictly positive values are considered. The result is a list of six
    quantiles of those values, taken at six evenly spaced levels from 0.2 to
    0.95 inclusive (0.2, 0.35, 0.5, 0.65, 0.8, 0.95).
    """
    # Imported locally: no visible cell of the notebook imports numpy.
    import numpy as np

    # Filter once instead of once per quantile level.
    positive_values = frame[frame[column] > 0][column]
    return [positive_values.quantile(level)
            for level in np.linspace(0.2, 0.95, 6, endpoint=True)]

In [205]:
import folium
def mapping_crime(df, hood_json, hood_shapes):
    """Draw a choropleth of incident counts per neighborhood and return the map.

    Rows of ``df`` are grouped by 'shape_neighborhood' and counted; the
    non-null count of column 'X' serves as the incident frequency.
    ``fill_with_default`` adds neighborhoods from ``hood_shapes`` that have no
    rows, with a count of 0. The counts are joined to ``hood_json`` through
    each feature's ``properties.name``.
    """
    # Frequency of incidents per neighborhood, with the name kept as a column too.
    per_hood = df.groupby(['shape_neighborhood']).count()
    per_hood['shape_neighborhood'] = per_hood.index
    per_hood = fill_with_default(hood_shapes, per_hood[['X', 'shape_neighborhood']])

    # Associate the counts with the neighborhood outlines on a map.
    sf_map = folium.Map(location=[37.79086, -122.40147], zoom_start=12)
    choropleth_options = {
        'geo_str': hood_json,
        'data': per_hood,
        'columns': ['shape_neighborhood', 'X'],
        'threshold_scale': get_threshold_scale(per_hood, 'X'),
        'key_on': 'feature.properties.name',
        'fill_color': 'YlOrRd',
        'fill_opacity': 0.7,
        'line_opacity': 0.2,
        'legend_name': 'Crime Frequency',
    }
    sf_map.choropleth(**choropleth_options)
    return sf_map

## Crime Mapping, All Categories

In [210]:
# Choropleth over every row of df, i.e. all crime types together.
mapping_crime(df, hood_json, hood_shapes)

## Crime Mapping, Violent Crimes

In [213]:
# Choropleth restricted to df_violent (rows whose Crime_Type is 'VIOLENT CRIMES').
mapping_crime(df_violent, hood_json, hood_shapes)

## Crime Mapping, Property Crimes

In [214]:
# Choropleth restricted to df_property (rows whose Crime_Type is 'PROPERTY CRIMES').
mapping_crime(df_property, hood_json, hood_shapes)

## Crime Mapping, Substance-based Crimes

In [215]:
# Choropleth restricted to df_sub (rows whose Crime_Type is 'SUBSTANCE-BASED CRIMES').
mapping_crime(df_sub, hood_json, hood_shapes)