In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('max_columns', 100)

In [2]:
df = pd.read_csv('data/mst/clean_police_reports.csv')
print(df.shape)
df.head()

(333016, 23)


Unnamed: 0,incident_date,incident_time,incident_year,incident_day_of_week,report_datetime,filed_online,incident_category,incident_subcategory,police_district,analysis_neighborhood,latitude,longitude,"areas_of_vulnerability,_2016",incident_month,incident_day,incident_hour,zipcode,population,population_density,housing_units,occupied_housing_units,median_home_value,median_household_income
0,2019/05/01,01:00,2019,Wednesday,2019/06/12 08:27:00 PM,False,Offences Against The Family And Children,Other,Taraval,Sunset/Parkside,37.762569,-122.499627,1.0,5,1,1,94122.0,56023.0,23699.0,23633.0,22162.0,774500.0,81768.0
1,2019/06/22,07:45,2019,Saturday,2019/06/22 08:05:00 AM,False,Non-Criminal,Other,Southern,South of Market,37.780535,-122.408161,2.0,6,22,7,94103.0,27170.0,20036.0,14778.0,13098.0,628000.0,43364.0
2,2019/06/03,16:16,2019,Monday,2019/06/03 04:16:00 PM,False,Missing Person,Missing Person,Bayview,Bayview Hunters Point,37.7216,-122.390745,2.0,6,3,16,94124.0,33996.0,6901.0,10812.0,9717.0,470400.0,50146.0
3,2018/11/16,16:34,2018,Friday,2018/11/16 04:34:00 PM,False,Offences Against The Family And Children,Family Offenses,Central,Chinatown,37.79486,-122.404876,2.0,11,16,16,94104.0,406.0,5232.0,368.0,207.0,1000001.0,48750.0
4,2019/05/27,02:25,2019,Monday,2019/05/27 02:55:00 AM,False,Assault,Simple Assault,Northern,Marina,37.797716,-122.430559,1.0,5,27,2,94123.0,23088.0,22622.0,15083.0,13717.0,1000001.0,112650.0


In [3]:
print(list(df))

['incident_date', 'incident_time', 'incident_year', 'incident_day_of_week', 'report_datetime', 'filed_online', 'incident_category', 'incident_subcategory', 'police_district', 'analysis_neighborhood', 'latitude', 'longitude', 'areas_of_vulnerability,_2016', 'incident_month', 'incident_day', 'incident_hour', 'zipcode', 'population', 'population_density', 'housing_units', 'occupied_housing_units', 'median_home_value', 'median_household_income']


In [4]:
# Create crime column
df['crime'] = 1
df_2018 = df[df['incident_year'] == 2018].copy()
df_2019 = df[df['incident_year'] == 2019].copy()
print(df_2018.shape)
print(df_2019.shape)

(153662, 24)
(148685, 24)


In [5]:
# Create Train and Test periods - 2018 and 2019
predictors = ['incident_day_of_week', 'police_district', 'incident_month', 'incident_day', 
              'incident_hour', 'zipcode', 'population', 'population_density', 'housing_units',
              'occupied_housing_units', 'median_home_value', 'median_household_income']
target = ['crime']
df_2018 = df_2018[predictors+target]
df_2019 = df_2019[predictors+target]
print(df_2018.shape)
print(df_2019.shape)

(153662, 13)
(148685, 13)


In [6]:
# Produce crime counts per police_district per day of week + hour
group_by_cols = ['incident_day_of_week', 'police_district', 'incident_hour', 'zipcode']
info_cols = ['zipcode', 'population', 'population_density', 'housing_units',
             'occupied_housing_units', 'median_home_value', 'median_household_income']

df_2018_group = df_2018.groupby(group_by_cols)['crime'].sum().reset_index()
df_2019_group = df_2019.groupby(group_by_cols)['crime'].sum().reset_index()
df_zip_info = df[info_cols].drop_duplicates()
df_2018_group = pd.merge(df_2018_group, df_zip_info, on = 'zipcode', how = 'left')
df_2019_group = pd.merge(df_2019_group, df_zip_info, on = 'zipcode', how = 'left')
print(df_2018.shape)
print(df_2019.shape)

(153662, 13)
(148685, 13)


In [7]:
df_2018_group.head(5)

Unnamed: 0,incident_day_of_week,police_district,incident_hour,zipcode,crime,population,population_density,housing_units,occupied_housing_units,median_home_value,median_household_income
0,Friday,Bayview,0,94107.0,7,26599.0,14845.0,15141.0,13736.0,719700.0,114439.0
1,Friday,Bayview,0,94110.0,9,69333.0,29816.0,28913.0,27128.0,768200.0,82111.0
2,Friday,Bayview,0,94124.0,43,33996.0,6901.0,10812.0,9717.0,470400.0,50146.0
3,Friday,Bayview,0,94134.0,26,40798.0,17001.0,11867.0,11244.0,547800.0,56716.0
4,Friday,Bayview,0,94158.0,10,4792.0,7288.0,2482.0,2271.0,643900.0,82195.0


In [8]:
df_2018_group.to_csv('data/mst/df_train.csv', index = None)
df_2019_group.to_csv('data/mst/df_test.csv', index = None)