# Overview of SF crime files
- Incident reports that have been filed by officers or self-reported by members of the public using SFPD’s online reporting system.
- The data comes as two datasets (2003-2018 and 2018-2023), which are filtered separately and merged at the end for EDA.

In [None]:
# Core numerical / tabular stack.
import numpy as np
import pandas as pd
# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# NOTE(review): a blanket "ignore" also silences pandas warnings such as
# SettingWithCopyWarning and deprecation FutureWarnings; consider narrowing it.
warnings.filterwarnings("ignore")
# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Colab-only step: mount Google Drive at /content/drive, where the CSV files
# read by this notebook are stored.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2003-2018

In [None]:
# Historical SFPD incident reports (2003 to May 2018), read from the mounted Drive.
historical_csv_path = '/content/drive/My Drive/ColabNotebooks/Police_Department_Incident_Reports__Historical_2003_to_May_2018_20231019.csv'
df_raw = pd.read_csv(filepath_or_buffer=historical_csv_path)

# Preview the first five rows.
df_raw.head(5)

Unnamed: 0,PdId,IncidntNum,Incident Code,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,location,SF Find Neighborhoods 2 2,Current Police Districts 2 2,Current Supervisor Districts 2 2,Analysis Neighborhoods 2 2,DELETE - Fire Prevention Districts 2 2,DELETE - Police Districts 2 2,DELETE - Supervisor Districts 2 2,DELETE - Zip Codes 2 2,DELETE - Neighborhoods 2 2,DELETE - 2017 Fix It Zones 2 2,Civic Center Harm Reduction Project Boundary 2 2,Fix It Zones as of 2017-11-06 2 2,DELETE - HSOC Zones 2 2,Fix It Zones as of 2018-02-07 2 2,"CBD, BID and GBD Boundaries as of 2017 2 2","Areas of Vulnerability, 2016 2 2",Central Market/Tenderloin Boundary 2 2,Central Market/Tenderloin Boundary Polygon - Updated 2 2,HSOC Zones as of 2018-06-05 2 2,OWED Public Spaces 2 2,Neighborhoods 2
0,4133422003074,41334220,3074,ROBBERY,"ROBBERY, BODILY FORCE",Monday,11/22/2004,17:50,INGLESIDE,NONE,GENEVA AV / SANTOS ST,-122.420084,37.708311,POINT (-122.420084075249 37.7083109744362),,,9.0,,,,8.0,309.0,,,,,,,,,,,,,
1,5118535807021,51185358,7021,VEHICLE THEFT,STOLEN AUTOMOBILE,Tuesday,10/18/2005,20:00,PARK,NONE,TURK ST / STJOSEPHS AV,-120.5,90.0,POINT (-120.50000000000001 90),,,,,,,,,,,,,,,,,,,,,
2,4018830907021,40188309,7021,VEHICLE THEFT,STOLEN AUTOMOBILE,Sunday,02/15/2004,02:00,SOUTHERN,NONE,BRANNAN ST / 1ST ST,-120.5,90.0,POINT (-120.50000000000001 90),,,,,,,,,,,,,,,,,,,,,
3,11014543126030,110145431,26030,ARSON,ARSON,Friday,02/18/2011,05:27,INGLESIDE,NONE,0 Block of SANJUAN AV,-122.43622,37.724377,POINT (-122.43622001281001 37.7243766140428),94.0,9.0,1.0,28.0,9.0,7.0,6.0,28861.0,25.0,,,,,,,1.0,,,,,94.0
4,10108108004134,101081080,4134,ASSAULT,BATTERY,Sunday,11/21/2010,17:00,SOUTHERN,NONE,400 Block of 10TH ST,-122.410541,37.770913,POINT (-122.410541166987 37.7709130566165),32.0,1.0,10.0,34.0,8.0,2.0,9.0,28853.0,34.0,,,,,,,2.0,,,,,32.0


## Filtering out crimes not near universities

In [None]:
# Campus reference points as (name, latitude, longitude) in decimal degrees.
# Sourced from latitude.to; more campuses can be appended if necessary.
#
# NOTE(review): "University of California, San Francisco" is listed twice with
# different coordinates — confirm that both points are intended campuses.
# NOTE(review): a few entries have longitudes well east of the others (around
# -122.25 and further east); SF incident data is unlikely to fall within the
# distance threshold of those points — confirm they are wanted.
coords_universities = [
    ('San Francisco State University', 37.720663784, -122.474498102),
    ('University of San Francisco', 37.774330236, -122.451164862),
    ('University of California, San Francisco', 37.7257157, -122.4501783),
    ('University of California, San Francisco', 37.7579886347, -122.455121513),
    ('Academy of Art University', 37.78785, -122.40065),
    ('Golden Gate University', 37.78923, -122.3988),
    ('University of California, Hastings College of the Law', 37.7751268995, -122.409201697),
    ('California College of the Arts', 37.83593, -122.2503),
    # The original name carried a stray trailing comma inside the string.
    ('University of the Pacific Arthur A. Dugoni School of Dentistry', 37.79, -122.432),
    ('California State University East Bay', 37.6575, -122.0568),
    ("Saint Mary's College of California", 37.8409, -122.1089),
    ("San Francisco Conservatory of Music", 37.7755, -122.4204),
    ("Hult International Business School", 37.8026, -122.4032),
    ("Presidio Graduate School", 37.7869, -122.4007),
    ("California Institute of Integral Studies", 37.7747, -122.4164),
]

In [None]:
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree
import folium
import plotly.graph_objects as go

# Tabulate the campus list, then attach one point per campus
# (x = longitude, y = latitude) as the geometry column.
universities_df = pd.DataFrame(data=coords_universities, columns=['University', 'Latitude', 'Longitude'])
campus_points = [
    Point(lon, lat)
    for lon, lat in zip(universities_df['Longitude'], universities_df['Latitude'])
]
universities_gdf = gpd.GeoDataFrame(universities_df, geometry=campus_points)

In [None]:
# Wrap the raw incidents in a GeoDataFrame; the geometry is built from the
# X (longitude) and Y (latitude) columns.
incident_points = gpd.points_from_xy(df_raw['X'], df_raw['Y'])
crime_gdf = gpd.GeoDataFrame(df_raw, geometry=incident_points)

In [None]:
# Keep only incidents within NEAR_THRESHOLD_M metres (great-circle distance) of
# the nearest campus.
#
# scikit-learn's 'haversine' metric requires every point as
# [latitude, longitude] in RADIANS. Passing [longitude, latitude] in degrees
# (as this cell did before) produces meaningless distances.
EARTH_RADIUS_M = 6371000
NEAR_THRESHOLD_M = 3000  # for example, using 3km as the threshold
# NOTE(review): with correct distances a 3 km radius around every campus may
# cover a large share of the city — revisit the threshold.

universities_rad = np.radians(universities_gdf[['Latitude', 'Longitude']].to_numpy(dtype=float))
# In this file Y holds latitude and X holds longitude.
crimes_rad = np.radians(crime_gdf[['Y', 'X']].to_numpy(dtype=float))

# Build a spatial index for efficient nearest-neighbor queries
tree = BallTree(universities_rad, metric='haversine')

# Distance (in radians on the unit sphere) and index of the nearest campus for
# each incident; both arrays have shape (n_incidents, 1).
distances, indices = tree.query(crimes_rad, k=1, return_distance=True)

# Convert the angular distances to meters.
distances_meters = distances * EARTH_RADIUS_M

# Flatten to a 1-D boolean mask so it aligns row-for-row with crime_gdf.
near_university = distances_meters.ravel() <= NEAR_THRESHOLD_M

# Incidents near at least one campus, and the complement.
crimes_near_universities = crime_gdf[near_university]
crimes_not_near_universities = crime_gdf[~near_university]

crimes_near_universities.head()

Unnamed: 0,PdId,IncidntNum,Incident Code,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,location,SF Find Neighborhoods 2 2,Current Police Districts 2 2,Current Supervisor Districts 2 2,Analysis Neighborhoods 2 2,DELETE - Fire Prevention Districts 2 2,DELETE - Police Districts 2 2,DELETE - Supervisor Districts 2 2,DELETE - Zip Codes 2 2,DELETE - Neighborhoods 2 2,DELETE - 2017 Fix It Zones 2 2,Civic Center Harm Reduction Project Boundary 2 2,Fix It Zones as of 2017-11-06 2 2,DELETE - HSOC Zones 2 2,Fix It Zones as of 2018-02-07 2 2,"CBD, BID and GBD Boundaries as of 2017 2 2","Areas of Vulnerability, 2016 2 2",Central Market/Tenderloin Boundary 2 2,Central Market/Tenderloin Boundary Polygon - Updated 2 2,HSOC Zones as of 2018-06-05 2 2,OWED Public Spaces 2 2,Neighborhoods 2,geometry
1310,4027035407021,40270354,7021,VEHICLE THEFT,STOLEN AUTOMOBILE,Saturday,03/06/2004,22:00,SOUTHERN,NONE,FOLSOM ST / RODGERS ST,-122.409482,37.775451,POINT (-122.40948237086701 37.7754506237928),32.0,1.0,10.0,34.0,8.0,2.0,9.0,28853.0,34.0,24.0,1.0,6.0,1.0,6.0,,2.0,,,1.0,,32.0,POINT (-122.40948 37.77545)
2751,7079764128160,70797641,28160,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Friday,08/03/2007,15:00,SOUTHERN,NONE,0 Block of RODGERS ST,-122.409046,37.775176,POINT (-122.409046343419 37.7751758248112),32.0,1.0,10.0,34.0,8.0,2.0,9.0,28853.0,34.0,24.0,,6.0,,6.0,,2.0,,,,,32.0,POINT (-122.40905 37.77518)
4412,12616398628150,126163986,28150,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM",Sunday,10/21/2012,17:45,SOUTHERN,NONE,0 Block of NEWMONTGOMERY ST,-122.400593,37.787602,POINT (-122.40059325966801 37.7876024687437),108.0,6.0,10.0,8.0,12.0,2.0,9.0,28855.0,6.0,,,,,,4.0,1.0,,,,,108.0,POINT (-122.40059 37.78760)
6152,4052246206224,40522462,6224,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Sunday,11/23/2003,00:01,SOUTHERN,NONE,1400 Block of MISSION ST,-122.416469,37.774751,POINT (-122.416468768127 37.774750702851),32.0,1.0,10.0,34.0,8.0,2.0,9.0,28853.0,34.0,,1.0,,1.0,,7.0,2.0,1.0,1.0,1.0,,32.0,POINT (-122.41647 37.77475)
9761,8026358505151,80263585,5151,BURGLARY,"BURGLARY,STORE UNDER CONSTRUCTION, FORCIBLE ENTRY",Tuesday,03/11/2008,18:30,SOUTHERN,NONE,600 Block of MISSION ST,-122.400931,37.78719,POINT (-122.40093111397 37.7871900467437),32.0,1.0,10.0,8.0,12.0,2.0,9.0,28855.0,6.0,,,,,,4.0,1.0,,,,,32.0,POINT (-122.40093 37.78719)


In [None]:
# Interactive map of campuses (blue) and nearby incidents (red).

# Center the map on the mean campus coordinate.
average_lat, average_lon = universities_gdf['Latitude'].mean(), universities_gdf['Longitude'].mean()

# Create a base map
m = folium.Map(location=[average_lat, average_lon], zoom_start=13)

# One marker per incident does not scale to large selections, so plot at most
# MAX_CRIME_MARKERS incidents, sampled reproducibly.
MAX_CRIME_MARKERS = 5000
crimes_to_plot = crimes_near_universities
if len(crimes_to_plot) > MAX_CRIME_MARKERS:
    crimes_to_plot = crimes_to_plot.sample(n=MAX_CRIME_MARKERS, random_state=0)

# Incidents: Y is latitude, X is longitude; folium expects [lat, lon].
for lat, lon in zip(crimes_to_plot['Y'], crimes_to_plot['X']):
    folium.Marker([lat, lon], icon=folium.Icon(color='red')).add_to(m)

# Campuses, with the campus name shown on hover.
for name, lat, lon in zip(universities_gdf['University'],
                          universities_gdf['Latitude'],
                          universities_gdf['Longitude']):
    folium.Marker([lat, lon], tooltip=name, icon=folium.Icon(color='blue')).add_to(m)

# Display the map
m

In [None]:
# Combine the separate Date (MM/DD/YYYY) and Time (HH:MM) text columns into one
# timestamp. An explicit format avoids slow per-row inference and fails loudly
# on an unexpected layout.
incident_timestamps = pd.to_datetime(
    crimes_near_universities['Date'] + ' ' + crimes_near_universities['Time'],
    format='%m/%d/%Y %H:%M',
)

# assign() returns a new frame instead of writing into a filtered slice.
crimes_near_universities = crimes_near_universities.assign(datetime=incident_timestamps)

# Keep incidents strictly before 2018 (presumably so this file does not overlap
# with the 2018-onward file — confirm). copy() makes the result an independent
# frame that later cells can modify safely.
crimes_near_universities = crimes_near_universities.loc[
    crimes_near_universities['datetime'] < "2018-01-01"
].copy()

crimes_near_universities['datetime'].describe()

count                    2475
unique                   2089
top       2005-05-11 10:40:00
freq                        8
first     2003-01-01 06:48:00
last      2017-12-28 03:36:00
Name: datetime, dtype: object

In [None]:
# Number of retained historical incidents per crime category, most frequent first.
historical_category_counts = crimes_near_universities['Category'].value_counts()
historical_category_counts


LARCENY/THEFT                  822
OTHER OFFENSES                 294
NON-CRIMINAL                   276
ASSAULT                        208
BURGLARY                       162
VANDALISM                      139
VEHICLE THEFT                  130
SUSPICIOUS OCC                  92
WARRANTS                        57
ROBBERY                         53
FRAUD                           48
MISSING PERSON                  33
TRESPASS                        30
DRUG/NARCOTIC                   29
STOLEN PROPERTY                 17
RECOVERED VEHICLE               13
WEAPON LAWS                     12
FORGERY/COUNTERFEITING          12
DRUNKENNESS                     11
EMBEZZLEMENT                     7
DRIVING UNDER THE INFLUENCE      6
SECONDARY CODES                  6
SEX OFFENSES, FORCIBLE           5
DISORDERLY CONDUCT               3
ARSON                            3
BAD CHECKS                       2
SUICIDE                          2
KIDNAPPING                       2
LIQUOR LAWS         

# 2018-2023

In [None]:
# SFPD incident reports from 2018 onward, read from the mounted Drive.
current_csv_path = '/content/drive/My Drive/ColabNotebooks/Police_Department_Incident_Reports__2018_to_Present_20231101.csv'

# Drop rows without a 'Point' value while loading. The list form of `subset`
# works on every pandas version, and chaining avoids mutating the frame in place.
df_raw2 = pd.read_csv(current_csv_path).dropna(subset=['Point'])
df_raw2.head()

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,CAD Number,Report Type Code,Report Type Description,Filed Online,Incident Code,Incident Category,Incident Subcategory,Incident Description,Resolution,Intersection,CNN,Police District,Analysis Neighborhood,Supervisor District,Supervisor District 2012,Latitude,Longitude,Point,Neighborhoods,ESNCAG - Boundary File,Central Market/Tenderloin Boundary Polygon - Updated,Civic Center Harm Reduction Project Boundary,HSOC Zones as of 2018-06-05,Invest In Neighborhoods (IIN) Areas,Current Supervisor Districts,Current Police Districts
9,2023/03/11 02:00:00 PM,2023/03/11,14:00,2023,Saturday,2023/03/15 11:21:00 AM,125431804134,1254318,230182844,230741133.0,II,Initial,,4134,Assault,Simple Assault,Battery,Open or Active,STANYAN ST \ HAYES ST,26446000.0,Park,Golden Gate Park,1.0,1.0,37.772895,-122.454285,POINT (-122.45428511766733 37.772895177200766),,,,,,,4.0,7.0
11,2022/06/27 12:00:00 PM,2022/06/27,12:00,2022,Monday,2023/03/15 05:20:00 PM,125439371000,1254393,230184129,230742480.0,II,Initial,,71000,Lost Property,Lost Property,Lost Property,Open or Active,GEARY ST \ POWELL ST,24903000.0,Central,Financial District/South Beach,3.0,3.0,37.787359,-122.408227,POINT (-122.40822672700406 37.78735926098589),19.0,,1.0,,,,3.0,6.0
13,2023/03/16 05:30:00 PM,2023/03/16,17:30,2023,Thursday,2023/03/16 06:02:00 PM,125482604134,1254826,230187101,230752550.0,II,Initial,,4134,Assault,Simple Assault,Battery,Open or Active,18TH ST \ DE HARO ST,23743000.0,Bayview,Potrero Hill,10.0,10.0,37.76229,-122.401324,POINT (-122.40132418490647 37.76228996810526),54.0,,,,,,9.0,2.0
33,2023/03/21 03:50:00 PM,2023/03/21,15:50,2023,Tuesday,2023/03/21 04:01:00 PM,125656351040,1256563,230199764,230802198.0,II,Initial,,51040,Non-Criminal,Non-Criminal,Aided Case,Open or Active,POST ST \ LARKIN ST,25167000.0,Northern,Tenderloin,3.0,6.0,37.787038,-122.418271,POINT (-122.41827098126804 37.787037946181535),50.0,,,,,,10.0,6.0
61,2021/08/22 09:40:00 AM,2021/08/22,09:40,2021,Sunday,2021/08/22 09:40:00 AM,106267662071,1062676,210537297,212340900.0,II,Initial,,62071,Warrant,Other,Probation Search,Open or Active,LAGUNA ST \ PACIFIC AVE,26569000.0,Northern,Pacific Heights,2.0,2.0,37.793977,-122.429804,POINT (-122.42980398313114 37.79397724418211),102.0,,,,,,6.0,4.0


## Filtering out crimes not near universities

In [None]:
# Same nearest-campus filter as for the 2003-2018 file, applied to the
# 2018-2023 incidents (which store coordinates as Latitude / Longitude).
universities_df = pd.DataFrame(coords_universities, columns=['University', 'Latitude', 'Longitude'])
universities_gdf = gpd.GeoDataFrame(universities_df, geometry=[Point(xy) for xy in zip(universities_df.Longitude, universities_df.Latitude)])

crime_gdf2 = gpd.GeoDataFrame(df_raw2, geometry=gpd.points_from_xy(x=df_raw2['Longitude'], y=df_raw2['Latitude']))

# scikit-learn's 'haversine' metric requires every point as
# [latitude, longitude] in RADIANS. Passing [longitude, latitude] in degrees
# (as this cell did before) produces meaningless distances.
EARTH_RADIUS_M = 6371000
NEAR_THRESHOLD_M = 3000  # for example, using 3km as the threshold
# NOTE(review): with correct distances a 3 km radius around every campus may
# cover a large share of the city — revisit the threshold.

universities_rad = np.radians(universities_gdf[['Latitude', 'Longitude']].to_numpy(dtype=float))
# NOTE(review): the tree query cannot handle NaN coordinates; rows lacking
# 'Point' were dropped at load time, which presumably also removes missing
# Latitude/Longitude — verify.
crimes_rad = np.radians(crime_gdf2[['Latitude', 'Longitude']].to_numpy(dtype=float))

# Build a spatial index for efficient nearest-neighbor queries
tree = BallTree(universities_rad, metric='haversine')

# Distance (in radians on the unit sphere) and index of the nearest campus for
# each incident; both arrays have shape (n_incidents, 1).
distances, indices = tree.query(crimes_rad, k=1, return_distance=True)

# Convert the angular distances to meters.
distances_meters = distances * EARTH_RADIUS_M

# Flatten to a 1-D boolean mask so it aligns row-for-row with crime_gdf2.
near_university = distances_meters.ravel() <= NEAR_THRESHOLD_M

# Incidents near at least one campus, and the complement.
crimes_near_universities2 = crime_gdf2[near_university]
crimes_not_near_universities2 = crime_gdf2[~near_university]

crimes_near_universities2.head()

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,CAD Number,Report Type Code,Report Type Description,Filed Online,Incident Code,Incident Category,Incident Subcategory,Incident Description,Resolution,Intersection,CNN,Police District,Analysis Neighborhood,Supervisor District,Supervisor District 2012,Latitude,Longitude,Point,Neighborhoods,ESNCAG - Boundary File,Central Market/Tenderloin Boundary Polygon - Updated,Civic Center Harm Reduction Project Boundary,HSOC Zones as of 2018-06-05,Invest In Neighborhoods (IIN) Areas,Current Supervisor Districts,Current Police Districts,geometry
2114,2022/10/28 02:50:00 AM,2022/10/28,02:50,2022,Friday,2022/10/29 02:38:00 PM,125701171000,1257011,226239532,,II,Coplogic Initial,True,71000,Lost Property,Lost Property,Lost Property,Open or Active,MISSION ST \ SHAW ALY,24602000.0,Southern,Financial District/South Beach,6.0,6.0,37.788981,-122.398564,POINT (-122.39856429383492 37.78898117261747),108.0,,,,,,10.0,1.0,POINT (-122.39856 37.78898)
2657,2023/03/24 03:30:00 PM,2023/03/24,15:30,2023,Friday,2023/03/24 05:09:00 PM,125764226105,1257642,230207129,230832481.0,II,Initial,,26105,Other Miscellaneous,Other,Bomb Threat or False Report of Bomb,Open or Active,MISSION ST \ SHAW ALY,24602000.0,Southern,Financial District/South Beach,6.0,6.0,37.788981,-122.398564,POINT (-122.39856429383492 37.78898117261747),108.0,,,,,,10.0,1.0,POINT (-122.39856 37.78898)
4528,2023/09/19 11:15:00 PM,2023/09/19,23:15,2023,Tuesday,2023/09/20 10:07:00 AM,132409006224,1324090,236187874,,II,Coplogic Initial,True,6224,Larceny Theft,Larceny - From Vehicle,"Theft, From Unlocked Vehicle, >$950",Open or Active,NEW MONTGOMERY ST \ MISSION ST,24610000.0,Central,Financial District/South Beach,6.0,6.0,37.78745,-122.400503,POINT (-122.4005030970072 37.78745049797145),32.0,,,,,,10.0,1.0,POINT (-122.40050 37.78745)
5548,2023/10/03 06:29:00 AM,2023/10/03,06:29,2023,Tuesday,2023/10/03 06:36:00 AM,132482128150,1324821,230709517,232760408.0,II,Initial,,28150,Malicious Mischief,Vandalism,"Malicious Mischief, Vandalism to Property",Open or Active,MISSION ST \ SHAW ALY,24602000.0,Southern,Financial District/South Beach,6.0,6.0,37.788981,-122.398564,POINT (-122.39856429383492 37.78898117261747),108.0,,,,,,10.0,1.0,POINT (-122.39856 37.78898)
5556,2023/10/03 10:00:00 AM,2023/10/03,10:00,2023,Tuesday,2023/10/03 07:20:00 PM,132488906373,1324889,230711512,232762886.0,II,Initial,,6373,Larceny Theft,Larceny Theft - Other,"Theft, Other Property, $200-$950",Open or Active,NEW MONTGOMERY ST \ AMBROSE BIERCE ST,24611000.0,Central,Financial District/South Beach,6.0,6.0,37.787699,-122.400809,POINT (-122.40080859571334 37.78769894626064),,,,,,,10.0,6.0,POINT (-122.40081 37.78770)


In [None]:
# Number of 2018-2023 incidents near campuses per incident category, most frequent first.
recent_category_counts = crimes_near_universities2['Incident Category'].value_counts()
recent_category_counts

Larceny Theft                               266
Burglary                                     88
Malicious Mischief                           84
Other Miscellaneous                          66
Missing Person                               63
Fraud                                        47
Non-Criminal                                 44
Assault                                      40
Robbery                                      29
Warrant                                      24
Motor Vehicle Theft                          24
Suspicious Occ                               19
Lost Property                                17
Recovered Vehicle                            14
Offences Against The Family And Children     13
Drug Offense                                 11
Case Closure                                 10
Disorderly Conduct                           10
Traffic Violation Arrest                      9
Stolen Property                               7
Other                                   

# Reclassification of crime categories
- The 2003-2018 file and the 2018-2023 file categorise crimes differently.
- reclassification to
  1. ensure consistency in categories between the two files
  2. extract crimes which have a higher possibility of occurring in NUS

In [None]:
# Which incident descriptions are filed under 'Other Miscellaneous' in the 2018-2023 data?
is_other_misc = crimes_near_universities2['Incident Category'] == 'Other Miscellaneous'
crimes_near_universities2.loc[is_other_misc, 'Incident Description'].value_counts()

Investigative Detention                                      16
Trespassing                                                  14
Probation Violation                                           5
Resisting, Delaying, or Obstructing  Peace Officer Duties     5
Burglary Tools, Possession Of                                 5
Driving, No License Issued                                    5
Parole Violation, Adult                                       4
Conspiracy                                                    3
Driving While Under The Influence Of Drugs                    2
Bomb Threat or False Report of Bomb                           1
Driving While Under The Influence Of Alcohol, w/Injury        1
State Code Felony (general)                                   1
Lodging Without Permission                                    1
Identification To Peace Officer, False                        1
Kidnapping during Robbery                                     1
Driving, License Suspended or Revoked   

## Merging traffic incidents to one category called Traffic Violations
- Traffic-related incidents are spread across several sparsely populated categories, so they are merged into one.

In [None]:
# Traffic Violation
# 2003-2018 file: traffic incidents sit inside 'OTHER OFFENSES'; select them by
# keywords in the description. na=False treats a missing description as
# "no match" so the mask stays strictly boolean.
keywords = ['license', 'traffic', 'evading', 'vehicle']
is_traffic_descript = crimes_near_universities['Descript'].str.contains(
    '|'.join(keywords), case=False, regex=True, na=False
)
mask = (crimes_near_universities['Category'] == 'OTHER OFFENSES') & is_traffic_descript
crimes_near_universities.loc[mask, 'Category'] = 'Traffic Violation'

# 2018-2023 file: fold the separate traffic categories into the same label.
# Without na=False a missing 'Incident Category' puts NaN in the mask, and
# .loc then raises on a non-boolean indexer.
keywords = ['Traffic Violation Arrest', 'Traffic Collision', 'Vehicle Impounded']
mask = crimes_near_universities2['Incident Category'].str.contains(
    '|'.join(keywords), case=False, regex=True, na=False
)
crimes_near_universities2.loc[mask, 'Incident Category'] = 'Traffic Violation'

## Reclassifying crimes which were categorised under too general categories ('OTHER OFFENSES' and 'NON-CRIMINAL') in 2003-2018 file

In [None]:
# Pull specific descriptions out of the catch-all 2003-2018 categories
# ('OTHER OFFENSES' and 'NON-CRIMINAL') and give them their own category.
# Each rule is (current Category, exact Descript, new Category).
reclassification_rules = [
    ('OTHER OFFENSES', 'MISCELLANEOUS INVESTIGATION', 'Miscellaneous Investigation'),
    ('NON-CRIMINAL', 'LOST PROPERTY', 'Lost Property'),
    ('NON-CRIMINAL', 'CASE CLOSURE', 'Case Closure'),
]

for current_category, exact_descript, new_category in reclassification_rules:
    same_category = crimes_near_universities['Category'] == current_category
    same_descript = crimes_near_universities['Descript'] == exact_descript
    mask = same_category & same_descript
    crimes_near_universities.loc[mask, 'Category'] = new_category

##  Renaming category names in 2003-2018 file to match 2018-2023 file's naming conventions

In [None]:
# Rename the 2003-2018 category labels to the naming used by the 2018-2023 file.
# Labels missing from this mapping are left unchanged by replace().
#
# NOTE(review): the 2018-2023 counts shown in this notebook list
# 'Malicious Mischief' rather than 'Vandalism'; confirm whether 'VANDALISM'
# should map to 'Malicious Mischief' so the two files line up.
# NOTE(review): some target names (e.g. 'Weapon Laws', 'Sex Offenses') do not
# appear in the 2018-2023 counts visible here — verify their exact spelling
# against that file.
category_mapping = {
    'LARCENY/THEFT': 'Larceny Theft',
    'OTHER OFFENSES': 'Other',
    'NON-CRIMINAL': 'Non-Criminal',
    'ASSAULT': 'Assault',
    'BURGLARY': 'Burglary',
    'VANDALISM': 'Vandalism',
    'VEHICLE THEFT': 'Motor Vehicle Theft',
    'SUSPICIOUS OCC': 'Suspicious Occ',
    'WARRANTS': 'Warrant',
    'ROBBERY': 'Robbery',
    'FRAUD': 'Fraud',
    'MISSING PERSON': 'Missing Person',
    'TRESPASS': 'Other Miscellaneous',
    'DRUG/NARCOTIC': 'Drug Offense',
    'STOLEN PROPERTY': 'Stolen Property',
    'RECOVERED VEHICLE': 'Recovered Vehicle',
    'WEAPON LAWS': 'Weapon Laws',
    'FORGERY/COUNTERFEITING': 'Forgery And Counterfeiting',
    'DRUNKENNESS': 'Disorderly Conduct',
    'EMBEZZLEMENT': 'Embezzlement',
    'DRIVING UNDER THE INFLUENCE': 'Other Miscellaneous',
    'SECONDARY CODES': 'Offences Against The Family And Children',
    'SEX OFFENSES, FORCIBLE': 'Sex Offenses',
    'DISORDERLY CONDUCT': 'Disorderly Conduct',
    'ARSON': 'Arson',
    # Was 'FRAUD': replace() does not chain mappings, so those rows kept a
    # separate upper-case label instead of joining 'Fraud'.
    'BAD CHECKS': 'Fraud',
    'SUICIDE': 'Suicide',
    'KIDNAPPING': 'Other Miscellaneous',
    # Was 'Other MIscellaneous' (capital I), which created a separate category.
    'LIQUOR LAWS': 'Other Miscellaneous',
}

# Assign the result back to the column. An in-place replace on a column
# selection is not guaranteed to update the parent frame on newer pandas.
crimes_near_universities['Category'] = crimes_near_universities['Category'].replace(category_mapping)

In [None]:
# Stub remapping for the 2018-2023 file's categories.
# NOTE(review): no visible cell applies this mapping, and the only entry maps
# to an empty string — complete it or remove it.
category_mapping2 = {'Traffic Violation Arrest': ''}


In [None]:
# Category distribution of the 2003-2018 incidents after reclassification and renaming.
historical_counts_after_remap = crimes_near_universities['Category'].value_counts()
historical_counts_after_remap

Larceny Theft                               822
Assault                                     208
Burglary                                    162
Other                                       161
Non-Criminal                                152
Vandalism                                   139
Motor Vehicle Theft                         130
Traffic Violation                           109
Lost Property                               109
Suspicious Occ                               92
Warrant                                      57
Robbery                                      53
Fraud                                        48
Other Miscellaneous                          38
Missing Person                               33
Drug Offense                                 29
Miscellaneous Investigation                  24
Stolen Property                              17
Case Closure                                 15
Disorderly Conduct                           14
Recovered Vehicle                       

In [None]:
# Category distribution of the 2018-2023 incidents after the traffic merge.
recent_counts_after_remap = crimes_near_universities2['Incident Category'].value_counts()
recent_counts_after_remap

Larceny Theft                               266
Burglary                                     88
Malicious Mischief                           84
Other Miscellaneous                          66
Missing Person                               63
Fraud                                        47
Non-Criminal                                 44
Assault                                      40
Robbery                                      29
Warrant                                      24
Motor Vehicle Theft                          24
Suspicious Occ                               19
Lost Property                                17
Recovered Vehicle                            14
Offences Against The Family And Children     13
Traffic Violation                            12
Drug Offense                                 11
Case Closure                                 10
Disorderly Conduct                           10
Stolen Property                               7
Other                                   

# Removing irrelevant features

In [None]:
# Parse 'Incident Datetime' (e.g. "2023/03/11 02:00:00 PM") into a real
# timestamp column. The explicit 12-hour format avoids slow per-row inference
# and fails loudly on an unexpected layout. assign() returns a new frame
# instead of writing into a filtered slice.
crimes_near_universities2 = crimes_near_universities2.assign(
    datetime=pd.to_datetime(
        crimes_near_universities2['Incident Datetime'],
        format='%Y/%m/%d %I:%M:%S %p',
    )
)
crimes_near_universities2.head(n=2)

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,CAD Number,Report Type Code,Report Type Description,Filed Online,Incident Code,Incident Category,Incident Subcategory,Incident Description,Resolution,Intersection,CNN,Police District,Analysis Neighborhood,Supervisor District,Supervisor District 2012,Latitude,Longitude,Point,Neighborhoods,ESNCAG - Boundary File,Central Market/Tenderloin Boundary Polygon - Updated,Civic Center Harm Reduction Project Boundary,HSOC Zones as of 2018-06-05,Invest In Neighborhoods (IIN) Areas,Current Supervisor Districts,Current Police Districts,geometry,datetime
2114,2022/10/28 02:50:00 AM,2022/10/28,02:50,2022,Friday,2022/10/29 02:38:00 PM,125701171000,1257011,226239532,,II,Coplogic Initial,True,71000,Lost Property,Lost Property,Lost Property,Open or Active,MISSION ST \ SHAW ALY,24602000.0,Southern,Financial District/South Beach,6.0,6.0,37.788981,-122.398564,POINT (-122.39856429383492 37.78898117261747),108.0,,,,,,10.0,1.0,POINT (-122.39856 37.78898),2022-10-28 02:50:00
2657,2023/03/24 03:30:00 PM,2023/03/24,15:30,2023,Friday,2023/03/24 05:09:00 PM,125764226105,1257642,230207129,230832481.0,II,Initial,,26105,Other Miscellaneous,Other,Bomb Threat or False Report of Bomb,Open or Active,MISSION ST \ SHAW ALY,24602000.0,Southern,Financial District/South Beach,6.0,6.0,37.788981,-122.398564,POINT (-122.39856429383492 37.78898117261747),108.0,,,,,,10.0,1.0,POINT (-122.39856 37.78898),2023-03-24 15:30:00


In [None]:
# Align the 2003-2018 column names with those of the 2018-2023 file so the two
# frames can be stacked.
column_renames = {
    'Category': 'Incident Category',
    'X': 'Longitude',
    'Y': 'Latitude',
}
crimes_near_universities = crimes_near_universities.rename(columns=column_renames)
crimes_near_universities.head(2)

Unnamed: 0,PdId,IncidntNum,Incident Code,Incident Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,Longitude,Latitude,location,SF Find Neighborhoods 2 2,Current Police Districts 2 2,Current Supervisor Districts 2 2,Analysis Neighborhoods 2 2,DELETE - Fire Prevention Districts 2 2,DELETE - Police Districts 2 2,DELETE - Supervisor Districts 2 2,DELETE - Zip Codes 2 2,DELETE - Neighborhoods 2 2,DELETE - 2017 Fix It Zones 2 2,Civic Center Harm Reduction Project Boundary 2 2,Fix It Zones as of 2017-11-06 2 2,DELETE - HSOC Zones 2 2,Fix It Zones as of 2018-02-07 2 2,"CBD, BID and GBD Boundaries as of 2017 2 2","Areas of Vulnerability, 2016 2 2",Central Market/Tenderloin Boundary 2 2,Central Market/Tenderloin Boundary Polygon - Updated 2 2,HSOC Zones as of 2018-06-05 2 2,OWED Public Spaces 2 2,Neighborhoods 2,geometry,datetime
1310,4027035407021,40270354,7021,Motor Vehicle Theft,STOLEN AUTOMOBILE,Saturday,03/06/2004,22:00,SOUTHERN,NONE,FOLSOM ST / RODGERS ST,-122.409482,37.775451,POINT (-122.40948237086701 37.7754506237928),32.0,1.0,10.0,34.0,8.0,2.0,9.0,28853.0,34.0,24.0,1.0,6.0,1.0,6.0,,2.0,,,1.0,,32.0,POINT (-122.40948 37.77545),2004-03-06 22:00:00
2751,7079764128160,70797641,28160,Vandalism,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Friday,08/03/2007,15:00,SOUTHERN,NONE,0 Block of RODGERS ST,-122.409046,37.775176,POINT (-122.409046343419 37.7751758248112),32.0,1.0,10.0,34.0,8.0,2.0,9.0,28853.0,34.0,24.0,,6.0,,6.0,,2.0,,,,,32.0,POINT (-122.40905 37.77518),2007-08-03 15:00:00


# Merge both files into one 2003-2023 file

In [None]:
# Stack both periods into one 2003-2023 frame, keeping only the shared columns.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and preserves the same row order (2018-2023 rows first).
# Original index labels are kept, so labels may repeat across the two sources.
shared_columns = ['datetime', 'Incident Category', 'Longitude', 'Latitude']
df_final = pd.concat(
    [crimes_near_universities2[shared_columns], crimes_near_universities[shared_columns]]
)

In [None]:
# df_final.to_pickle('/content/drive/My Drive/ColabNotebooks/crimes_near_universities_2003_2023.pkl')