### Phoenix Crime Data API
The Phoenix Crime dataset contains crime records from 2015 up to 7 days prior to the current date. The dataset is updated regularly. The data does not contain specific locations but rather masked street addresses (e.g., 123XX Main St) and a zip code. This notebook attempts to geocode all crime records within the past year so that the records can be associated with coordinates, allowing for richer spatiotemporal analysis. The derived record locations are approximations but should be good enough for that purpose.

Link: https://www.phoenixopendata.com/dataset/crime-data/resource/0ce3411a-2fc6-4302-a33f-167f68608a20

In [None]:
import re
import time
from typing import Callable, Optional, Tuple

import geopandas as gpd
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from shapely.geometry import Point


In [None]:
# Create a Nominatim (OpenStreetMap) geocoding client.
# NOTE(review): this client is not referenced by the other cells shown in
# this notebook; geocode_address builds its own client by default.
geolocator = Nominatim(user_agent="phoenix_crime_api")

# Load the crime CSV (downloaded beforehand from the Phoenix open data
# portal) from the notebook's working directory
f = "./crime-data_crime-data_crimestat.csv"
df = pd.read_csv(f)

# Preview the first rows to check the columns and value formats
df.head()

  df = pd.read_csv(f)


In [None]:
def clean_dates(df: pd.DataFrame):
    """
    Parse "OCCURRED ON" as a datetime and split it into component columns.

    The frame is modified in place: "OCCURRED ON" is converted with
    pd.to_datetime, then YEAR, MONTH, DAY, HOUR, MINUTE and SECOND columns
    are added (in that order). The same DataFrame object is returned.
    """
    df["OCCURRED ON"] = pd.to_datetime(df["OCCURRED ON"])
    occurred = df["OCCURRED ON"].dt

    # (output column, datetime accessor attribute), in insertion order
    components = (
        ("YEAR", "year"),
        ("MONTH", "month"),
        ("DAY", "day"),
        ("HOUR", "hour"),
        ("MINUTE", "minute"),
        ("SECOND", "second"),
    )
    for column, attribute in components:
        df[column] = getattr(occurred, attribute)

    return df

In [None]:
# Parse "OCCURRED ON" and add the YEAR/MONTH/DAY/HOUR/MINUTE/SECOND columns
df = clean_dates(df)

Unnamed: 0,INC NUMBER,OCCURRED ON,OCCURRED TO,UCR CRIME CATEGORY,100 BLOCK ADDR,ZIP,PREMISE TYPE,GRID,YEAR,MONTH,DAY,HOUR,MINUTE,SECOND
0,201600000594484,11/01/2015 00:00,,RAPE,13XX E ALMERIA RD,85006.0,SINGLE FAMILY HOUSE,BD30,2015.0,11.0,1.0,0.0,0.0,0.0
1,201500002102327,11/01/2015 00:00,11/01/2015 09:00,LARCENY-THEFT,51XX N 15TH ST,85014.0,APARTMENT,BJ30,2015.0,11.0,1.0,0.0,0.0,0.0
2,201500002168686,11/01/2015 00:00,11/11/2015 09:30,LARCENY-THEFT,14XX E HIGHLAND AVE,85014.0,PARKING LOT,BI30,2015.0,11.0,1.0,0.0,0.0,0.0
3,201500002102668,11/01/2015 00:00,11/01/2015 11:50,MOTOR VEHICLE THEFT,69XX W WOOD ST,85043.0,SINGLE FAMILY HOUSE,AF12,2015.0,11.0,1.0,0.0,0.0,0.0
4,201600000052855,11/01/2015 00:00,01/09/2016 00:00,MOTOR VEHICLE THEFT,N 43RD AVE & W CACTUS RD,85029.0,SINGLE FAMILY HOUSE,DA19,2015.0,11.0,1.0,0.0,0.0,0.0


In [15]:
# Count records per (ZIP, YEAR) pair, pivot the years into columns, and
# show 0 for zip/year combinations that have no records
df.groupby(["ZIP", "YEAR"]).size().unstack().fillna(0)

YEAR,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0,2023.0,2024.0,2025.0
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
85003.0,134.0,660.0,815.0,868.0,913.0,855.0,848.0,940.0,931.0,1037.0,559.0
85004.0,101.0,823.0,828.0,812.0,900.0,724.0,867.0,1139.0,1157.0,1212.0,707.0
85006.0,174.0,1251.0,1383.0,1406.0,1452.0,1147.0,1115.0,1296.0,1103.0,1267.0,784.0
85007.0,168.0,964.0,983.0,1182.0,1228.0,1324.0,1261.0,1348.0,1052.0,1138.0,682.0
85008.0,417.0,3032.0,3306.0,2942.0,3032.0,3230.0,3080.0,3231.0,2719.0,2352.0,1530.0
...,...,...,...,...,...,...,...,...,...,...,...
85388.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
85390.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
85392.0,0.0,5.0,4.0,1.0,3.0,3.0,3.0,6.0,3.0,3.0,2.0
85395.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,3.0,0.0,1.0,1.0


In [None]:
def geocode_address(
    address: str, zip: str, geocoder: Optional["Nominatim"] = None
) -> Optional[Tuple[float, float]]:
    """
    Geocode a masked street address, trying progressively looser queries.

    Queries are attempted in this order and the first hit wins:
      1. the masked address and zip exactly as provided
      2. the address with its "XX" mask replaced by a fill value, plus zip
      3. the same unmasked address with "Phoenix, AZ" added before the zip
    Strategies 2 and 3 are tried together for each fill value
    (00, 25, 50, 75) before moving on to the next one. Identical queries
    (e.g. for an address that has no "XX" mask) are only sent once.

    Args:
        address: Masked street address such as "13XX E ALMERIA RD".
        zip: Zip code as a string or number (e.g. "85006" or 85006.0).
            The name shadows the builtin but is kept for compatibility
            with existing callers.
        geocoder: Optional geocoding client exposing ``geocode(query)``.
            When omitted, a new Nominatim client is created for this call.

    Returns:
        A (longitude, latitude) tuple, or None when the address or zip is
        missing/unparseable, no query matches, or the geocoder raises.
    """
    # A missing address (e.g. NaN from the CSV) would otherwise be
    # interpolated as the literal text "nan" and sent to the geocoder.
    if not isinstance(address, str) or not address.strip():
        return None

    # Zips are read from the CSV as floats (85006.0); normalise to an int
    # so the query contains "85006" rather than "85006.0".
    try:
        zip_code = int(float(zip))
    except (TypeError, ValueError, OverflowError):
        return None

    # Strategy 1: use the provided masked address and zip
    queries = [f"{address}, {zip_code}"]
    for value in ["00", "25", "50", "75"]:
        # Strategy 2: unmask address and use zip
        unmasked = re.sub(r"(\d+)XX\b", rf"\g<1>{value}", address)
        queries.append(f"{unmasked}, {zip_code}")
        # Strategy 3: try adding city and state
        queries.append(f"{unmasked}, Phoenix, AZ {zip_code}")

    try:
        if geocoder is None:
            geocoder = Nominatim(user_agent="phoenix_geocoder", timeout=10)

        # dict.fromkeys removes repeated queries while preserving order, so
        # an address without a mask costs 2 requests instead of 9.
        for query in dict.fromkeys(queries):
            location = geocoder.geocode(query)
            if location:
                return (location.longitude, location.latitude)

        return None

    except Exception as e:
        # Best effort: one failed lookup (timeout, service error, ...) must
        # not abort a long batch run, so report it and carry on.
        print("EXCEPTION: ", e)
        return None

In [148]:
def dataframe_to_geodataframe(
    df: pd.DataFrame,
    output: str,
    geocode_func: Optional[Callable[..., Optional[Tuple[float, float]]]] = None,
    delay_seconds: float = 1.0,
) -> gpd.GeoDataFrame:
    """
    Convert a pandas DataFrame to a GeoDataFrame by geocoding addresses.

    Each row's "100 BLOCK ADDR" and "ZIP" values are passed to the geocoding
    function. Rows that cannot be geocoded keep NaN coordinates and a
    missing geometry. The input DataFrame is not modified.

    Args:
        df: Input DataFrame; must contain "100 BLOCK ADDR" and "ZIP" columns
        output: Path of the CSV file the resulting GeoDataFrame is written to
        geocode_func: Geocoding function called as ``func(address, zip)`` and
            returning ``(longitude, latitude)`` or None
            (defaults to geocode_address)
        delay_seconds: Delay between geocoding lookups, applied after every
            row whether or not the lookup succeeded; 0 disables the delay

    Returns:
        GeoDataFrame (EPSG:4326) with longitude, latitude, and geometry columns
    """
    if geocode_func is None:
        geocode_func = geocode_address

    # Create a copy of the DataFrame
    result_df = df.copy()
    total_count = len(result_df)

    # Results are collected positionally and attached afterwards. This avoids
    # writing Point objects into a float column and label-based writes that
    # misbehave when the index contains duplicate labels.
    longitudes = []
    latitudes = []
    geometries = []

    # Geocode each address
    address_zip_pairs = zip(result_df["100 BLOCK ADDR"], result_df["ZIP"])
    for counter, (address, addr_zip) in enumerate(address_zip_pairs, start=1):
        coords = geocode_func(address, addr_zip)

        if coords:
            lon, lat = coords[0], coords[1]
            longitudes.append(lon)
            latitudes.append(lat)
            geometries.append(Point(lon, lat))
        else:
            longitudes.append(np.nan)
            latitudes.append(np.nan)
            geometries.append(None)

        # Rate limiting: pause after every lookup, not only failed ones, so
        # successful requests are throttled too. Skipped after the last row.
        if delay_seconds > 0 and counter < total_count:
            time.sleep(delay_seconds)

        if counter % 1000 == 0:
            print(f"{counter}/{total_count} complete")

    index = result_df.index
    result_df["longitude"] = pd.Series(longitudes, index=index, dtype=float)
    result_df["latitude"] = pd.Series(latitudes, index=index, dtype=float)
    result_df["geometry"] = pd.Series(geometries, index=index, dtype=object)

    # Convert to GeoDataFrame
    gdf = gpd.GeoDataFrame(result_df, geometry="geometry", crs="EPSG:4326")

    gdf.to_csv(output)

    return gdf

In [None]:
# Keep only the records whose OCCURRED ON year is 2025; this is the subset
# that gets geocoded below
df_2025 = df.query("YEAR == 2025")

In [None]:
# View the size (rows, columns) of the dataset to be geocoded; each row is
# one address lookup
df_2025.shape

(30037, 14)

In [None]:

# Geocode every 2025 record and write the result to crime_geodata_2025.csv.
# This is a long-running cell: one geocoding lookup is made per row.
gdf_2025 = dataframe_to_geodataframe(df_2025, output="crime_geodata_2025.csv")

Note: Geocoding the 30,000+ records from 2025 took over 35 hours to complete.