# Crime Hotspot Mapping and Prediction Accuracy Index (PAI)

> Chainey, S., Tompson, L., & Uhlig, S. (2008b). The utility of hotspot mapping for predicting spatial patterns of crime. Security Journal, 21(1–2), 4–28. [10.1057/palgrave.sj.8350066](https://doi.org/10.1057/palgrave.sj.8350066)


## Part 1: Density and PAI estimation

In [None]:
# Core data handling
import pandas as pd        # tabular data manipulation
import numpy as np         # numerical operations and arrays

# Spatial data
import geopandas as gpd    # vector spatial data (points, lines, polygons)
from shapely.geometry import Polygon  # geometric primitives
from shapely import wkt  # shapely >= 2.0 (for older versions: from shapely import wkt is also fine)
import h3


# Visualisation
import plotly.express as px      # interactive charts and maps
from keplergl import KeplerGl    # interactive WebGL maps for exploratory crime mapping

# Statistics
from sklearn.cluster import DBSCAN
import pysal.lib as ps
from pysal.explore import esda
from splot.esda import moran_scatterplot # visualize moran

In [None]:
GLOBAL_CRS = "EPSG:4326"  # geographic lon/lat CRS (WGS84) in which the input data is loaded
METRIC_CRS = "EPSG:3857"  # projected CRS (Web Mercator) used below for the area computations

# create a data folder, and change the file names below so they match your data files
CRIMES_DATA_PATH = "./data/Crimes_2025.csv"  # e.g. "./data/chicago_crimes_sample.csv"
DISTRICT_DATA_PATH = "./data/districts.csv"
BEATS_DATA_PATH = "./data/beats.csv"

### A. Load crime data and first exploration

In [None]:
# Load the crime records. The 'X Coordinate', 'Y Coordinate' and 'Location'
# columns are not used in this notebook, so they are dropped right away.
df_crimes = pd.read_csv(CRIMES_DATA_PATH).drop(columns=['X Coordinate', 'Y Coordinate', 'Location'])
# Records without coordinates cannot be placed on a map: discard them
df_crimes = df_crimes.dropna(subset=['Latitude', 'Longitude'])

# Build the point layer (x = longitude, y = latitude) in the shared geographic CRS
gdf_crimes = gpd.GeoDataFrame(
    df_crimes,
    geometry=gpd.points_from_xy(df_crimes["Longitude"], df_crimes["Latitude"]),
    crs=GLOBAL_CRS,
)

In [None]:
# Load the districts; their geometries are stored as WKT text in the "the_geom" column
df_districts = pd.read_csv(DISTRICT_DATA_PATH)
# GeoSeries.from_wkt is the documented vectorised WKT parser: it takes the whole
# column at once and keeps the row index aligned with df_districts.
gdf_districts = gpd.GeoDataFrame(
    df_districts,
    geometry=gpd.GeoSeries.from_wkt(df_districts["the_geom"]),
    crs=GLOBAL_CRS,
)

In [None]:
# Load the beats; their geometries are stored as WKT text in the "the_geom" column
df_beats = pd.read_csv(BEATS_DATA_PATH)
# GeoSeries.from_wkt is the documented vectorised WKT parser: it takes the whole
# column at once and keeps the row index aligned with df_beats.
gdf_beats = gpd.GeoDataFrame(
    df_beats,
    geometry=gpd.GeoSeries.from_wkt(df_beats["the_geom"]),
    crs=GLOBAL_CRS,
)

### B. Train–test split: preparing for prediction evaluation

In [None]:
# Enforce a datetime dtype so the records can be split chronologically
df_crimes["Date"] = pd.to_datetime(df_crimes["Date"])

# Hold-out window: the 7 days that end at the most recent record
last_week = df_crimes["Date"].max() - pd.Timedelta(days=7)
mask = df_crimes["Date"] > last_week

# Test set = last week, training set = everything before it
df_last_week = df_crimes.loc[mask].copy()
df_crimes = df_crimes.loc[~mask].copy()

# gdf_crimes was built from the full table before this split and feeds every
# hotspot method further down. Restrict it to the training period as well,
# otherwise the hotspots are fitted on the very week they are evaluated on.
# (mask can be applied directly: gdf_crimes was created from df_crimes and
# therefore shares its index.)
gdf_crimes = gdf_crimes.loc[~mask].copy()

### D. Thematic mapping of geographic boundary areas


In [None]:
gdf_beats_with_crimes

In [None]:
# Crime counts per beat
crime_beat_counts = (
    gdf_crimes
    .groupby("Beat")
    .size()
    .reset_index(name="crimes_count")
)

# Merge crime counts with beats
gdf_beats_with_crimes = gdf_beats.merge(crime_beat_counts, how="left", left_on="BEAT_NUM", right_on="Beat").drop(columns=["Beat"])

m = KeplerGl(height=600)
m.add_data(data=gdf_beats_with_crimes.drop(columns=["the_geom"]), name="crimes")
m

### E. Grid / hexagonal hotspot mapping with H3


In [None]:
H3_RESOLUTION = 8

# extract h3 cell id from point
def point_to_h3(point):
    """
    Take a shapely Point (lon/lat) and return the H3 index.
    H3 expects (lat, lon) = (y, x).
    """
    lat = point.y
    lon = point.x

    return h3.latlng_to_cell(lat, lon, H3_RESOLUTION)


# build the hexagon polygon of an h3 cell
def cell_to_geometry(cell):
    """
    Take an H3 index and return the outline of its cell as a shapely Polygon (lon/lat).
    H3 returns the boundary vertices as (lat, lng) pairs, while shapely expects
    (x, y) = (lng, lat), so every vertex is swapped.
    """
    boundary = h3.cell_to_boundary(cell)

    return Polygon([(lng, lat) for lat, lng in boundary])

In [None]:
# Copy of the crimes with one extra column: the H3 cell that contains each point
# (assign returns a new frame, so gdf_crimes itself is left untouched)
h3_crimes = gdf_crimes.assign(h3_cell=gdf_crimes["geometry"].apply(point_to_h3))

# Number of crimes per H3 cell
h3_crimes_count = (
    h3_crimes
    .groupby("h3_cell")
    .size()
    .rename("point_count")
    .reset_index()
)

# Hexagon outline of every cell, needed to draw the counts on a map
hexagons = h3_crimes_count['h3_cell'].apply(cell_to_geometry)
h3_crimes_count['geometry'] = hexagons

# Interactive map of the per-cell counts (the cell id column is not sent to Kepler)
m = KeplerGl(height=600)
m.add_data(data=h3_crimes_count.drop(columns=["h3_cell"]), name="h3_crimes_count")
m

### F. DBSCAN clustering


In [None]:
# Reproducible 20% random sample of the crimes (fixed seed), without the raw
# coordinate columns. Taking a bigger sample implies having different clusters
# with DBSCAN.
dbscan_crimes = (
    gdf_crimes
    .sample(frac=0.2, random_state=42)
    .drop(columns=["Longitude", "Latitude"])
)

In [None]:
# DBSCAN's eps is a distance, so cluster in the projected CRS (Web Mercator)
# rather than in lon/lat degrees.
# NOTE(review): Web Mercator stretches distances away from the equator, so eps
# is expressed in projected units, not exact ground metres - confirm this is
# acceptable or switch to a local projected CRS.
dbscan_crimes.to_crs(METRIC_CRS, inplace=True)
# extract the projected coordinates as an (n_points, 2) array
coords = np.array(list(zip(dbscan_crimes.geometry.x, dbscan_crimes.geometry.y)))

db = DBSCAN(
    eps=500,             # neighbourhood radius in METRIC_CRS units, adjust
    min_samples=100,     # a core point needs at least 100 crimes within eps
    n_jobs=-1,           # use every available CPU core
    metric="euclidean",  # default, but explicit
)
# fit DBSCAN; the label -1 marks noise points that belong to no cluster
labels = db.fit_predict(coords)

dbscan_crimes["cluster_id"] = labels
# reproject back to WGS84 for mapping
dbscan_crimes.to_crs(GLOBAL_CRS, inplace=True)

m = KeplerGl(height=600)
m.add_data(dbscan_crimes, "dbscan_crimes")
m

### G. Prediction Accuracy Index (PAI)


In [None]:
# Study area: sum of the district areas, measured in the projected CRS
districts_metric = gdf_districts.to_crs(METRIC_CRS)
study_area = districts_metric["geometry"].area.sum()

# Number of crimes in df_crimes
total_crimes = len(df_crimes)

print(f"Total surface of the city: {study_area/1e6} km²")
print(f"Total number of crimes: {total_crimes}")

# Last-week crimes as points, reprojected to the same projected CRS as the areas
last_week_points = gpd.points_from_xy(df_last_week.Longitude, df_last_week.Latitude)
gdf_last_week = gpd.GeoDataFrame(df_last_week, geometry=last_week_points, crs=GLOBAL_CRS).to_crs(METRIC_CRS)

In [None]:
def get_PAI(hits, hotspot_area, total_crimes, study_area):
    """
    Compute the Prediction Accuracy Index (PAI) of a hotspot map.

    PAI = (hits / total_crimes) / (hotspot_area / study_area), i.e. the share
    of crimes captured by the hotspots divided by the share of the study area
    that the hotspots cover.

    Parameters:
        hits: number of crimes that fall inside the hotspots.
        hotspot_area: total area of the hotspots.
        total_crimes: number of crimes in the whole study area, counted over
            the same period as hits.
        study_area: total area of the study area, in the same unit as
            hotspot_area.

    Returns:
        The PAI (hit rate divided by area percentage).

    Raises:
        ValueError: if total_crimes, hotspot_area or study_area is not
            strictly positive, because the index is undefined in that case.
    """
    # Fail with an explicit message instead of a bare ZeroDivisionError
    # (plain numbers) or a silent inf/nan (numpy scalars).
    if total_crimes <= 0:
        raise ValueError("total_crimes must be > 0 to compute the PAI")
    if study_area <= 0:
        raise ValueError("study_area must be > 0 to compute the PAI")
    if hotspot_area <= 0:
        raise ValueError("hotspot_area must be > 0 to compute the PAI")

    hit_rate = hits / total_crimes
    area_percentage = hotspot_area / study_area

    return hit_rate / area_percentage

In [None]:
# Beats PAI
# Scale the counts by the maximum; beats without crimes keep NaN and are never selected below
gdf_beats_with_crimes["norm_crime"] = gdf_beats_with_crimes["crimes_count"] / gdf_beats_with_crimes["crimes_count"].max()

# Create the hotspot: beats at or above the 85th percentile of crime counts (top 15%)
mask = gdf_beats_with_crimes["norm_crime"] >= gdf_beats_with_crimes["norm_crime"].quantile(0.85)
top_beats_crimes = gdf_beats_with_crimes[mask].copy()
print("Number of hot cells:", len(top_beats_crimes))

# Areas are measured in the projected CRS
top_beats_crimes = gpd.GeoDataFrame(top_beats_crimes, geometry="geometry", crs=GLOBAL_CRS)
top_beats_crimes["beat_area"] = top_beats_crimes.to_crs(METRIC_CRS)["geometry"].area

# Total hotspot area
hotspot_area = top_beats_crimes["beat_area"].sum()
print("Hotspot area: {:.2f} km²".format(hotspot_area / 1e6))

# Total of hits (last-week crimes that fall inside the hotspots)
hits = gdf_last_week.within(top_beats_crimes.to_crs(METRIC_CRS)["geometry"].union_all()).sum()
print("pourcent of hits:", hits/len(gdf_last_week))

# The hit rate must be relative to the crimes of the evaluation week (the same
# population the hits are drawn from), not to the training-set size.
beat_PAI = get_PAI(hits, hotspot_area, len(gdf_last_week), study_area)
print("BEAT_PAI =", beat_PAI)

In [None]:
# h3 PAI
# Scale the per-cell counts by the maximum
h3_crimes_count["norm_crime"] = h3_crimes_count["point_count"] / h3_crimes_count["point_count"].max()

# Create the hotspot: h3 cells at or above the 85th percentile of crime counts (top 15%)
mask = h3_crimes_count["norm_crime"] >= h3_crimes_count["norm_crime"].quantile(0.85)
top_hex_gdf = h3_crimes_count[mask].copy()
print("Number of hot cells:", len(top_hex_gdf))

# Areas are measured in the projected CRS
top_hex_gdf = gpd.GeoDataFrame(top_hex_gdf, geometry="geometry", crs=GLOBAL_CRS)
top_hex_gdf["cell_area"] = top_hex_gdf.to_crs(METRIC_CRS)["geometry"].area

# Total hotspot area
hotspot_area = top_hex_gdf["cell_area"].sum()
print("Hotspot area: {:.2f} km²".format(hotspot_area / 1e6))

# Total of hits (last-week crimes that fall inside the hotspots)
hits = gdf_last_week.within(top_hex_gdf.to_crs(METRIC_CRS)["geometry"].union_all()).sum()
print("pourcent of hits:", hits/len(gdf_last_week))

# The hit rate must be relative to the crimes of the evaluation week (the same
# population the hits are drawn from), not to the training-set size.
h3_PAI = get_PAI(hits, hotspot_area, len(gdf_last_week), study_area)
print("H3_PAI =", h3_PAI)

In [None]:
# DBSCAN cluster geometry: one convex hull per cluster.
# The label -1 is DBSCAN noise and is left out.
clusters = [
    {
        "cluster_id": cid,
        "n_points": len(group),
        "geometry": group.union_all().convex_hull,  # hull around all points of the cluster
    }
    for cid, group in dbscan_crimes.groupby("cluster_id")
    if cid != -1
]

clusters_gdf = gpd.GeoDataFrame(clusters, geometry="geometry", crs=dbscan_crimes.crs)

In [None]:
print("Number of hot points:", clusters_gdf["n_points"].sum())

# Area of each cluster hull, measured in the projected CRS
clusters_gdf["cluster_area"] = clusters_gdf.to_crs(METRIC_CRS)["geometry"].area

# Total hotspot area. Convex hulls of neighbouring clusters can overlap, so the
# area is taken from their union (the same shape the hits are counted in)
# rather than from the sum of the individual hull areas.
hotspot_union = clusters_gdf.to_crs(METRIC_CRS)["geometry"].union_all()
hotspot_area = hotspot_union.area
print("Hotspot area: {:.2f} km²".format(hotspot_area / 1e6))

# Total of hits (last-week crimes that fall inside the hotspots)
hits = gdf_last_week.within(hotspot_union).sum()
print("pourcent of hits:", hits/len(gdf_last_week))

# Store and print under a DBSCAN-specific name so the H3 result is not overwritten
dbscan_PAI = get_PAI(hits, hotspot_area, len(gdf_last_week), study_area)
print("DBSCAN_PAI =", dbscan_PAI)

## PART 2: Basic exploratory data analysis (EDA)

### A. When is crime most frequent? (Temporal pattern by month)


In [None]:
# Month number of every crime, taken from its Date
df_crimes['month'] = pd.to_datetime(df_crimes['Date']).dt.month
# Bar chart of the number of crimes per month
px.bar(df_crimes['month'].value_counts())

### B. What are the main crime types? (Categorical distribution)

In [None]:
# Bar chart of the 20 most frequent primary crime types
px.bar(df_crimes['Primary Type'].value_counts().head(20))

### C. Grouping detailed crime types into broader categories


In [None]:
# Lookup table that collapses the detailed "Primary Type" values into a few broad categories
main_categories = {
    # "old category": "new category"
    # Violence Against Persons
    "ASSAULT": "violence_against_person",
    "BATTERY": "violence_against_person",
    # Residential Burglary
    "BURGLARY": "burglary",
    # Thefts
    "THEFT": "theft",
    "MOTOR VEHICLE THEFT": "theft",
    "DECEPTIVE PRACTICE": "theft",
    "ROBBERY": "theft",
    # Drugs
    "NARCOTICS": "drug_offense",
    # Property Environmental/Damage
    "CRIMINAL DAMAGE": "prop_env_damage",
    "CRIMINAL TRESPASS": "prop_env_damage",
    # Other
    "OTHER OFFENSE": "Other",
    "WEAPONS VIOLATION": "Other"
}

# Primary types that are not listed above end up with an empty-string category.
# NOTE(review): presumably these should fall into "Other" instead - confirm.
df_crimes["main_category"] = df_crimes["Primary Type"].map(main_categories).fillna("")

In [None]:
# Bar chart of the number of crimes per broad category
px.bar(df_crimes['main_category'].value_counts())

### D. How do crime categories shift in space over time? (Mean centres)


In [None]:
# Mean centre: average latitude and longitude of the crimes for every (month, main_category) pair
df_mean_center = df_crimes.groupby(['month', 'main_category'])[['Latitude', 'Longitude']].mean().reset_index()


In [None]:
# One Kepler dataset per category, each holding that category's monthly mean centres
m = KeplerGl(height=600)
m.add_data(data=df_mean_center[df_mean_center['main_category'] == 'theft'], name="Theft") # filter Theft type
m.add_data(data=df_mean_center[df_mean_center['main_category'] == 'violence_against_person'], name="Violence Against Person")
m.add_data(data=df_mean_center[df_mean_center['main_category'] == 'burglary'], name="Burglary")
m

### E. Is the spatial distribution of crime random? (Global Moran’s I)


In [None]:
# Spatial weights: every H3 cell is linked to its 6 nearest neighbours
# (6 is the number of neighbors, it makes sense for an hexagon)
w = ps.weights.KNN.from_dataframe(h3_crimes_count, k=6)
# Row-standardize the weights
w.transform = 'R'

# Global Moran's I of the per-cell crime counts
crime_counts = h3_crimes_count['point_count'].astype('float64')
mi = esda.Moran(crime_counts, w)

print(
    "------",
    f"Moran's I: {round(mi.I, 3)}",
    f"p-value: {round(mi.p_sim, 4)}",
    sep="\n",
)

moran_scatterplot(mi);

### F. Where exactly are the clusters? (Local Moran’s I)


In [None]:

# Local Moran's I of the per-cell crime counts, reusing the weights w
crime_counts = h3_crimes_count['point_count'].astype('float64')
moran = esda.Moran_Local(crime_counts, w, geoda_quads=True)

# Keep the quadrant code, the simulated z-score and the simulated p-value of every cell
local_results = (
    ('moran_cat', moran.q),
    ('moran_zscore', moran.z_sim),
    ('moran_pvalue', moran.p_sim),
)
for column, values in local_results:
    h3_crimes_count[column] = values

moran_scatterplot(moran, p=0.01);

In [None]:
# Inspect the per-cell table, which now also holds the Local Moran columns
h3_crimes_count

In [None]:
# The choropleth below reads .geometry, so make sure the table is a GeoDataFrame
h3_crimes_count = gpd.GeoDataFrame(h3_crimes_count)

# Label every cell with its Local Moran quadrant, but only when its p-value is below 0.10
quadrant_labels = {
    1: 'High-High_90',
    2: 'Low-Low_90',
    3: 'Low-High_90',
    4: 'High-low_90',
}
significant = h3_crimes_count['moran_pvalue'] < 0.10

h3_crimes_count['cluster_moran'] = 'Not_Significant'
for quadrant, label in quadrant_labels.items():
    in_quadrant = h3_crimes_count['moran_cat'] == quadrant
    h3_crimes_count.loc[significant & in_quadrant, 'cluster_moran'] = label

# Discrete colour of every label
color_map = {
    'Not_Significant': 'lightgray',
    'High-High_90': 'lightcoral',
    'Low-Low_90': 'lightblue',
    'Low-High_90': 'blue',
    'High-low_90': 'red',
}

fig = px.choropleth_map(
    h3_crimes_count,
    geojson=h3_crimes_count.geometry,
    locations=h3_crimes_count.index,
    color='cluster_moran',
    color_discrete_map=color_map,
    hover_data=['moran_zscore', 'moran_pvalue', 'cluster_moran'],
    zoom=10,
    center={"lat": 41.8781, "lon": -87.6298},
    height=1000,
    map_style="light",
    title="Anselin Local Moran'I of crime with arrests",
)
fig

In [None]:
# Same table shown in Kepler (without the cell id column).
# kepler still works, but you have to build the color schema manually, probably easier to plot it with plotly like above
m = KeplerGl(height=600)
m.add_data(data=h3_crimes_count.drop(columns=["h3_cell"]), name="crimes")
m

In [None]:
# Visualizing the p-value
# Bin every cell by the strictest significance level its Local Moran p-value reaches.
# The levels are applied from loosest to strictest so that the stricter label wins.
significance_levels = (
    (0.10, 'p-value<0.1'),
    (0.05, 'p-value<0.05'),  # was mislabelled 'p-value<0.95'
    (0.01, 'p-value<0.01'),
)

h3_crimes_count['cluster_moran'] = 'Not_Significant'
for threshold, label in significance_levels:
    h3_crimes_count.loc[h3_crimes_count['moran_pvalue'] < threshold, 'cluster_moran'] = label


# Define discrete color map (keys must match the labels assigned above)
color_map = {
    'Not_Significant': 'lightgray',
    'p-value<0.1': 'lightgreen',
    'p-value<0.05': 'green',
    'p-value<0.01': 'darkgreen',
}

px.choropleth_map(h3_crimes_count, geojson=h3_crimes_count.geometry, locations=h3_crimes_count.index,
                          color='cluster_moran', color_discrete_map=color_map,
                          hover_data=['moran_zscore', 'moran_pvalue', 'cluster_moran'],
                          zoom=10, center={"lat": 41.8781, "lon": -87.6298},
                          height= 1000, map_style="light", title="Anselin Local Moran'I p-values for crimes with arrest"
                 )

### G. Hot and cold spots with Getis–Ord Gi*


In [None]:
# Compute the local Getis-Ord statistic of the per-cell crime counts.
# star=True includes each cell's own count in its local sum, which is what
# distinguishes Gi* from Gi; esda's default (star=False) leaves the focal cell
# out and would therefore compute Gi, not the Gi* this section reports.
g_local = esda.G_Local(h3_crimes_count['point_count'].astype('float64'), w, star=True)

# Extract the simulated z-scores and p-values
h3_crimes_count['Gi_star_zscore'] = g_local.z_sim
h3_crimes_count['Gi_star_pvalue'] = g_local.p_sim

In [None]:
# Continuous map of the Gi_star_zscore column on a diverging colour scale
px.choropleth_map(h3_crimes_count, geojson=h3_crimes_count.geometry, locations=h3_crimes_count.index,
                          color='Gi_star_zscore', color_continuous_scale="RdBu_r",
                          hover_data=['Gi_star_zscore','Gi_star_pvalue'],
                          zoom=10, center={"lat": 41.8781, "lon": -87.6298}, 
                          height= 1000, map_style="dark"
                 )

In [None]:
# Classify every cell from its Gi* z-score (direction) and p-value (confidence)
p_value = h3_crimes_count['Gi_star_pvalue']
z_score = h3_crimes_count['Gi_star_zscore']

# NOTE(review): the two z-score cut-offs are not symmetric (2.0 vs -1.5) - confirm this is intended
is_hot = z_score > 2.0
is_cold = z_score < -1.5

# Rules are applied in order, so within each direction the 99% label overrides the 95% one
rules = (
    (is_hot & (p_value < 0.05), 'Hotspot_95'),
    (is_hot & (p_value < 0.01), 'Hotspot_99'),
    (is_cold & (p_value < 0.05), 'Coldspot_95'),
    (is_cold & (p_value < 0.01), 'Coldspot_99'),
)

h3_crimes_count['cluster'] = 'Not_Significant'
for selector, label in rules:
    h3_crimes_count.loc[selector, 'cluster'] = label

# Store the labels as a categorical column
h3_crimes_count['cluster'] = h3_crimes_count['cluster'].astype('category')

# Discrete colour of every label
color_map = {
    'Not_Significant': 'lightgray',
    'Hotspot_95': 'lightcoral',
    'Hotspot_99': 'red',
    'Coldspot_95': 'blue',
    'Coldspot_99': 'darkblue',
}

fig = px.choropleth_map(
    h3_crimes_count,
    geojson=h3_crimes_count.geometry,
    locations=h3_crimes_count.index,
    color='cluster',
    color_discrete_map=color_map,
    hover_data=['Gi_star_zscore', 'Gi_star_pvalue', 'cluster'],
    zoom=10,
    center={"lat": 41.8781, "lon": -87.6298},
    height=1000,
    map_style="light",
)
fig

In [None]:
# Clear the outputs of the notebook file crime_sol.ipynb in place, so that it is clean when you commit it
!jupyter nbconvert --clear-output --inplace crime_sol.ipynb