In [1]:
import geopandas as gpd
from geopandas.tools import geocode
from shapely.geometry import Point, Polygon
import geopy
import matplotlib as plt
import seaborn as sns
import pandas as pd
import contextily as cx
import numpy as np

In [None]:
# import crime dataset from cville open data portal

crimedata=pd.read_csv("Crime_Data.csv")

# crime_data is the raw data, uncleaned, from the open data portal




# Create a new column "Street_Address" that combines Block Number & Street Name
crimedata["BlockNum_STR"] = crimedata["BlockNumber"].apply(
    lambda x: str(int(x)) if pd.notna(x) else ""
)

crimedata["Street_Address"] = np.where(
    crimedata["BlockNum_STR"] != "",
    crimedata["BlockNum_STR"] + " " + crimedata["StreetName"],
    crimedata["StreetName"]
)


# Remove any apartment numbers because that could harm the geocoding

crimedata["Street_Address"]=crimedata["Street_Address"].str.split(",", n=1).str[0]
crimedata.head()

# Now the crimedata dataframe contains a column called "Street_Address", this is a cleaned combination of block number and streetname


Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,ReportingOfficer,BlockNum_STR,Street_Address
0,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025/07/08 21:23:03+00,1723,"Crowley, Raeann",2100,2100 MICHIE DR
1,2,Shots Fired/Illegal Hunting,202500023699,600.0,10 1/2 ST NW,CPD,2025/07/08 20:48:27+00,1648,"Curry, Brian",600,600 10 1/2 ST NW
2,3,Sex Offense - Forcible Sodomy,202500023694,200.0,2ND ST NW,CPD,2025/07/08 20:35:32+00,1635,"O'Briant, Landon",200,200 2ND ST NW
3,4,Vandalism,202500023691,1100.0,E MARKET ST,CPD,2025/07/08 20:22:29+00,1622,"Benbow, Lauren",1100,1100 E MARKET ST
4,5,Larceny - Shoplifitng,202500023686,500.0,W MAIN ST,CPD,2025/07/08 20:12:33+00,1612,"Benbow, Lauren",500,500 W MAIN ST


In [4]:
# Geocoding with Geopandas and Geopy
""" 
This code takes about 45 minutes to run because it uses each of the cleaned Street_Address entries in the crimedata dataframe, 
and "geocodes" or looks them up via arcgis. The output is a geometric feature called a point feature that includes X,Y coordinates.
The result is a geodataframe (courtesy of geopandas), that attaches one of these points to each street address and therefore each crimedata record.

LIMITATION: there are a number of instances where addresses were entered incorrectly or not specific enough. These had to be excluded from the dataset 
because we were unable to geocode them. 

"""

"""
gdf=geocode(crimedata["Street_Address"], provider="arcgis", user_agent="python-requests/2.32.5")
gdf.to_file("CrimeGeocoded.shp", index=False)
"""

'\ngdf=geocode(crimedata["Street_Address"], provider="arcgis", user_agent="python-requests/2.32.5")\ngdf.to_file("CrimeGeocoded.shp", index=False)\n'

In [5]:
"""
Because the geocoding takes so long to run, I saved my initial output as its own file so I can easily reload it when I close this notebook.
"""

geo_points=gpd.read_file("CrimeGeocoded.shp")
geo_points.head()

Unnamed: 0,address,geometry
0,"2100 Michie Dr, Charlottesville, Virginia, 22901",POINT (-78.48384 38.063)
1,"600 10 1/2 St NW, Charlottesville, Virginia, 2...",POINT (-78.49226 38.03935)
2,"200 2nd St NW, Charlottesville, Virginia, 22902",POINT (-78.48178 38.03181)
3,"1100 E Market St, Charlottesville, Virginia, 2...",POINT (-78.47215 38.02798)
4,"500 W Main St, Charlottesville, Virginia, 22903",POINT (-78.48788 38.03079)


In [6]:
# import the key table I made to connect each geocoded address with the corresponding crime data entry

key_table=pd.read_csv("geo_crime_key_table.csv")
key_table.head()

Unnamed: 0,index,lower_geo_add,geo_address,PD_Text_Address,Notes
0,0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,
1,1,"600 10 1/2 St NW, Charlottesville, Virginia, 2...","600 10 1/2 ST NW, CHARLOTTESVILLE, VA",600 10 1/2 ST NW,
2,2,"200 2nd St NW, Charlottesville, Virginia, 22902","200 2ND ST NW, CHARLOTTESVILLE, VA",200 2ND ST NW,
3,3,"1100 E Market St, Charlottesville, Virginia, 2...","1100 E MARKET ST, CHARLOTTESVILLE, VA",1100 E MARKET ST,
4,4,"500 W Main St, Charlottesville, Virginia, 22903","500 W MAIN ST, CHARLOTTESVILLE, VA",500 W MAIN ST,


In [None]:
# Merge the key table with the geo_point data so that each point has the cville PD address
# as well as the correct geocoded address and point information
merged=pd.merge(
    geo_points, 
    key_table, 
    left_on="address", 
    right_on="lower_geo_add", 
    how="outer", 
    validate="many_to_many", 
    indicator="Matched")
merged["Matched"].value_counts() 

# the value counts show that only 17 of 5137 entries failed, so we can continue with an inner join

merged=pd.merge(
    geo_points, 
    key_table, 
    left_on="address", 
    right_on="lower_geo_add", 
    how="inner", 
    validate="many_to_many",
    )
merged.head() 
# the "merged" table is a geodataframe so it includes all the geometric info as 
# well as the correct text to match the Street_Address field in the crimedata

Unnamed: 0,address,geometry,index,lower_geo_add,geo_address,PD_Text_Address,Notes
0,"2100 Michie Dr, Charlottesville, Virginia, 22901",POINT (-78.48384 38.063),0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,
1,"600 10 1/2 St NW, Charlottesville, Virginia, 2...",POINT (-78.49226 38.03935),1,"600 10 1/2 St NW, Charlottesville, Virginia, 2...","600 10 1/2 ST NW, CHARLOTTESVILLE, VA",600 10 1/2 ST NW,
2,"200 2nd St NW, Charlottesville, Virginia, 22902",POINT (-78.48178 38.03181),2,"200 2nd St NW, Charlottesville, Virginia, 22902","200 2ND ST NW, CHARLOTTESVILLE, VA",200 2ND ST NW,
3,"1100 E Market St, Charlottesville, Virginia, 2...",POINT (-78.47215 38.02798),3,"1100 E Market St, Charlottesville, Virginia, 2...","1100 E MARKET ST, CHARLOTTESVILLE, VA",1100 E MARKET ST,
4,"500 W Main St, Charlottesville, Virginia, 22903",POINT (-78.48788 38.03079),4,"500 W Main St, Charlottesville, Virginia, 22903","500 W MAIN ST, CHARLOTTESVILLE, VA",500 W MAIN ST,


In [None]:
# merge the "merged" dataframe with the actual crimedata so that each record has attached geometry
geo_merged=pd.merge(
    merged, 
    crimedata, 
    left_on="PD_Text_Address", 
    right_on="Street_Address", 
    how="outer", 
    validate="many_to_many", 
    indicator="Matched"
    )
geo_merged["Matched"].value_counts() 

# the number of failed rows is less than 2% of the data, so I proceed with an inner join

Matched
both          228990
right_only      3504
left_only        674
Name: count, dtype: int64

In [13]:
geo_merged=pd.merge(
    merged, 
    crimedata, 
    left_on="PD_Text_Address", 
    right_on="Street_Address", 
    how="inner", 
    validate="many_to_many", 
    )
geo_merged.head()

Unnamed: 0,address,geometry,index,lower_geo_add,geo_address,PD_Text_Address,Notes,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,ReportingOfficer,BlockNum_STR,Street_Address
0,"2100 Michie Dr, Charlottesville, Virginia, 22901",POINT (-78.48384 38.063),0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,"MICHIE DR, 35B",CPD,2025/07/08 21:23:03+00,1723,"Crowley, Raeann",2100,2100 MICHIE DR
1,"2100 Michie Dr, Charlottesville, Virginia, 22901",POINT (-78.48384 38.063),0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,49,Shots Fired/Illegal Hunting,202500023058,2100.0,"MICHIE DR, 106",CPD,2025/07/04 03:35:26+00,2335,"Pinkman, Jackson",2100,2100 MICHIE DR
2,"2100 Michie Dr, Charlottesville, Virginia, 22901",POINT (-78.48384 38.063),0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,105,Assault Simple,202500022496,2100.0,"MICHIE DR, 47A",CPD,2025/06/30 01:43:46+00,2143,"Vlasis, Christopher",2100,2100 MICHIE DR
3,"2100 Michie Dr, Charlottesville, Virginia, 22901",POINT (-78.48384 38.063),0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,171,Burglary,202500021792,2100.0,"MICHIE DR, 35B",CPD,2025/06/24 18:11:29+00,1411,"Ryan, Matthew",2100,2100 MICHIE DR
4,"2100 Michie Dr, Charlottesville, Virginia, 22901",POINT (-78.48384 38.063),0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,186,Disorderly Conduct,202500021611,2100.0,"MICHIE DR, 52B",CPD,2025/06/23 15:06:58+00,1106,"Mian, Ghulam",2100,2100 MICHIE DR


In [14]:
crimedata_with_geometry=geo_merged
geo_merged.to_file("crimedata_with_geometry.shp")

  geo_merged.to_file("crimedata_with_geometry.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


Using open source GIS software "QGIS", Hannah Egl uploaded the crimedata_with_geometry file as well as the "Police Neighborhood Area" shapefile available at the Cville Open Data Portal onto a single map. On this map, she used a geoprocessing tool to assign each point to the "neighborhood" they fall inside. This means that she now has a shapefile (a file much like the geodataframe that geopandas creates), that has each crime record where the street address was a real address (98% of the records). Each of these records also has an assigned "neighborhood" that allows us to create a categorical variable that simplifies the location information for that crime and summarize crime statistics to general vacinities rather than precise locations. 

In [17]:
full_crimedata=gpd.read_file("crime_points_neighborhood_.gpkg")
full_crimedata.head()

Unnamed: 0,address,index,lower_geo_,geo_addres,PD_Text_Ad,Notes,RecordID,Offense,IncidentID,BlockNumbe,...,ReportingO,BlockNum_S,Street_Add,NeighborhoodInfoOBJECTID,NeighborhoodInfoBEAT_NO,NeighborhoodInfoNAME,NeighborhoodInfoPOPULATION,NeighborhoodInfoDISTRICT,NeighborhoodInfoGlobalID,geometry
0,"2100 Michie Dr, Charlottesville, Virginia, 22901",0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,1,Assist Citizen - Mental/TDO/ECO,202500023705,2100.0,...,"Crowley, Raeann",2100,2100 MICHIE DR,8.0,30,29 North,488.0,5.0,{CC76DA2F-2DA3-4233-ADFD-F8B52EC4A1B2},POINT (-78.48384 38.063)
1,"2100 Michie Dr, Charlottesville, Virginia, 22901",0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,49,Shots Fired/Illegal Hunting,202500023058,2100.0,...,"Pinkman, Jackson",2100,2100 MICHIE DR,8.0,30,29 North,488.0,5.0,{CC76DA2F-2DA3-4233-ADFD-F8B52EC4A1B2},POINT (-78.48384 38.063)
2,"2100 Michie Dr, Charlottesville, Virginia, 22901",0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,105,Assault Simple,202500022496,2100.0,...,"Vlasis, Christopher",2100,2100 MICHIE DR,8.0,30,29 North,488.0,5.0,{CC76DA2F-2DA3-4233-ADFD-F8B52EC4A1B2},POINT (-78.48384 38.063)
3,"2100 Michie Dr, Charlottesville, Virginia, 22901",0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,171,Burglary,202500021792,2100.0,...,"Ryan, Matthew",2100,2100 MICHIE DR,8.0,30,29 North,488.0,5.0,{CC76DA2F-2DA3-4233-ADFD-F8B52EC4A1B2},POINT (-78.48384 38.063)
4,"2100 Michie Dr, Charlottesville, Virginia, 22901",0,"2100 Michie Dr, Charlottesville, Virginia, 22901","2100 MICHIE DR, CHARLOTTESVILLE, VA",2100 MICHIE DR,,186,Disorderly Conduct,202500021611,2100.0,...,"Mian, Ghulam",2100,2100 MICHIE DR,8.0,30,29 North,488.0,5.0,{CC76DA2F-2DA3-4233-ADFD-F8B52EC4A1B2},POINT (-78.48384 38.063)


In [18]:
# Develop a category map to assign each offense to a broader topical category to simplify and generalize analysis

category_map = {
    'Assault & Violent Crime': ['assault', 'battery', 'homicide', 'murder', 'rape', 'sex offense', 'threats', 'robbery','harassment','stalking','kidnap','prowler','extortion'],
    'Theft & Larceny': ['larceny', 'shoplifting', 'stolen', 'theft', 'embezzlement','burglary'],
    'Property Damage': ['vandalism', 'graffiti', 'damage', 'arson'],
    'Narcotics/Drug Offenses': ['narcotics', 'drug', 'marijuana', 'possession', 'distribute','driving under the influence','towed vehicle'],
    'Motor Vehicle/Traffic': ['traffic', 'dui', 'dwi', 'reckless', 'unlicensed', 'accident','hit and run'],
    'Weapon Offenses': ['weapon', 'firearm', 'gun', 'shots fired', 'illegal hunting'],
    'Public Order/Disorderly': ['disorderly', 'public intoxication', 'trespassing', 'loitering', 'noise','domestic disturbance','disturbarance - non domestic','trespass','prostitution'],
    'Fraud & Financial': ['fraud', 'forgery', 'counterfeit', 'scam', 'financial','false report of crime'],
    'Mental Health/Welfare': ['mental', 'tdo', 'eco', 'suicide', 'assist citizen','missing person','runaway','crisis assessment','juvenile investigation']
}

# Implement the category map to the crime data

def assign_category(offense_type):
    """
    Checks the offense type against defined keywords and assigns a category.
    Returns 'Other' if no match is found.
    """
    # Convert to lowercase for case-insensitive matching
    offense_lower = offense_type.lower()
    
    # Iterate through categories and their keywords
    for category, keywords in category_map.items():
        if any(keyword in offense_lower for keyword in keywords):
            return category
    
    return 'Other'



full_crimedata['Offense_cat'] = full_crimedata['Offense'].apply(assign_category)

full_crimedata["Offense_cat"].value_counts()

Offense_cat
Theft & Larceny            68924
Other                      44054
Assault & Violent Crime    35277
Mental Health/Welfare      20762
Motor Vehicle/Traffic      16753
Property Damage            14964
Public Order/Disorderly    11792
Fraud & Financial           8618
Narcotics/Drug Offenses     4216
Weapon Offenses             3630
Name: count, dtype: int64