In [1]:
import pandas as pd
import geopandas as gpd
import zipfile

In [2]:
# Input locations: census tract boundaries (shapefile) and crime incidents (CSV)
census_tract_shp = 'chicago_census_tract/geo_export_2d3ffe6a-dd57-46d5-86e3-4f42a4df0e73.shp'
crime_csv = 'crime_data/crime_chicago_2001_2024.csv'

# Years to process: 2020 through 2024 inclusive (range's upper bound is exclusive)
years = [*range(2020, 2025)]

In [None]:
# Read the census tract shapefile, reproject it to EPSG:4326 so it shares a CRS
# with the crime points built from Longitude/Latitude, and keep only the two
# columns used downstream (tract geometry and tract id).
tracts_gdf = gpd.read_file(census_tract_shp).to_crs(epsg=4326)[['geometry', 'geoid10']]

# Read the crime points. This is a very large dataset, so load only the columns
# this notebook actually uses instead of the full table.
crime_df = pd.read_csv(
    crime_csv,
    usecols=['Year', 'Primary Type', 'Latitude', 'Longitude'],
)

# Show the first 5 rows of tracts_gdf; note that geoid10 is 11 digits long
tracts_gdf.head()

Unnamed: 0,geometry,geoid10
0,"POLYGON ((-87.62405 41.73022, -87.62405 41.730...",17031842400
1,"POLYGON ((-87.68608 41.82296, -87.68607 41.823...",17031840300
2,"POLYGON ((-87.62935 41.8528, -87.62934 41.8525...",17031841100
3,"POLYGON ((-87.68813 41.85569, -87.68816 41.856...",17031841200
4,"POLYGON ((-87.63312 41.87449, -87.63306 41.874...",17031839000


In [60]:
# Define a function to aggregate crime point counts by census tract and by type of crime
def aggregate_points(points_gdf, geometry_gdf):
    """Count points per census tract and crime type.

    Parameters
    ----------
    points_gdf : GeoDataFrame
        Point records; must carry a 'Primary Type' column.
    geometry_gdf : GeoDataFrame
        Polygons to join against; must carry a 'geoid10' column.

    Returns
    -------
    DataFrame
        One row per ('geoid10', 'Primary Type') pair that occurs in the join,
        with the number of matching points in a 'count' column.
    """
    # Inner join: a point is kept only when it lies within one of the polygons
    joined = gpd.sjoin(points_gdf, geometry_gdf, how='inner', predicate='within')
    # Size of each (tract, crime type) group = number of points in that group
    group_sizes = joined.groupby(['geoid10', 'Primary Type']).size()
    return group_sizes.reset_index(name='count')
    

The code above defines a function named `aggregate_points` that aggregates crime points by census tract and type of crime. Here's a detailed explanation:

```python
def aggregate_points(points_gdf, geometry_gdf):
    points_gdf = gpd.sjoin(points_gdf, geometry_gdf, how='inner', predicate='within')
    points_gdf = points_gdf.groupby(['geoid10', 'Primary Type']).size().reset_index(name='count')
    return points_gdf
```

### Explanation:

1. **Function Definition**:
   ```python
   def aggregate_points(points_gdf, geometry_gdf):
   ```
   - The function `aggregate_points` takes two arguments:
     - `points_gdf`: A GeoDataFrame containing the crime points data.
     - `geometry_gdf`: A GeoDataFrame containing the census tract geometries.

2. **Spatial Join**:
   ```python
   points_gdf = gpd.sjoin(points_gdf, geometry_gdf, how='inner', predicate='within')
   ```
   - Performs a spatial join between `points_gdf` and `geometry_gdf`.
   - The `how='inner'` parameter ensures that only points within the census tracts are kept.
   - The `predicate='within'` parameter specifies that the join should be based on points being within the geometries.

3. **Grouping and Aggregation**:
   ```python
   points_gdf = points_gdf.groupby(['geoid10', 'Primary Type']).size().reset_index(name='count')
   ```
   - Groups the joined GeoDataFrame by `geoid10` (census tract identifier) and `Primary Type` (type of crime).
   - Uses `.size()` to count the number of occurrences for each group.
   - Resets the index so the group keys become regular columns again, and names the new count column `count`.

4. **Return Result**:
   ```python
   return points_gdf
   ```
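   - Returns the aggregated result. Note that after `groupby(...).size().reset_index(...)` this is a plain pandas DataFrame (no geometry column), not a GeoDataFrame.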

### Summary:
The `aggregate_points` function aggregates crime data by census tract and type of crime, returning a pandas DataFrame with the counts of each crime type within each census tract.

In [61]:
# Collect one aggregated frame per year and concatenate them once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration and starts from an empty frame.
yearly_results = []

# Loop through each year
for year in years:
    # Filter the crime data for the current year. Rows without coordinates are
    # dropped up front: they cannot be placed in a tract, so the inner spatial
    # join would discard them anyway.
    crime_df_year = crime_df[crime_df['Year'] == year].dropna(subset=['Longitude', 'Latitude'])

    # Convert the filtered crime data to a GeoDataFrame of points in EPSG:4326,
    # the same CRS tracts_gdf was reprojected to
    crime_gdf_year = gpd.GeoDataFrame(
        crime_df_year,
        geometry=gpd.points_from_xy(crime_df_year['Longitude'], crime_df_year['Latitude']),
        crs='EPSG:4326',
    )

    # Aggregate the crime points by census tract and crime type
    aggregated_points = aggregate_points(crime_gdf_year, tracts_gdf)

    # Tag the aggregated rows with the year they belong to
    aggregated_points['Year'] = year

    yearly_results.append(aggregated_points)

# Single concatenation; fall back to an empty frame if no year was processed
combined_results = (
    pd.concat(yearly_results, ignore_index=True) if yearly_results else pd.DataFrame()
)

In [62]:
# Save the combined results to a CSV file.
# NOTE(review): the file name says 2001_2024, but combined_results only holds the
# years listed in `years` -- confirm whether the name should be updated.
combined_results.to_csv('crime_data/crime_chicago_2001_2024_by_tract_type.csv', index=False)
# Display 5 randomly chosen rows. Sampling only what is shown avoids the
# ValueError that sample(10000) raises when there are fewer than 10,000 rows,
# and a fixed seed makes the preview reproducible on re-run.
combined_results.sample(n=min(5, len(combined_results)), random_state=42)

Unnamed: 0,geoid10,Primary Type,count,Year
26632,17031051000,WEAPONS VIOLATION,1,2022
3457,17031210501,OFFENSE INVOLVING CHILDREN,2,2020
44718,17031351100,OTHER OFFENSE,33,2023
41052,17031140800,CRIMINAL SEXUAL ASSAULT,3,2023
56047,17031250400,BATTERY,66,2024


In [63]:
# Total crimes per census tract and year, with the tract geometry attached.
combined_results_2020_2024 = (
    # Keep rows where Year is 2020 or later
    combined_results[combined_results['Year'] >= 2020]
    # Sum only the 'count' column per (tract, year). Summing the whole frame would
    # also run over the 'Primary Type' strings just to drop that column afterwards.
    .groupby(['geoid10', 'Year'], as_index=False)['count']
    .sum()
    # Store the count as an integer
    .astype({'count': int})
    # Copy the 'geometry' column from tracts_gdf onto each (tract, year) row
    .merge(tracts_gdf, on='geoid10', how='left')
)
# Convert to a GeoDataFrame; pass the tracts' CRS explicitly so the output CRS
# does not depend on it being inferred from the merged geometry column
combined_results_2020_2024 = gpd.GeoDataFrame(
    combined_results_2020_2024,
    geometry='geometry',
    crs=tracts_gdf.crs,
)
# Save the combined results to a shapefile
combined_results_2020_2024.to_file('map_data/crime_chicago_aggregated.shp')


In [64]:
# Zip the shapefile together with its sidecar files.
# ZipFile defaults to ZIP_STORED (no compression), so request DEFLATE explicitly.
shapefile_stem = 'map_data/crime_chicago_aggregated'
with zipfile.ZipFile(f'{shapefile_stem}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as z:
    # Same five members, in the same order and under the same archive paths
    for extension in ('.shp', '.shx', '.dbf', '.prj', '.cpg'):
        z.write(f'{shapefile_stem}{extension}')