In [11]:
import pandas as pd
import geopandas as gpd
import zipfile

In [12]:
# Input file paths (relative to the notebook's working directory)
census_tract_shp = 'la_census_tract/LA_City_2020_Census_Tracts_.shp'
crime_csv = 'crime_data/crime_la_2020_2024.csv'

# Years to process: 2020 through 2024 inclusive.
# range() excludes its stop value, so the stop must be 2025 to include 2024.
years = list(range(2020, 2025))

In [13]:
# Load the census tract polygons, reproject them to EPSG:4326 (the CRS later
# assigned to the crime points) and keep only the geometry and the 'CT20' column.
tracts_gdf = (
    gpd.read_file(census_tract_shp)
    .to_crs(epsg=4326)
    [['geometry', 'CT20']]
)

# Load the crime point records (large file, so this read can take a while).
crime_df = pd.read_csv(crime_csv)

In [14]:
# Parse the report date once, store it back as datetimes, and derive the
# report year that the per-year loop filters on.
# NOTE(review): the format is inferred by pandas; if the column has one fixed
# layout, passing format=... would be faster and silence inference warnings — confirm.
date_reported = pd.to_datetime(crime_df['Date Rptd'])
crime_df['Date Rptd'] = date_reported
crime_df['Year'] = date_reported.dt.year

  crime_df['Date Rptd'] = pd.to_datetime(crime_df['Date Rptd'])


In [15]:
# Helper that counts crime points per census tract and crime code
def aggregate_points(points_gdf, geometry_gdf):
    """Count points per ('CT20', 'Crm Cd') pair.

    Parameters
    ----------
    points_gdf : GeoDataFrame
        Point features; must contain a 'Crm Cd' column.
    geometry_gdf : GeoDataFrame
        Polygon features; must contain a 'CT20' column.

    Returns
    -------
    DataFrame
        One row per ('CT20', 'Crm Cd') combination with a 'count' column
        holding the number of points that fall within that polygon.
        Points that fall within no polygon are dropped (inner join).
    """
    # Attach the enclosing tract's attributes to every point
    joined = gpd.sjoin(points_gdf, geometry_gdf, how='inner', predicate='within')
    # Number of joined rows for each tract / crime-code combination
    group_sizes = joined.groupby(['CT20', 'Crm Cd']).size()
    return group_sizes.reset_index(name='count')
    

In [16]:
# Collect one aggregated table per year and concatenate them once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration, and concatenating onto an empty DataFrame is
# deprecated behaviour in recent pandas versions.
yearly_results = []

for year in years:
    # Crime records reported in the current year
    crime_df_year = crime_df[crime_df['Year'] == year]

    # Build point geometries from the LON/LAT columns and tag them as
    # EPSG:4326 so they share the CRS of the tract polygons
    crime_gdf_year = gpd.GeoDataFrame(
        crime_df_year,
        geometry=gpd.points_from_xy(crime_df_year.LON, crime_df_year.LAT),
    )
    crime_gdf_year = crime_gdf_year.set_crs(epsg=4326)

    # Count crimes per census tract and crime code
    aggregated_points = aggregate_points(crime_gdf_year, tracts_gdf)

    # Record which year these counts belong to
    aggregated_points['Year'] = year

    yearly_results.append(aggregated_points)

# pd.concat raises on an empty list, so fall back to an empty frame when
# there are no years to process
if yearly_results:
    combined_results = pd.concat(yearly_results, ignore_index=True)
else:
    combined_results = pd.DataFrame()

In [17]:
# Build the 'geoid10' identifier: the fixed prefix '06037' followed by CT20.
# NOTE(review): '06037' is presumably the state+county FIPS prefix, and the
# column is named 'geoid10' although it is built from the CT20 tract code — confirm.
combined_results['geoid10'] = '06037' + combined_results['CT20'].astype(str)
combined_results = combined_results[['geoid10', 'Crm Cd', 'count', 'Year']]

# Save the per-tract, per-crime-code, per-year counts to a CSV file
combined_results.to_csv('crime_data/crime_la_2020_2024_by_tract_type.csv', index=False)

# Preview five random rows. Sampling five directly is equivalent to taking the
# head of a larger random sample, and capping at the frame length avoids a
# ValueError when fewer rows are available than requested.
combined_results.sample(min(5, len(combined_results)))

Unnamed: 0,geoid10,Crm Cd,count,Year
68539,6037276601,626,8,2021
37164,6037108101,331,4,2021
124903,6037195803,480,4,2023
125637,6037201120,110,1,2023
35119,6037980021,510,8,2020


In [18]:
# Keep records from 2020 onwards
combined_results_2020_2024 = combined_results[combined_results['Year'] >= 2020]
# Total crimes per tract and year. Only 'count' is summed: 'Crm Cd' is a
# crime-code identifier, so adding the codes together is meaningless and the
# column is simply left out of the aggregation.
combined_results_2020_2024 = (
    combined_results_2020_2024
    .groupby(['geoid10', 'Year'], as_index=False)['count']
    .sum()
)
# Give the tracts the same 'geoid10' key so their geometry can be merged in
tracts_gdf['geoid10'] = '06037' + tracts_gdf['CT20'].astype(str)
combined_results_2020_2024 = combined_results_2020_2024.merge(tracts_gdf, on='geoid10', how='left')
# The merge result is treated as a plain DataFrame here; rebuild a GeoDataFrame
# and pass the tract CRS explicitly so it is carried into the written shapefile
combined_results_2020_2024 = gpd.GeoDataFrame(
    combined_results_2020_2024,
    geometry='geometry',
    crs=tracts_gdf.crs,
)
# Store the count column as integers
combined_results_2020_2024['count'] = combined_results_2020_2024['count'].astype(int)
# Save the per-tract, per-year totals to a shapefile
combined_results_2020_2024.to_file('map_data/crime_la_aggregated.shp')

In [19]:
# Bundle the shapefile and its sidecar files into a single zip archive.
# Each file is stored under the same relative path it has on disk.
shapefile_stem = 'map_data/crime_la_aggregated'
with zipfile.ZipFile(shapefile_stem + '.zip', 'w') as archive:
    for extension in ('.shp', '.shx', '.dbf', '.prj', '.cpg'):
        archive.write(shapefile_stem + extension)