The final product is a line plot with year on the x-axis and crime rate on the y-axis. Users can choose which counties (or aggregates of counties, such as NorCal, greater LA, or the entire state) they want to see data for, and which types of crime (or aggregates of crime types) to include. Users could also filter by year as a way of zooming in and out.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets, Layout, HTML
import matplotlib.ticker as ticker

%matplotlib widget

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the crime table from the CSV that sits next to the notebook.
# Judging by the preview below, each row holds one (Year, County) pair with
# one column per crime category plus a Population column.
crime_csv_path = "all_counties_crime_w_pop.csv"
raw_df = pd.read_csv(crime_csv_path)

# Leave the frame as the last expression so the notebook renders a preview.
raw_df

Unnamed: 0.1,Unnamed: 0,Year,County,Violent Crimes,Homicide,Rape (Forcible Rape prior to 2014),Rape,Attempted Rape,Robbery,Weapon,...,Over $400,Over $200,$200 through $400,$50 through $199,Under $50,Arson,Structural Property,Mobile Property,Other Property,Population
0,0,1985,California,0.007642,0.000105,0.000433,0.000315,0.000118,0.003270,0.0,...,0.007669,0.000000,0.005242,0.008628,0.012220,0.000774,0.000301,0.000194,0.000279,26441107
1,1,1986,California,0.009164,0.000112,0.000447,0.000322,0.000125,0.003413,0.0,...,0.007596,0.000000,0.005528,0.008510,0.012055,0.000728,0.000267,0.000211,0.000250,27102238
2,2,1987,California,0.009149,0.000105,0.000436,0.000317,0.000119,0.003001,0.0,...,0.007716,0.000000,0.005350,0.007864,0.011355,0.000666,0.000248,0.000198,0.000220,27777160
3,3,1988,California,0.009204,0.000104,0.000414,0.000304,0.000110,0.003028,0.0,...,0.008161,0.000000,0.005273,0.007706,0.011629,0.000662,0.000242,0.000190,0.000230,28464250
4,4,1989,California,0.009720,0.000108,0.000409,0.000308,0.000101,0.003300,0.0,...,0.008564,0.000000,0.005355,0.007667,0.011684,0.000654,0.000227,0.000187,0.000240,29218165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2296,2296,2023,Tulare,0.005161,0.000089,0.000535,0.000478,0.000056,0.000745,,...,,0.007542,,0.001991,0.003010,0.000374,0.000052,0.000029,0.000293,480747
2297,2297,2023,Tuolumne,0.006818,0.000018,0.000813,0.000813,0.000000,0.000166,,...,,0.004009,,0.000831,0.001534,0.000148,0.000018,0.000000,0.000129,54123
2298,2298,2023,Ventura,0.002894,0.000025,0.000271,0.000265,0.000006,0.000549,,...,,0.006052,,0.001575,0.002413,0.000108,0.000018,0.000012,0.000078,833071
2299,2299,2023,Yolo,0.002517,0.000018,0.000305,0.000296,0.000009,0.000467,,...,,0.011381,,0.002404,0.002431,0.000211,0.000054,0.000031,0.000126,222919


In [3]:
# Remove the bookkeeping columns that are not crime rates.
# errors="ignore" keeps this cell re-runnable: raw_df is overwritten in place,
# so on a second execution the columns are already gone and a plain drop()
# would raise KeyError.
raw_df = raw_df.drop(columns=["Population", "Unnamed: 0"], errors="ignore")

# Option lists used by the widgets and the plotting function.
# Years are sorted so that slicing such as years[::2] yields evenly spaced
# ticks regardless of the row order in the CSV.
years = sorted(raw_df["Year"].unique())
# County order is kept as it appears in the data (first entry is the default
# selection in the county widget).
counties = list(raw_df["County"].unique())
# Every column after the first two is treated as a crime-rate column
# (assumes the first two remaining columns are Year and County, as the
# preview of raw_df suggests).
crime_types = list(raw_df.columns[2:])

In [4]:
# Reshape from wide (one column per crime type) to long (one row per
# Year / County / Crime Type), which is the layout the plotting code filters on.
melted_df = (
    raw_df
    .melt(id_vars=["Year", "County"], value_vars=crime_types)
    # melt() names its output columns "variable" and "value" by default.
    .rename(columns={"variable": "Crime Type", "value": "Rate"})
    .sort_values(by=["Year", "County"])
    # Rows without a recorded rate cannot be plotted, so discard them.
    .dropna(subset=["Rate"])
)

# Short alias used by the rest of the notebook.
df = melted_df
df.head()

Unnamed: 0,Year,County,Crime Type,Rate
39,1985,Alameda,Violent Crimes,0.009711
2340,1985,Alameda,Homicide,0.000119
4641,1985,Alameda,Rape (Forcible Rape prior to 2014),0.000661
6942,1985,Alameda,Rape,0.000525
9243,1985,Alameda,Attempted Rape,0.000135


In [5]:
def plot_data(selected_counties, selected_crime_types, aggregate_method="sum"):
    """Plot yearly crime-rate lines for the chosen counties and crime types.

    Reads the notebook-level long-format frame ``df`` (columns ``Year``,
    ``County``, ``Crime Type``, ``Rate``) and the notebook-level ``years``
    list. Rates are multiplied by 1,000 before plotting, matching the y-axis
    label.

    What is drawn depends on how many distinct items are selected:

    * exactly one county and/or exactly one crime type: one line per
      (county, crime type) pair;
    * several counties and several crime types: one line per county, where
      that county's selected crime types are combined per year with
      ``aggregate_method``;
    * whenever more than one county or more than one crime type is selected:
      an additional dashed red line that combines every selected row per year
      with ``aggregate_method``.

    Parameters
    ----------
    selected_counties : sequence of str
        County names to include. Duplicates are ignored; order is preserved.
    selected_crime_types : sequence of str
        Crime-type names to include. Duplicates are ignored; order is preserved.
    aggregate_method : str, default "sum"
        Aggregation handed to ``GroupBy.agg`` (the notebook's dropdown offers
        'mean', 'sum', 'median', 'min' and 'max').

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``. If either selection is
        empty, a message is printed and nothing is drawn.
    """
    if not selected_counties or not selected_crime_types:
        print("Please select at least one county and one crime type.")
        return

    # De-duplicate but keep the caller's order. Iteration order of a set of
    # strings is not stable across interpreter sessions, which made legend
    # order and county colours change from run to run.
    unique_counties = list(dict.fromkeys(selected_counties))
    unique_crime_types = list(dict.fromkeys(selected_crime_types))
    single_county = len(unique_counties) == 1
    single_crime_type = len(unique_crime_types) == 1

    # Close the figure left behind by the previous call (no-op if there is
    # none) so repeated widget updates do not accumulate open figures.
    figure_label = "Crime rate trends"
    plt.close(figure_label)
    plt.figure(num=figure_label, figsize=(10, 8))

    # One colour per county and one marker per crime type, both cycled.
    # Integer inputs index the entries of the Dark2 colormap directly, so the
    # palette no longer depends on how many counties the dataset contains.
    county_colors = plt.cm.Dark2(np.arange(plt.cm.Dark2.N))
    crime_markers = ['o', 's', '^', 'D', 'v', '<', '>', 'p', '*', 'h']
    per_thousand = 1000  # rates are stored per person; plotted per 1,000 people

    # Handles collected here are the only entries shown in the legend.
    legend_elements = []

    # Restrict the data to the selected counties and crime types.
    filtered_data = df[df["County"].isin(unique_counties) & df["Crime Type"].isin(unique_crime_types)]

    if single_county or single_crime_type:
        # One line per (county, crime type) pair. With exactly one pair the
        # line keeps matplotlib's default colour; otherwise it is coloured by
        # county. The marker always follows the crime type's position.
        color_by_county = not (single_county and single_crime_type)
        for i, county in enumerate(unique_counties):
            for j, crime_type in enumerate(unique_crime_types):
                pair_rows = filtered_data[
                    (filtered_data["County"] == county)
                    & (filtered_data["Crime Type"] == crime_type)
                ]
                line_style = {
                    "marker": crime_markers[j % len(crime_markers)],
                    "linewidth": 2,
                }
                if color_by_county:
                    line_style["color"] = county_colors[i % len(county_colors)]
                line, = plt.plot(pair_rows["Year"], per_thousand * pair_rows["Rate"],
                                 label=f"{county}: {crime_type}", **line_style)
                legend_elements.append(line)
    else:
        # Several counties and several crime types: one aggregated line per
        # county, combining its selected crime types year by year.
        for i, county in enumerate(unique_counties):
            county_data = filtered_data[filtered_data["County"] == county]
            agg_by_year = county_data.groupby("Year")["Rate"].agg(aggregate_method).reset_index()
            line, = plt.plot(agg_by_year["Year"], per_thousand * agg_by_year["Rate"],
                             marker='o', linewidth=2,
                             color=county_colors[i % len(county_colors)],
                             label=f"{county} ({len(unique_crime_types)} crime types)")
            legend_elements.append(line)

    # With more than one county or crime type, also draw the aggregate over
    # every selected row, emphasised and kept on top of the other lines.
    if not (single_county and single_crime_type):
        agg_by_year = filtered_data.groupby("Year")["Rate"].agg(aggregate_method).reset_index()
        line, = plt.plot(agg_by_year["Year"], per_thousand * agg_by_year["Rate"],
                         marker='*', linewidth=3, linestyle='--',
                         color='red', markersize=10, zorder=10,
                         label=f"Overall {aggregate_method.capitalize()} ({len(unique_counties)} counties, {len(unique_crime_types)} crime types)")
        legend_elements.append(line)

    # Selection-dependent title; the year span comes from the data rather
    # than being hard-coded.
    year_span = f"({min(years)}-{max(years)})"
    if single_county and single_crime_type:
        title = f"{unique_crime_types[0]} Rate in {unique_counties[0]} {year_span}"
    elif single_county:
        title = f"Selected Crime Rates in {unique_counties[0]} {year_span}"
    elif single_crime_type:
        title = f"{unique_crime_types[0]} Rate in Selected Counties {year_span}"
    else:
        title = f"{aggregate_method.capitalize()} Crime Rates for Selected Counties and Crime Types {year_span}"

    plt.title(title, fontsize=16)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Crime Rate (per 1,000 people)', fontsize=12)

    # Vertical reference lines. Their handles are added to the legend list;
    # because the legend is built from explicit handles, their labels would
    # otherwise never be shown.
    legend_elements.extend([
        plt.axvline(x=1990, linestyle="--", color="blue", alpha=0.5, label="Start of 90s"),
        plt.axvline(x=2000, linestyle="--", color="blue", alpha=0.5, label="End of 90s"),
        plt.axvline(x=2020, linestyle="--", color="red", alpha=0.5, label="COVID-19 Arrives in US"),
    ])

    # Tick every other year to avoid crowding the x-axis.
    plt.xticks(years[::2])
    plt.grid(True, linestyle='--', alpha=0.7)

    # Show y tick labels with two decimals.
    plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:.2f}'))

    # Echo the effective (de-duplicated) selection alongside the chart.
    print(f"Selected counties: {unique_counties}, Count: {len(unique_counties)}")
    print(f"Selected crime types: {unique_crime_types}, Count: {len(unique_crime_types)}")

    plt.legend(handles=legend_elements, loc="best")
    plt.tight_layout()
    plt.show()

In [6]:
# Static usage hints shown together with the controls.
help_text = HTML(
    value="""<p style="font-style:italic; color:#555;">
    Selection tips:
    <ul style="margin-top:0">
      <li>Hold Ctrl (or Cmd) while clicking to select multiple non-adjacent items</li>
      <li>Click one item, then Shift+click another to select a range</li>
      <li>To unselect an item, hold Ctrl (or Cmd) and click on the already selected item</li>
      <li>At least one county and one crime type must be selected</li>
    </ul>
    </p>"""
)


def _multi_select(options, description):
    """Build a 300x120px multi-select list with its first option pre-selected."""
    return widgets.SelectMultiple(
        description=description,
        options=options,
        value=[options[0]],
        # 'initial' lets the description take its natural width.
        style={'description_width': 'initial'},
        layout=Layout(width='300px', height='120px'),
    )


# Both list boxes share the same look; only the options and caption differ.
counties_select = _multi_select(counties, 'Counties:')
crime_types_select = _multi_select(crime_types, 'Crime Types:')

# Aggregation applied when several counties and/or crime types are combined.
aggregate_dropdown = widgets.Dropdown(
    description='Aggregate Method:',
    options=['mean', 'sum', 'median', 'min', 'max'],
    value='mean',
    style={'description_width': 'initial'},
)

VBox(children=(HBox(children=(SelectMultiple(description='Counties:', index=(0,), layout=Layout(height='120px'…

In [7]:
# Dedicated output area that captures everything the plotting call emits.
output = widgets.Output()


def update_plot(*_changes):
    """Redraw the chart from the current widget selections.

    Any positional arguments are accepted and ignored, so the function can be
    registered as a widget observer and also be called directly with no
    arguments.
    """
    with output:
        # wait=True postpones the clearing until new output is available.
        output.clear_output(wait=True)
        current_counties = counties_select.value
        current_crime_types = crime_types_select.value
        current_method = aggregate_dropdown.value
        plot_data(current_counties, current_crime_types, current_method)

# Redraw whenever the value of any control changes.
for control in (counties_select, crime_types_select, aggregate_dropdown):
    control.observe(update_plot, names='value')

# Layout: the two list boxes side by side, then the aggregate dropdown,
# the usage hints and finally the chart area.
selectors_row = widgets.HBox([counties_select, crime_types_select])
dashboard = widgets.VBox([selectors_row, aggregate_dropdown, help_text, output])
display(dashboard)

# Draw once for the default selections so the chart area is not empty.
update_plot()

VBox(children=(HBox(children=(SelectMultiple(description='Counties:', index=(0,), layout=Layout(height='120px'…