In [None]:
# In this project, I examine how crime density varies across New York City’s five boroughs by combining two NYC Open Data datasets:
#	1.	NYPD Complaint Data (Current – Year to Date)
# Contains every police complaint record filed in NYC in 2025.
#	2.	NYC Population by Borough, 1950–2040
# Provides borough-level population estimates for selected years.

# The goal is to compute crime rate per 100,000 residents and visualize how population size relates to crime density.Hypothesis

# Although boroughs with larger populations (Brooklyn, Queens) are expected to have higher total crime counts, 
# the Bronx may have the highest crime rate per capita.



In [1]:
import pandas as pd
import plotly.express as px

In [2]:
crime = pd.read_csv("NYPD_Complaint_Data_Current_(Year_To_Date)_20251119.csv")
crime.head()

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,New Georeferenced Column
0,299744592,121,STATEN ISLAND,01/20/2025,18:30:00,01/20/2025,18:35:00,COMPLETED,(null),,...,,UNKNOWN,UNKNOWN,D,938881,167697,40.626859,-74.16344,"(40.626859, -74.16344)",POINT (-74.16344 40.626859)
1,306868358,9,MANHATTAN,05/22/2025,14:30:00,05/22/2025,14:45:00,COMPLETED,(null),,...,,UNKNOWN,UNKNOWN,M,0,0,0.0,0.0,"(0.0, 0.0)",POINT (0 0)
2,303250435,13,MANHATTAN,03/20/2025,00:30:00,,(null),COMPLETED,(null),,...,,25-44,WHITE,M,988886,207857,40.737203,-73.983273,"(40.7372030985741, -73.9832725981497)",POINT (-73.9832725981497 40.7372030985741)
3,307271594,47,BRONX,03/05/2025,17:00:00,,(null),COMPLETED,(null),,...,,25-44,BLACK,F,1026480,262584,40.887314,-73.847272,"(40.8873136344706, -73.8472717577564)",POINT (-73.8472717577564 40.8873136344706)
4,309137838,14,MANHATTAN,07/05/2025,18:00:00,07/05/2025,18:31:00,COMPLETED,(null),,...,,18-24,BLACK,F,0,0,0.0,0.0,"(0.0, 0.0)",POINT (0 0)


In [3]:
crime_clean = crime[["KY_CD", "LAW_CAT_CD", "BORO_NM", "CMPLNT_FR_DT"]]
crime_clean.head()

Unnamed: 0,KY_CD,LAW_CAT_CD,BORO_NM,CMPLNT_FR_DT
0,109,FELONY,STATEN ISLAND,01/20/2025
1,344,MISDEMEANOR,MANHATTAN,05/22/2025
2,104,FELONY,MANHATTAN,03/20/2025
3,104,FELONY,BRONX,03/05/2025
4,344,MISDEMEANOR,MANHATTAN,07/05/2025


In [4]:
crime_clean = crime_clean[crime_clean["BORO_NM"].notna()]

In [5]:
crime_by_borough = (
    crime_clean.groupby("BORO_NM")
    .size()
    .reset_index(name="crime_count")
)

crime_by_borough

Unnamed: 0,BORO_NM,crime_count
0,(null),969
1,BRONX,99029
2,BROOKLYN,122257
3,MANHATTAN,103595
4,QUEENS,94000
5,STATEN ISLAND,18706


In [6]:
pop = pd.read_csv("New_York_City_Population_by_Borough,_1950_-_2040_20251119.csv")
pop.head()

Unnamed: 0,Age Group,Borough,1950,1950 - Boro share of NYC total,1960,1960 - Boro share of NYC total,1970,1970 - Boro share of NYC total,1980,1980 - Boro share of NYC total,...,2000,2000 - Boro share of NYC total,2010,2010 - Boro share of NYC total,2020,2020 - Boro share of NYC total,2030,2030 - Boro share of NYC total,2040,2040 - Boro share of NYC total
0,Total Population,NYC Total,7891957,100%,7781984,100%,7894862,100%,7071639,100%,...,8008278,100%,8242624,100%,8550971,100%,8821027,100%,9025145,100%
1,Total Population,Bronx,1451277,18.39%,1424815,18.31%,1471701,18.64%,1168972,16.53%,...,1332650,16.64%,1385108,16.8%,1446788,16.92%,1518998,17.22%,1579245,17.5%
2,Total Population,Brooklyn,2738175,34.7%,2627319,33.76%,2602012,32.96%,2230936,31.55%,...,2465326,30.78%,2552911,30.97%,2648452,30.97%,2754009,31.22%,2840525,31.47%
3,Total Population,Manhattan,1960101,24.84%,1698281,21.82%,1539233,19.5%,1428285,20.2%,...,1537195,19.2%,1585873,19.24%,1638281,19.16%,1676720,19.01%,1691617,18.74%
4,Total Population,Queens,1550849,19.65%,1809578,23.25%,1986473,25.16%,1891325,26.75%,...,2229379,27.84%,2250002,27.3%,2330295,27.25%,2373551,26.91%,2412649,26.73%


In [11]:
pop_clean = pop[["Borough", "2020"]].rename(columns={"2020": "Population"})
pop_clean

Unnamed: 0,Borough,Population
0,NYC Total,8550971
1,Bronx,1446788
2,Brooklyn,2648452
3,Manhattan,1638281
4,Queens,2330295
5,Staten Island,487155


In [17]:
# Clean borough names in both datasets
crime_by_borough["BORO_NM"] = crime_by_borough["BORO_NM"].str.strip()
pop_clean["Borough"] = pop_clean["Borough"].str.strip()

# Merge again
merged = pd.merge(
    crime_by_borough,
    pop_clean,
    left_on="BORO_NM",
    right_on="Borough",
    how="inner"
)

merged

Unnamed: 0,BORO_NM,crime_count,Borough,Population
0,Bronx,99029,Bronx,1446788
1,Brooklyn,122257,Brooklyn,2648452
2,Manhattan,103595,Manhattan,1638281
3,Queens,94000,Queens,2330295
4,Staten Island,18706,Staten Island,487155


In [18]:
merged["Population"] = merged["Population"].str.replace(",", "").astype(int)
merged

Unnamed: 0,BORO_NM,crime_count,Borough,Population
0,Bronx,99029,Bronx,1446788
1,Brooklyn,122257,Brooklyn,2648452
2,Manhattan,103595,Manhattan,1638281
3,Queens,94000,Queens,2330295
4,Staten Island,18706,Staten Island,487155


In [19]:
merged["crime_rate_per_100k"] = merged["crime_count"] / merged["Population"] * 100000
merged

Unnamed: 0,BORO_NM,crime_count,Borough,Population,crime_rate_per_100k
0,Bronx,99029,Bronx,1446788,6844.748505
1,Brooklyn,122257,Brooklyn,2648452,4616.168237
2,Manhattan,103595,Manhattan,1638281,6323.396292
3,Queens,94000,Queens,2330295,4033.824044
4,Staten Island,18706,Staten Island,487155,3839.845634


In [20]:
import plotly.express as px

fig = px.bar(
    merged,
    x="BORO_NM",
    y="crime_rate_per_100k",
    color="Population",
    title="NYC Crime Rate per 100,000 Residents by Borough (2025)",
    labels={
        "BORO_NM": "Borough",
        "crime_rate_per_100k": "Crime Rate per 100,000 Residents"
    },
    height=500
)
fig.show()

In [21]:
fig = px.bar(
    merged,
    x="BORO_NM",
    y="crime_count",
    title="Total Crime Incidents by Borough (2025)",
    labels={"BORO_NM": "Borough", "crime_count": "Total Crime Incidents"}
)
fig.show()

In [23]:
ranked = merged.sort_values("crime_rate_per_100k", ascending=False)

fig = px.bar(
    ranked,
    x="BORO_NM",
    y="crime_rate_per_100k",
    title="Ranked Crime Rate per 100k (Highest → Lowest)",
    labels={"BORO_NM": "Borough", "crime_rate_per_100k": "Crime Rate per 100k"}
)
fig.show()

In [25]:
import plotly.graph_objects as go

fig = go.Figure()

# Crime count (left y-axis)
fig.add_trace(go.Bar(
    x=merged["BORO_NM"],
    y=merged["crime_count"],
    name="Crime Count",
    marker_color="blue",
    yaxis="y1"
))

# Population (right y-axis)
fig.add_trace(go.Bar(
    x=merged["BORO_NM"],
    y=merged["Population"] / 1_000_000,
    name="Population (Millions)",
    marker_color="red",
    yaxis="y2"
))

fig.update_layout(
    title="Crime Count vs Population (Two Datasets in One Visualization)",
    xaxis=dict(title="Borough"),
    yaxis=dict(title="Crime Count", side="left", range=[0, max(merged["crime_count"]) * 1.2]),
    yaxis2=dict(
        title="Population (Millions)",
        overlaying="y",
        side="right",
        range=[0, (merged["Population"] / 1_000_000).max() * 1.5]
    ),
    barmode="group",
    height=500
)

fig.show()

In [26]:
import pandas as pd
import plotly.express as px

crime_points = merged[["BORO_NM", "crime_count"]].rename(columns={"crime_count": "value"})
crime_points["type"] = "Crime Count"

pop_points = merged[["BORO_NM", "Population"]].rename(columns={"Population": "value"})
pop_points["type"] = "Population"

combined = pd.concat([crime_points, pop_points])

fig = px.scatter(
    combined,
    x="BORO_NM",
    y="value",
    color="type",
    title="Crime Count and Population as Separate Points (Two Datasets in One Chart)",
    labels={"BORO_NM": "Borough", "value": "Value"},
    height=500
)

fig.show()

In [None]:
#takeaways
#After putting the two datasets together, a few things become much clearer:
# 1. Population shapes the overall crime picture, but it doesn’t tell the whole story.
# Boroughs with more people — especially Brooklyn and Queens — naturally show higher numbers of crime incidents. When you look at the bars side by side, it’s almost a one-to-one pattern. This suggests that a good portion of NYC’s crime variation is simply driven by how many people live (and interact) in each borough.
# 2. Manhattan doesn’t behave like a typical residential borough.
# Even though Manhattan’s population is nowhere near Brooklyn’s or Queens’, its crime count is surprisingly high. This is a good reminder that Manhattan’s “population” in daily life is way bigger than its resident population — commuters, tourists, and businesses change the dynamic. It’s almost like Manhattan is playing by its own rules.

#3. The Bronx stands out once you adjust for population size.
# When shifting from total crime to crime per 100,000 residents, the Bronx moves to the top. What looks “normal” in absolute numbers suddenly becomes disproportionate once we consider population. This suggests that the Bronx faces structural challenges that can’t be explained by population size alone.

# 4. Staten Island is consistently an outlier on both metrics.
# Small population, low crime, and the lowest crime rate. Nothing surprising here, but it reinforces that Staten Island behaves differently from the rest of NYC — more suburban, less dense, and far fewer opportunities for crime to occur.
#Overall:
# Looking at both datasets in the same visualization helps reveal something simple but important:
# crime totals tend to follow population, but crime risk (per capita) does not.
# The Bronx and Manhattan illustrate this gap especially well, and that’s where most of the policy questions — or real safety concerns — start to emerge.