In [27]:
# imports and stuff
import numpy as np
import pandas as pd

In [28]:
# Load the raw datasets every later cell works from.
# All four files live in one folder, so the folder is named once here; change
# RAW_DATA_DIR to run the notebook against a different copy of the data.
RAW_DATA_DIR = '~/Downloads/opportunity mapper/raw datasets'

base_rent = pd.read_excel(f'{RAW_DATA_DIR}/FY2026-Rent.xlsx')
base_income = pd.read_csv(f'{RAW_DATA_DIR}/MedianIncomeData/MedianIncomeData.csv')
base_crime = pd.read_csv(f'{RAW_DATA_DIR}/ba_crime_combined.csv')
base_pop = pd.read_csv(f'{RAW_DATA_DIR}/CA_county_pop.csv')

In [29]:
# Give the raw HUD headers readable names. RENT_STUDIO is the 0-bedroom rent and
# RENT_<n>BD is the n-bedroom rent. The _90 and _110 suffixes are the "90% Payment
# Standard" and "110% Payment Standard" columns for the same unit size, i.e. the
# lower and upper end of the range around the base rent.

identifier_columns = {
    "ZIP\nCode": "ZIP_CODE",
    "HUD Area Code": "HUD_AREA_CODE",
    "HUD Fair Market Rent Area Name": "CITY_STATE",
}

rent_columns = {
    # studio (0 bedrooms)
    "SAFMR\n0BR": "RENT_STUDIO",
    "SAFMR\n0BR - 90%\nPayment\nStandard": "RENT_STUDIO_90",
    "SAFMR\n0BR - 110%\nPayment\nStandard": "RENT_STUDIO_110",
    # 1 bedroom
    "SAFMR\n1BR": "RENT_1BD",
    "SAFMR\n1BR - 90%\nPayment\nStandard": "RENT_1BD_90",
    "SAFMR\n1BR - 110%\nPayment\nStandard": "RENT_1BD_110",
    # 2 bedrooms
    "SAFMR\n2BR": "RENT_2BD",
    "SAFMR\n2BR - 90%\nPayment\nStandard": "RENT_2BD_90",
    "SAFMR\n2BR - 110%\nPayment\nStandard": "RENT_2BD_110",
    # 3 bedrooms
    "SAFMR\n3BR": "RENT_3BD",
    "SAFMR\n3BR - 90%\nPayment\nStandard": "RENT_3BD_90",
    "SAFMR\n3BR - 110%\nPayment\nStandard": "RENT_3BD_110",
    # 4 bedrooms
    "SAFMR\n4BR": "RENT_4BD",
    "SAFMR\n4BR - 90%\nPayment\nStandard": "RENT_4BD_90",
    "SAFMR\n4BR - 110%\nPayment\nStandard": "RENT_4BD_110",
}

rent = base_rent.rename(columns={**identifier_columns, **rent_columns})

In [30]:
# Keep only California rows: the area name contains the literal text ", CA".
# regex=False matches the text literally, and na=False treats a missing area name
# as "not California" instead of producing a NaN mask that cannot be used to filter.
rent = rent[rent['CITY_STATE'].str.contains(', CA', regex=False, na=False)]

In [31]:
# Narrow the table to the Bay Area by exact match on the HUD area name.
bay_area_fmr_areas = [
    "Oakland-Fremont, CA HUD Metro FMR Area",
    "San Francisco, CA HUD Metro FMR Area",
    "San Jose-Sunnyvale-Santa Clara, CA HUD Metro FMR Area",
    "Napa, CA MSA",
    "Santa Rosa-Petaluma, CA MSA",
    "Vallejo, CA MSA",
]
in_bay_area = rent['CITY_STATE'].isin(bay_area_fmr_areas)
rent = rent[in_bay_area]

In [32]:
# 'rent' now holds Bay Area rents for each unit size.
# Store ZIP_CODE as str instead of int so it can be joined with other tables whose
# ZIP codes are text. assign() builds a new frame rather than writing into the
# filtered slice, which avoids pandas' SettingWithCopyWarning.
rent = rent.assign(ZIP_CODE=rent["ZIP_CODE"].astype(str))
# For a specific example, here is the rent for ZIP codes in the San Francisco area:
rent[rent["CITY_STATE"].str.contains("San Francisco")].head()

Unnamed: 0,ZIP_CODE,HUD_AREA_CODE,CITY_STATE,RENT_STUDIO,RENT_STUDIO_90,RENT_STUDIO_110,RENT_1BD,RENT_1BD_90,RENT_1BD_110,RENT_2BD,RENT_2BD_90,RENT_2BD_110,RENT_3BD,RENT_3BD_90,RENT_3BD_110,RENT_4BD,RENT_4BD_90,RENT_4BD_110
22533,94002,METRO41860MM7360,"San Francisco, CA HUD Metro FMR Area",2720,2448,2992,3260,2934,3586,3950,3555,4345,5050,4545,5555,5230,4707,5753
22534,94005,METRO41860MM7360,"San Francisco, CA HUD Metro FMR Area",2390,2151,2629,2870,2583,3157,3470,3123,3817,4430,3987,4873,4590,4131,5049
22535,94010,METRO41860MM7360,"San Francisco, CA HUD Metro FMR Area",2520,2268,2772,3020,2718,3322,3660,3294,4026,4680,4212,5148,4850,4365,5335
22536,94011,METRO41860MM7360,"San Francisco, CA HUD Metro FMR Area",2670,2403,2937,3200,2880,3520,3870,3483,4257,4940,4446,5434,5120,4608,5632
22537,94014,METRO41860MM7360,"San Francisco, CA HUD Metro FMR Area",2050,1845,2255,2510,2259,2761,2990,2691,3289,3760,3384,4136,3960,3564,4356


In [33]:
# NOW LOOKING AT BASE MEDIAN INCOME

In [34]:
# Build the median-income table ('income') from the raw Census frame ('base_income').
# Steps: pull the ZIP code out of NAME (its second whitespace-separated token), give the
# two estimate columns readable names, then keep only the columns we need.
# The chain starts with assign(), which returns a new frame, so base_income itself is
# left untouched (a plain `income = base_income` would only alias the raw frame).
income = (
    base_income
    .assign(ZIP_CODE=base_income['NAME'].str.split().str[1])
    .rename(columns={
        'S1901_C01_001E': "TOTAL_HOUSEHOLDS_EST",
        'S1901_C01_012E': "MEDIAN_INCOME_HOUSEHOLD_EST"
    })
    # Keeps index labels 2 and up. NOTE(review): presumably the first two rows of the
    # export are not ZIP-level data rows -- confirm against the raw CSV.
    .loc[2:, ["GEO_ID", "ZIP_CODE", "MEDIAN_INCOME_HOUSEHOLD_EST", "TOTAL_HOUSEHOLDS_EST"]]
)

In [35]:
# Combine rent and income per ZIP code.
#  * The inner merge keeps only ZIP codes present in BOTH tables, i.e. Bay Area ZIP
#    codes (from 'rent') that also have a row in the income data. The intent is to
#    exclude non-residential ZIP codes such as SFO Airport (94128) or PO-box ZIPs.
#  * A hyphen ("-") in MEDIAN_INCOME_HOUSEHOLD_EST marks a missing estimate; those
#    rows are dropped after the merge.
#  * Only rows with TOTAL_HOUSEHOLDS_EST strictly greater than 200 are kept. Author's
#    note: small household counts are a red flag, some affected areas are mostly
#    AirBNB/hotel land (e.g. 94923), and this filter removes only 11 ZIP codes.
# The household filter runs after the hyphen filter, so the int conversion only sees
# rows that already passed the first check.

ba_data = (
    rent.merge(income, on="ZIP_CODE", how="inner")
    .loc[lambda merged: merged["MEDIAN_INCOME_HOUSEHOLD_EST"] != "-"]
    .loc[lambda with_income: with_income["TOTAL_HOUSEHOLDS_EST"].astype(int) > 200]
)

In [36]:
# NOW LOOKING AT CRIME

In [37]:
# County population is the denominator for the crime rates computed later.
# Keep only the nine Bay Area counties and only the rows whose YEAR code is 1 or 5
# (approximately 2020 and 2024), so rates can be compared between the two years.
bay_area_counties = [
    "Alameda County",
    "Contra Costa County",
    "Marin County",
    "Napa County",
    "San Francisco County",
    "San Mateo County",
    "Santa Clara County",
    "Solano County",
    "Sonoma County",
]
is_bay_area_county = base_pop["CTYNAME"].isin(bay_area_counties)
is_2020_or_2024 = base_pop["YEAR"].isin([1, 5])  # YEAR codes for 2020 and 2024

# Only the columns we care about are carried forward.
pop = base_pop.loc[
    is_bay_area_county & is_2020_or_2024,
    ["CTYNAME", "YEAR", "POPESTIMATE", "MEDIAN_AGE_TOT"],
]
pop

Unnamed: 0,CTYNAME,YEAR,POPESTIMATE,MEDIAN_AGE_TOT
0,Alameda County,1,1682296,38.1
4,Alameda County,5,1638142,39.3
36,Contra Costa County,1,1165983,40.0
40,Contra Costa County,5,1162179,40.8
120,Marin County,1,262325,47.0
124,Marin County,5,255706,47.9
162,Napa County,1,138014,42.3
166,Napa County,5,133444,43.3
222,San Francisco County,1,878392,38.6
226,San Francisco County,5,819151,40.3


In [38]:
# Split the crime table into violent and property crime. The county names are first
# rewritten with their proper spacing so a later join with the population table
# (keyed on county name) lines up.

def _crime_counts_for(crime, category):
    """Return the rows of `crime` in `category` with integer 2020/2024 counts.

    The "2020" and "2024" columns hold counts as text with thousands separators
    (e.g. "9,645"); the commas are removed and the values converted to int.
    The work is done on an explicit copy, so `crime` is not modified and pandas
    does not emit a SettingWithCopyWarning.
    """
    counts = crime[crime['CRIME_CATEGORY'] == category].copy()
    for year in ("2020", "2024"):
        counts[year] = counts[year].str.replace(",", "").astype(int)
    return counts


base_crime = base_crime.replace({
    "Santaclara": "Santa Clara",
    "Contracosta": "Contra Costa",
    "Sanfrancisco": "San Francisco",
    "Sanmateo": "San Mateo",
})
violent_crime = _crime_counts_for(base_crime, "Violent Crimes")
property_crime = _crime_counts_for(base_crime, "Property Crimes")

In [39]:
# Clean the population table up a bit before joining it with the crime tables: name
# the county column COUNTY and drop the " County" suffix ("Alameda County" -> "Alameda").
# Renaming first and rebuilding the column with assign() keeps this cell safe to re-run
# (a second run no longer looks for the already-renamed CTYNAME column) and avoids
# assigning into a filtered slice of base_pop.
pop = pop.rename(columns={"CTYNAME": "COUNTY"})
pop = pop.assign(COUNTY=pop["COUNTY"].str.split(" County").str[0])

In [40]:
# Make the ambiguous year column explicit: call it POPYEAR and translate the
# codes 1 and 5 into the calendar years 2020 and 2024.
year_code_to_year = {1: 2020, 5: 2024}
pop = (
    pop.rename(columns={"YEAR": "POPYEAR"})
    .assign(POPYEAR=lambda frame: frame["POPYEAR"].replace(year_code_to_year))
)

In [41]:
# Merge population with each crime table, then put the 2020 and 2024 populations
# side by side so every county ends up on a single row.

def _with_population_by_year(population, crime_counts):
    """Join crime counts with population and add POP2020 / POP2024 columns.

    Steps:
      1. Merge `population` (one row per county and POPYEAR) with `crime_counts`
         on COUNTY. Each county therefore appears once per population year.
      2. Pivot POPESTIMATE so the POPYEAR values become columns, renamed to
         POP2020 and POP2024. pivot_table aggregates with the mean by default,
         which is why these columns come out as floats.
      3. Merge the wide population columns back in and keep the first row per
         (COUNTY, CRIME_CATEGORY). The POPYEAR, POPESTIMATE and MEDIAN_AGE_TOT
         columns on that row describe a single year only; use POP2020 / POP2024
         for per-year populations.

    Returns a tuple (combined_frame, wide_population_frame).
    """
    merged = population.merge(crime_counts, on="COUNTY")
    pop_by_year = merged.pivot_table(
        index='COUNTY',
        columns='POPYEAR',
        values='POPESTIMATE'
    ).rename(columns={2020: 'POP2020', 2024: 'POP2024'})
    combined = merged.merge(pop_by_year, on='COUNTY') \
                     .drop_duplicates(subset=['COUNTY', 'CRIME_CATEGORY'])
    return combined, pop_by_year


property_df, pop_df = _with_population_by_year(pop, property_crime)
violent_df, vi_df = _with_population_by_year(pop, violent_crime)
violent_df

Unnamed: 0,COUNTY,POPYEAR,POPESTIMATE,MEDIAN_AGE_TOT,CRIME_CATEGORY,2020,2024,POP2020,POP2024
0,Alameda,2020,1682296,38.1,Violent Crimes,9645,13075,1682296.0,1638142.0
2,Contra Costa,2020,1165983,40.0,Violent Crimes,3870,4437,1165983.0,1162179.0
4,Marin,2020,262325,47.0,Violent Crimes,623,625,262325.0,255706.0
6,Napa,2020,138014,42.3,Violent Crimes,539,418,138014.0,133444.0
8,San Francisco,2020,878392,38.6,Violent Crimes,4922,4978,878392.0,819151.0
10,San Mateo,2020,764659,39.9,Violent Crimes,1754,1927,764659.0,734267.0
12,Santa Clara,2020,1936278,37.4,Violent Crimes,6141,8139,1936278.0,1897137.0
14,Solano,2020,453551,38.5,Violent Crimes,2350,2152,453551.0,451060.0
16,Sonoma,2020,488862,42.4,Violent Crimes,2212,1350,488862.0,483366.0


In [42]:
# violent_df and property_df now carry crime counts (columns "2020" and "2024") next
# to the matching populations (POP2020 and POP2024). The count columns keep their bare
# year names because only the derived rates are used from here on.
# For each table we add:
#   * the per-capita crime rate for 2020 and for 2024 (count / population), and
#   * a "CHANGE" column holding the 2024 rate as a percentage of the 2020 rate,
#     so 100 means unchanged, above 100 an increase and below 100 a decrease.
# CRIME_CATEGORY is also shortened to its first word ("Violent" / "Property").

rate_columns = (
    (violent_df, "2020_CRIMERATE_VIOL", "2024_CRIMERATE_VIOL", "CHANGE_IN_CRIME_VIOL%"),
    (property_df, "2020_CRIMERATE_PROP", "2024_CRIMERATE_PROP", "CHANGE_IN_CRIME_PROP%"),
)
for crime_frame, rate_2020_col, rate_2024_col, change_col in rate_columns:
    crime_frame[rate_2020_col] = crime_frame["2020"] / crime_frame["POP2020"]
    crime_frame[rate_2024_col] = crime_frame["2024"] / crime_frame["POP2024"]
    crime_frame[change_col] = crime_frame[rate_2024_col] / crime_frame[rate_2020_col] * 100
    crime_frame["CRIME_CATEGORY"] = crime_frame["CRIME_CATEGORY"].str.split().str[0]

In [43]:
# Build the final crime table: one row per county with the violent and property
# crime rates for 2020 and 2024 and the change between them.

# Keep only the county key and the derived rate columns from each table.
violent_columns = ['COUNTY', '2020_CRIMERATE_VIOL', '2024_CRIMERATE_VIOL', 'CHANGE_IN_CRIME_VIOL%']
property_columns = ['COUNTY', '2020_CRIMERATE_PROP', '2024_CRIMERATE_PROP', "CHANGE_IN_CRIME_PROP%"]
violent_clean = violent_df[violent_columns]
property_clean = property_df[property_columns]

# Both tables share the COUNTY column, so that is the join key.
crime_df = violent_clean.merge(property_clean, on='COUNTY')
crime_df

Unnamed: 0,COUNTY,2020_CRIMERATE_VIOL,2024_CRIMERATE_VIOL,CHANGE_IN_CRIME_VIOL%,2020_CRIMERATE_PROP,2024_CRIMERATE_PROP,CHANGE_IN_CRIME_PROP%
0,Alameda,0.005733,0.007982,139.216379,0.034651,0.043144,124.510628
1,Contra Costa,0.003319,0.003818,115.026435,0.020324,0.021601,106.279726
2,Marin,0.002375,0.002444,102.917857,0.020101,0.014544,72.35477
3,Napa,0.003905,0.003132,80.206877,0.016708,0.01241,74.272
4,San Francisco,0.005603,0.006077,108.452031,0.044858,0.040128,89.455703
5,San Mateo,0.002294,0.002624,114.41051,0.020406,0.016621,81.44793
6,Santa Clara,0.003172,0.00429,135.269837,0.022564,0.02355,104.368869
7,Solano,0.005181,0.004771,92.080192,0.023567,0.022392,95.011486
8,Sonoma,0.004525,0.002793,61.724677,0.013116,0.010903,83.124163


In [1]:
# NOW WE EXPORT!
# We can merge these cleaned dataframes in a separate jupyter file. As for our BART and CalTrain location info,
# that is recorded in latitude and longitude coordinates, not zip codes, so we can work with it in GeoPandas
# without necessarily needing to do any cleaning with it here.
#
# NOTE(review): both export calls below are commented out, so running this cell as-is
# writes no files. The NameError recorded under this cell belongs to a run with
# execution count 1, presumably a fresh kernel in which the cells defining ba_data had
# not been executed yet. To export, run every cell above first, then uncomment the two
# lines below. They write cl_rent_income.csv and cl_crime.csv without the index column.

#ba_data.to_csv('cl_rent_income.csv', index=False) 
#crime_df.to_csv('cl_crime.csv', index=False)

NameError: name 'ba_data' is not defined