In [6]:
# Imports
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler

In [8]:
# Reload the previously saved final_df from disk.
# NOTE(review): this path points into a personal Downloads folder, so the
# notebook only runs on a machine with the same layout - consider a
# project-relative path.
FINAL_DF_PATH = '~/Downloads/opportunity mapper/bay-area-opportunity-mapper/final_df1.csv'
final_df = pd.read_csv(FINAL_DF_PATH)

In [9]:
# Count missing values per column. Expected result: zero everywhere,
# apart from a single ZPOP entry that is not important for later steps.
nan_counts = final_df.isna().sum()
nan_counts

ZIP                                    0
Area__                                 0
Length__                               0
geometry                               0
CITY                                   0
RENT_STUDIO                            0
RENT_STUDIO_90                         0
RENT_STUDIO_110                        0
RENT_1BD                               0
RENT_1BD_90                            0
RENT_1BD_110                           0
RENT_2BD                               0
RENT_2BD_90                            0
RENT_2BD_110                           0
RENT_3BD                               0
RENT_3BD_90                            0
RENT_3BD_110                           0
RENT_4BD                               0
RENT_4BD_90                            0
RENT_4BD_110                           0
GEO_ID                                 0
MEDIAN_INCOME_HOUSEHOLD_EST            0
TOTAL_HOUSEHOLDS_EST                   0
BART_COUNT                             0
CalTrain_COUNT  

In [10]:
# Normalisation stage: put each metric on a 0-1 scale.
scaler = MinMaxScaler()

# Leave final_df untouched; every derived column goes onto a separate copy.
norm_df = final_df.copy()

# Base rent columns to score.
rent_cols = ['RENT_STUDIO', 'RENT_1BD', 'RENT_2BD', 'RENT_3BD', 'RENT_4BD']

# Min-max scale each rent column, then flip it (1 - x) so the cheapest row
# scores 1.0 and the most expensive scores 0.0. The result is stored on
# norm_df as norm_<lower-cased column name>.
for rent_col in rent_cols:
    rent_scaled = scaler.fit_transform(norm_df[[rent_col]])
    norm_df[f'norm_{rent_col.lower()}'] = 1 - rent_scaled

# Income is scaled but not inverted: the highest median income scores 1.0.
income_scaled = scaler.fit_transform(norm_df[['MEDIAN_INCOME_HOUSEHOLD_EST']])
norm_df['norm_income'] = income_scaled

# Transit gets a hand-picked step score instead of min-max scaling. The
# steps are admittedly arbitrary, but pure mathematical normalization is a
# poor fit for a variable whose maximum value is 4.
def score_transit(count):
    """Map a transit-station count to a fixed score between 0.0 and 1.0.

    3 or more -> 1.0, exactly 2 -> 0.8, exactly 1 -> 0.4, anything else -> 0.0.
    """
    if count >= 3:
        return 1.0
    # Remaining cases are exact matches; every other value falls back to 0.0.
    step_scores = {2: 0.8, 1: 0.4}
    return step_scores.get(count, 0.0)

# Step-score the transit counts with score_transit.
transit_counts = norm_df['TOTAL_TRANSIT']
norm_df['norm_transit'] = transit_counts.apply(score_transit)

# Crime metrics are min-max scaled and then inverted (1 - x), the same idea
# as rent: a low value is the good outcome and should score close to 1.0.
#   * 2024 rates, violent (VIOL) and property (PROP), each scaled on its own
#   * change-over-time columns, kept in case trends are needed later:
#     the biggest drop in crime scores 1.0, the biggest spike scores 0.0
inverted_crime_scores = {
    'norm_crime_rate_viol': '2024_CRIMERATE_VIOL',
    'norm_crime_rate_prop': '2024_CRIMERATE_PROP',
    'norm_crime_viol_trend': 'CHANGE_IN_CRIME_VIOL%',
    'norm_crime_prop_trend': 'CHANGE_IN_CRIME_PROP%',
}
for score_col, source_col in inverted_crime_scores.items():
    norm_df[score_col] = 1 - scaler.fit_transform(norm_df[[source_col]])

In [43]:
# Display columns, so the data reads nicely in the tooltips and popups later.
# rent_cols is rebuilt here to cover every rent column: each unit size
# followed by its _90 and _110 companion columns.
rent_cols = [
    f'RENT_{unit}{suffix}'
    for unit in ('STUDIO', '1BD', '2BD', '3BD', '4BD')
    for suffix in ('', '_90', '_110')
]
def display_rent_cols(df, rent_cols):
    """Add a DISPLAY_<col> dollar-string column for each name in rent_cols.

    Integer values are rendered with a leading '$' and thousands separators
    (2000 -> '$2,000'). df is modified in place and also returned.
    """
    def _as_dollars(amount):
        # The 'd' format code only accepts integers.
        return '${:,d}'.format(amount)

    for rent_col in rent_cols:
        display_name = f'DISPLAY_{rent_col}'
        df[display_name] = df[rent_col].map(_as_dollars)
    return df
# Attach a DISPLAY_<column> dollar-string column for every entry in rent_cols.
norm_df = display_rent_cols(norm_df, rent_cols)

def display_crime_trends(df, crime_cols, threshold=100):
    """Add a trend-arrow emoji column for each crime-change column.

    For every name in ``crime_cols`` a ``DISPLAY_<name>`` column is written
    onto ``df``: the chart-increasing emoji where the value is strictly
    greater than ``threshold``, the chart-decreasing emoji everywhere else.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``crime_cols``. It is modified
        in place.
    crime_cols : iterable of str
        Names of the change columns to label.
    threshold : number, default 100
        Values above this count as "crime is increasing".
        NOTE(review): presumably the change columns hold the later rate as a
        percentage of the earlier one, so 100 means "no change" - confirm.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    # Written as code-point escapes instead of literal glyphs: the literals
    # had been saved as mis-decoded bytes, which rendered as garbage text
    # rather than the intended emoji.
    rising = '\U0001F4C8'   # chart increasing
    falling = '\U0001F4C9'  # chart decreasing
    for col in crime_cols:
        # NaN compares False against the threshold, so missing values are
        # labelled with the decreasing arrow.
        df[f'DISPLAY_{col}'] = np.where(df[col] > threshold, rising, falling)
    return df
# Trend arrows for the two change-in-crime columns.
trend_cols = ['CHANGE_IN_CRIME_VIOL%', 'CHANGE_IN_CRIME_PROP%']
norm_df = display_crime_trends(norm_df, trend_cols)

# Human-readable 2024 crime rates: the stored rate is multiplied by 100000,
# rounded to a whole number and followed by an explanatory sentence.
rate_sentences = (
    ('2024_CRIMERATE_VIOL', ' violent crimes reported per 100,000 people annually.'),
    ('2024_CRIMERATE_PROP', ' property crimes reported per 100,000 people annually.'),
)
for rate_col, sentence in rate_sentences:
    per_100k = (norm_df[rate_col] * 100000).round(0).astype(int)
    norm_df[f'DISPLAY_{rate_col}'] = per_100k.astype(str) + sentence
norm_df

Unnamed: 0,ZIP,Area__,Length__,geometry,CITY,RENT_STUDIO,RENT_STUDIO_90,RENT_STUDIO_110,RENT_1BD,RENT_1BD_90,...,DISPLAY_RENT_3BD,DISPLAY_RENT_3BD_90,DISPLAY_RENT_3BD_110,DISPLAY_RENT_4BD,DISPLAY_RENT_4BD_90,DISPLAY_RENT_4BD_110,DISPLAY_CHANGE_IN_CRIME_VIOL%,DISPLAY_CHANGE_IN_CRIME_PROP%,DISPLAY_2024_CRIMERATE_VIOL,DISPLAY_2024_CRIMERATE_PROP
0,94558,1.231326e+10,995176.225313,POLYGON ((-122.10329200180091 38.5132829986466...,Napa,2000,1800,2200,2210,1989,...,"$3,690","$3,321","$4,059","$4,320","$3,888","$4,752",ðŸ“‰,ðŸ“‰,"313 violent crimes reported per 100,000 people...","1241 property crimes reported per 100,000 peop..."
1,94558,1.231326e+10,995176.225313,POLYGON ((-122.10329200180091 38.5132829986466...,Vallejo,2000,1800,2200,2210,1989,...,"$3,690","$3,321","$4,059","$4,320","$3,888","$4,752",ðŸ“‰,ðŸ“‰,"313 violent crimes reported per 100,000 people...","1241 property crimes reported per 100,000 peop..."
2,95620,7.236950e+09,441860.201400,POLYGON ((-121.65335500334426 38.3133870006294...,Vallejo,1560,1404,1716,1710,1539,...,"$2,920","$2,628","$3,212","$3,300","$2,970","$3,630",ðŸ“‰,ðŸ“‰,"477 violent crimes reported per 100,000 people...","2239 property crimes reported per 100,000 peop..."
3,95476,3.001414e+09,311318.546326,POLYGON ((-122.40684300305698 38.1556819994163...,Napa,1970,1773,2167,2180,1962,...,"$3,930","$3,537","$4,323","$4,200","$3,780","$4,620",ðŸ“‰,ðŸ“‰,"279 violent crimes reported per 100,000 people...","1090 property crimes reported per 100,000 peop..."
4,95476,3.001414e+09,311318.546326,POLYGON ((-122.40684300305698 38.1556819994163...,Santa Rosa-Petaluma,1970,1773,2167,2180,1962,...,"$3,930","$3,537","$4,323","$4,200","$3,780","$4,620",ðŸ“‰,ðŸ“‰,"279 violent crimes reported per 100,000 people...","1090 property crimes reported per 100,000 peop..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,94022,4.211365e+08,137369.776481,POLYGON ((-122.09598200263689 37.3860539999608...,San Jose-Sunnyvale-Santa Clara,3160,2844,3476,3540,3186,...,"$5,510","$4,959","$6,061","$5,970","$5,373","$6,567",ðŸ“ˆ,ðŸ“ˆ,"429 violent crimes reported per 100,000 people...","2355 property crimes reported per 100,000 peop..."
180,94085,9.054792e+07,47415.860456,POLYGON ((-121.98688800346149 37.3886580004076...,San Jose-Sunnyvale-Santa Clara,3180,2862,3498,3610,3249,...,"$5,580","$5,022","$6,138","$6,070","$5,463","$6,677",ðŸ“ˆ,ðŸ“ˆ,"429 violent crimes reported per 100,000 people...","2355 property crimes reported per 100,000 peop..."
181,94086,1.289431e+08,65560.361543,POLYGON ((-122.05999500035823 37.3752980004996...,San Jose-Sunnyvale-Santa Clara,2910,2619,3201,3310,2979,...,"$5,110","$4,599","$5,621","$5,570","$5,013","$6,127",ðŸ“ˆ,ðŸ“ˆ,"429 violent crimes reported per 100,000 people...","2355 property crimes reported per 100,000 peop..."
182,94024,2.966973e+08,136286.642980,POLYGON ((-122.09598200263689 37.3860539999608...,San Jose-Sunnyvale-Santa Clara,3260,2934,3586,3710,3339,...,"$5,730","$5,157","$6,303","$6,240","$5,616","$6,864",ðŸ“ˆ,ðŸ“ˆ,"429 violent crimes reported per 100,000 people...","2355 property crimes reported per 100,000 peop..."


In [20]:
# List every column now on norm_df to confirm the normalised and display columns were added.
norm_df.columns

Index(['ZIP', 'Area__', 'Length__', 'geometry', 'CITY', 'RENT_STUDIO',
       'RENT_STUDIO_90', 'RENT_STUDIO_110', 'RENT_1BD', 'RENT_1BD_90',
       'RENT_1BD_110', 'RENT_2BD', 'RENT_2BD_90', 'RENT_2BD_110', 'RENT_3BD',
       'RENT_3BD_90', 'RENT_3BD_110', 'RENT_4BD', 'RENT_4BD_90',
       'RENT_4BD_110', 'GEO_ID', 'MEDIAN_INCOME_HOUSEHOLD_EST',
       'TOTAL_HOUSEHOLDS_EST', 'BART_COUNT', 'CalTrain_COUNT', 'TOTAL_TRANSIT',
       'ZPOP', 'COUNTY', '2020_CRIMERATE_VIOL', '2024_CRIMERATE_VIOL',
       'CHANGE_IN_CRIME_VIOL%', '2020_CRIMERATE_PROP', '2024_CRIMERATE_PROP',
       'CHANGE_IN_CRIME_PROP%', 'DISPLAY_MEDIAN_INCOME_HOUSEHOLD_EST',
       'norm_rent_studio', 'norm_rent_1bd', 'norm_rent_2bd', 'norm_rent_3bd',
       'norm_rent_4bd', 'norm_income', 'norm_transit', 'norm_crime_rate_viol',
       'norm_crime_rate_prop', 'norm_crime_viol_trend',
       'norm_crime_prop_trend', 'DISPLAY_RENT_STUDIO',
       'DISPLAY_RENT_STUDIO_90', 'DISPLAY_RENT_STUDIO_110', 'DISPLAY_RENT_1BD',
   

In [6]:
# Export: write norm_df to a CSV in the current working directory,
# leaving out the DataFrame index.
OUTPUT_CSV = 'final_df_with_norms.csv'
norm_df.to_csv(OUTPUT_CSV, index=False)