In [24]:
# imports
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler

In [25]:
# Reload the combined dataset into final_df.
# NOTE: 'final_df1.csv' is resolved relative to the current working directory --
# update the path below to wherever the file lives on your machine.
final_df = pd.read_csv(filepath_or_buffer='final_df1.csv')

In [26]:
# Sanity check: count the missing values in every column before normalizing.
# Original author's note: nothing is missing apart from one unimportant ZPOP entry.
final_df.isna().sum(axis=0)

ZIP                                    0
Area__                                 0
Length__                               0
geometry                               0
CITY                                   0
RENT_STUDIO                            0
RENT_STUDIO_90                         0
RENT_STUDIO_110                        0
RENT_1BD                               0
RENT_1BD_90                            0
RENT_1BD_110                           0
RENT_2BD                               0
RENT_2BD_90                            0
RENT_2BD_110                           0
RENT_3BD                               0
RENT_3BD_90                            0
RENT_3BD_110                           0
RENT_4BD                               0
RENT_4BD_90                            0
RENT_4BD_110                           0
GEO_ID                                 0
MEDIAN_INCOME_HOUSEHOLD_EST            0
TOTAL_HOUSEHOLDS_EST                   0
BART_COUNT                             0
CalTrain_COUNT  

In [27]:
# Normalization step: put the metrics on a common 0-1 scale.
# A single scaler object is reused; every fit_transform call refits it on the
# column it is given, so no state carries over between columns.
scaler = MinMaxScaler()

# All normalized columns are added to a copy, leaving final_df untouched.
norm_df = final_df.copy()

# Base rent columns (one per unit size) that each get a normalized score.
rent_cols = ['RENT_STUDIO', 'RENT_1BD', 'RENT_2BD', 'RENT_3BD', 'RENT_4BD']
for rent_col in rent_cols:
    # Min-max scale to 0-1, then invert so the cheapest rent scores 1.0 and
    # the most expensive scores 0.0. The result is stored in norm_df.
    rent_scaled = scaler.fit_transform(norm_df[[rent_col]])
    norm_df[f'norm_{rent_col.lower()}'] = 1 - rent_scaled

# Income is scaled without inversion: the highest median income scores 1.0.
norm_df['norm_income'] = scaler.fit_transform(norm_df[['MEDIAN_INCOME_HOUSEHOLD_EST']])

# Transit gets a hand-picked score instead of min-max normalization. The tiers
# are admittedly arbitrary, but (per the original author) they work better than
# pure mathematical normalization for a variable whose max value is 4.
def score_transit(count):
    """Convert a transit count into a fixed 0-1 score.

    Args:
        count: numeric transit count for one row.

    Returns:
        1.0 when count is 3 or more, 0.8 when it equals 2, 0.4 when it
        equals 1, and 0.0 for every other value.
    """
    if count >= 3:
        return 1.0
    # Exact-match tiers, checked from highest to lowest.
    for exact_count, tier_score in ((2, 0.8), (1, 0.4)):
        if count == exact_count:
            return tier_score
    return 0.0

# Tiered transit score for every row.
norm_df['norm_transit'] = norm_df['TOTAL_TRANSIT'].apply(score_transit)

# Crime scores: min-max scale each source column, then invert (1 - scaled) so
# that lower is better, the same idea as rent. Violent (VIOL) and property
# (PROP) crime are scored separately. The trend columns are scored the same way
# in case they are needed later: the biggest drop in crime scores 1.0 and the
# biggest spike scores 0.0.
crime_score_sources = {
    'norm_crime_rate_viol': '2024_CRIMERATE_VIOL',
    'norm_crime_rate_prop': '2024_CRIMERATE_PROP',
    'norm_crime_viol_trend': 'CHANGE_IN_CRIME_VIOL%',
    'norm_crime_prop_trend': 'CHANGE_IN_CRIME_PROP%',
}
for score_col, raw_col in crime_score_sources.items():
    norm_df[score_col] = 1 - scaler.fit_transform(norm_df[[raw_col]])

In [29]:
# Display columns so the data isn't ugly in the tooltips and popups going forward.
# Every unit size has three rent columns: the base value plus its _90 and _110 variants.
rent_cols = [
    f'{unit_col}{variant}'
    for unit_col in ('RENT_STUDIO', 'RENT_1BD', 'RENT_2BD', 'RENT_3BD', 'RENT_4BD')
    for variant in ('', '_90', '_110')
]
def display_rent_cols(df, rent_cols):
    """Add a dollar-formatted DISPLAY_<col> string column for each rent column.

    Args:
        df: DataFrame holding the rent columns; it is modified in place.
        rent_cols: names of the columns to format. Their values must be
            integers, because the 'd' format code rejects floats.

    Returns:
        The same DataFrame object, with the new DISPLAY_ columns added.
    """
    def as_dollars(amount):
        # e.g. 2000 -> '$2,000'
        return '${:,d}'.format(amount)

    for rent_col in rent_cols:
        df[f'DISPLAY_{rent_col}'] = df[rent_col].map(as_dollars)
    return df
# Add the dollar-formatted display columns for every rent column listed in rent_cols.
norm_df = display_rent_cols(df=norm_df, rent_cols=rent_cols)

def display_crime_trends(df, crime_cols):
    """Add a human-readable DISPLAY_<col> string for each crime-trend column.

    Each source value is read as a percentage of a baseline where 100 means
    "no change", so the displayed change is ``value - 100`` rounded to a whole
    percent:

    * increase  -> '+35% ' followed by the chart-increasing emoji
    * decrease  -> '-20% ' followed by the chart-decreasing emoji
    * no change -> '0%' with no arrow

    The sign and arrow are chosen from the rounded value, so a change that
    rounds to zero is never shown as '+0%' or with a trend arrow.

    Args:
        df: DataFrame holding the trend columns; it is modified in place.
        crime_cols: names of the trend columns to format.
            NOTE: astype(int) cannot convert NaN, so these columns are
            expected to contain no missing values.

    Returns:
        The same DataFrame object, with the new DISPLAY_ columns added.
    """
    # Escapes are used so the emoji survive any re-encoding of this file.
    up_arrow = '\U0001F4C8'    # chart increasing
    down_arrow = '\U0001F4C9'  # chart decreasing

    for col in crime_cols:
        # Round once, then reuse the integer change for both sign and text.
        pct_change = (df[col] - 100).round(0).astype(int)
        pct_text = pct_change.astype(str)

        df[f'DISPLAY_{col}'] = np.select(
            [pct_change > 0, pct_change < 0],
            ['+' + pct_text + '% ' + up_arrow, pct_text + '% ' + down_arrow],
            default=pct_text + '%',
        )
    return df
# Readable trend strings for both crime-change columns.
norm_df = display_crime_trends(
    df=norm_df,
    crime_cols=['CHANGE_IN_CRIME_VIOL%', 'CHANGE_IN_CRIME_PROP%'],
)

# Crime rates shown as whole-number counts per 100K people, with a sentence suffix.
crime_rate_suffixes = {
    '2024_CRIMERATE_VIOL': ' violent crimes per 100K people annually.',
    '2024_CRIMERATE_PROP': ' property crimes per 100K people annually.',
}
for rate_col, rate_suffix in crime_rate_suffixes.items():
    per_100k_text = (norm_df[rate_col] * 100000).round(0).astype(int).astype(str)
    norm_df[f'DISPLAY_{rate_col}'] = per_100k_text + rate_suffix

# Display-column prefixes, one per unit size, used to build the rent range strings.
base_rent_cols = [
    'DISPLAY_RENT_STUDIO',
    'DISPLAY_RENT_1BD',
    'DISPLAY_RENT_2BD',
    'DISPLAY_RENT_3BD',
    'DISPLAY_RENT_4BD',
]

# Collapse the _90 and _110 display values into a single range string so later
# steps don't need three separate rent values.
def display_rent_range(df, base_rent_cols):
    """Add a '<low> - <high>' range column for each base display column.

    For every name in base_rent_cols, the '<name>_90' and '<name>_110'
    columns are joined with ' - ' and stored as '<name>_RANGE'.

    Args:
        df: DataFrame holding the _90 / _110 string columns; modified in place.
        base_rent_cols: column-name prefixes to build ranges for.

    Returns:
        The same DataFrame object, with the new _RANGE columns added.
    """
    for base_col in base_rent_cols:
        low_end = df[f'{base_col}_90']
        high_end = df[f'{base_col}_110']
        df[f'{base_col}_RANGE'] = low_end + ' - ' + high_end
    return df
# Build the range strings, then show the resulting frame as the cell output.
norm_df = display_rent_range(df=norm_df, base_rent_cols=base_rent_cols)
norm_df

Unnamed: 0,ZIP,Area__,Length__,geometry,CITY,RENT_STUDIO,RENT_STUDIO_90,RENT_STUDIO_110,RENT_1BD,RENT_1BD_90,...,DISPLAY_RENT_4BD_110,DISPLAY_CHANGE_IN_CRIME_VIOL%,DISPLAY_CHANGE_IN_CRIME_PROP%,DISPLAY_2024_CRIMERATE_VIOL,DISPLAY_2024_CRIMERATE_PROP,DISPLAY_RENT_STUDIO_RANGE,DISPLAY_RENT_1BD_RANGE,DISPLAY_RENT_2BD_RANGE,DISPLAY_RENT_3BD_RANGE,DISPLAY_RENT_4BD_RANGE
0,94558,1.231326e+10,995176.225313,POLYGON ((-122.10329200180091 38.5132829986466...,Napa,2000,1800,2200,2210,1989,...,"$4,752",-20% ðŸ“‰,-26% ðŸ“‰,313 violent crimes per 100K people annually.,1241 property crimes per 100K people annually.,"$1,800 - $2,200","$1,989 - $2,431","$2,610 - $3,190","$3,321 - $4,059","$3,888 - $4,752"
1,94558,1.231326e+10,995176.225313,POLYGON ((-122.10329200180091 38.5132829986466...,Vallejo,2000,1800,2200,2210,1989,...,"$4,752",-20% ðŸ“‰,-26% ðŸ“‰,313 violent crimes per 100K people annually.,1241 property crimes per 100K people annually.,"$1,800 - $2,200","$1,989 - $2,431","$2,610 - $3,190","$3,321 - $4,059","$3,888 - $4,752"
2,95620,7.236950e+09,441860.201400,POLYGON ((-121.65335500334426 38.3133870006294...,Vallejo,1560,1404,1716,1710,1539,...,"$3,630",-8% ðŸ“‰,-5% ðŸ“‰,477 violent crimes per 100K people annually.,2239 property crimes per 100K people annually.,"$1,404 - $1,716","$1,539 - $1,881","$1,962 - $2,398","$2,628 - $3,212","$2,970 - $3,630"
3,95476,3.001414e+09,311318.546326,POLYGON ((-122.40684300305698 38.1556819994163...,Napa,1970,1773,2167,2180,1962,...,"$4,620",-38% ðŸ“‰,-17% ðŸ“‰,279 violent crimes per 100K people annually.,1090 property crimes per 100K people annually.,"$1,773 - $2,167","$1,962 - $2,398","$2,574 - $3,146","$3,537 - $4,323","$3,780 - $4,620"
4,95476,3.001414e+09,311318.546326,POLYGON ((-122.40684300305698 38.1556819994163...,Santa Rosa-Petaluma,1970,1773,2167,2180,1962,...,"$4,620",-38% ðŸ“‰,-17% ðŸ“‰,279 violent crimes per 100K people annually.,1090 property crimes per 100K people annually.,"$1,773 - $2,167","$1,962 - $2,398","$2,574 - $3,146","$3,537 - $4,323","$3,780 - $4,620"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,94022,4.211365e+08,137369.776481,POLYGON ((-122.09598200263689 37.3860539999608...,San Jose-Sunnyvale-Santa Clara,3160,2844,3476,3540,3186,...,"$6,567",+35% ðŸ“ˆ,+4% ðŸ“ˆ,429 violent crimes per 100K people annually.,2355 property crimes per 100K people annually.,"$2,844 - $3,476","$3,186 - $3,894","$3,771 - $4,609","$4,959 - $6,061","$5,373 - $6,567"
180,94085,9.054792e+07,47415.860456,POLYGON ((-121.98688800346149 37.3886580004076...,San Jose-Sunnyvale-Santa Clara,3180,2862,3498,3610,3249,...,"$6,677",+35% ðŸ“ˆ,+4% ðŸ“ˆ,429 violent crimes per 100K people annually.,2355 property crimes per 100K people annually.,"$2,862 - $3,498","$3,249 - $3,971","$3,798 - $4,642","$5,022 - $6,138","$5,463 - $6,677"
181,94086,1.289431e+08,65560.361543,POLYGON ((-122.05999500035823 37.3752980004996...,San Jose-Sunnyvale-Santa Clara,2910,2619,3201,3310,2979,...,"$6,127",+35% ðŸ“ˆ,+4% ðŸ“ˆ,429 violent crimes per 100K people annually.,2355 property crimes per 100K people annually.,"$2,619 - $3,201","$2,979 - $3,641","$3,483 - $4,257","$4,599 - $5,621","$5,013 - $6,127"
182,94024,2.966973e+08,136286.642980,POLYGON ((-122.09598200263689 37.3860539999608...,San Jose-Sunnyvale-Santa Clara,3260,2934,3586,3710,3339,...,"$6,864",+35% ðŸ“ˆ,+4% ðŸ“ˆ,429 violent crimes per 100K people annually.,2355 property crimes per 100K people annually.,"$2,934 - $3,586","$3,339 - $4,081","$3,906 - $4,774","$5,157 - $6,303","$5,616 - $6,864"


In [33]:
# Export the frame with its normalized and display columns; index=False keeps
# the row index out of the CSV.
norm_df.to_csv(path_or_buf='final_df_with_norms.csv', index=False)