In [74]:
# Imports
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler

In [75]:
# Load final_df back in from its CSV file
final_df = pd.read_csv(
    filepath_or_buffer='~/Downloads/opportunity mapper/final_df1.csv',
)

In [76]:
# Count missing values per column to confirm there are no NaNs
# (none expected, barring one unimportant ZPOP entry).
missing_per_column = final_df.isna().sum()
missing_per_column

ZIP                                    0
Area__                                 0
Length__                               0
geometry                               0
CITY                                   0
RENT_STUDIO                            0
RENT_STUDIO_90                         0
RENT_STUDIO_110                        0
RENT_1BD                               0
RENT_1BD_90                            0
RENT_1BD_110                           0
RENT_2BD                               0
RENT_2BD_90                            0
RENT_2BD_110                           0
RENT_3BD                               0
RENT_3BD_90                            0
RENT_3BD_110                           0
RENT_4BD                               0
RENT_4BD_90                            0
RENT_4BD_110                           0
GEO_ID                                 0
MEDIAN_INCOME_HOUSEHOLD_EST            0
TOTAL_HOUSEHOLDS_EST                   0
BART_COUNT                             0
CalTrain_COUNT  

In [77]:
# Start normalizing the metrics. One MinMaxScaler is reused throughout; each
# fit_transform call refits it on whichever column it is handed.
scaler = MinMaxScaler()

# Lots of normalized columns are about to be added, so do it all on a copy
# and leave final_df itself untouched.
norm_df = final_df.copy()

# Rent columns that each get a 0-1 score
rent_cols = ['RENT_STUDIO', 'RENT_1BD', 'RENT_2BD', 'RENT_3BD', 'RENT_4BD']
for rent_col in rent_cols:
    rent_scaled = scaler.fit_transform(norm_df[[rent_col]])  # min-max scale to 0-1
    # Invert (1 - x) so that cheap = 1.0 and expensive = 0.0; the score is stored in norm_df
    norm_df[f'norm_{rent_col.lower()}'] = 1 - rent_scaled

# Median household income is min-max scaled as-is (no inversion)
norm_df['norm_income'] = scaler.fit_transform(norm_df[['MEDIAN_INCOME_HOUSEHOLD_EST']])

# Score transit with hand-picked tiers instead of just normalizing it. Yes, the tiers are
# technically arbitrary, but that is better than pure mathematical normalization for a
# variable with a max value of 4.
def score_transit(count):
    """Map a transit count to a tiered score between 0.0 and 1.0.

    Args:
        count: Transit count for one row (applied to the TOTAL_TRANSIT column).
            May be an int or a float; NaN is scored as 0.0.

    Returns:
        1.0 when count >= 3, 0.8 when count >= 2, 0.4 when count >= 1,
        otherwise 0.0.
    """
    # Thresholds (rather than exact equality) keep the score monotonic: a
    # fractional value such as 2.5 lands in the 2-tier instead of falling
    # through to 0.0. Integer inputs score exactly as before.
    if count >= 3:
        return 1.0
    if count >= 2:
        return 0.8
    if count >= 1:
        return 0.4
    # Anything below 1 scores 0.0 -- this includes NaN, for which every
    # comparison above is False.
    return 0.0

# Tiered transit score for every row, via score_transit
norm_df['norm_transit'] = norm_df['TOTAL_TRANSIT'].apply(score_transit)

# Crime is a "lower is better" metric, so (same idea as rent) each min-max scaled
# column is flipped with 1 - x. Violent (VIOL) and property (PROP) crime are scored
# separately. The trend columns are kept in case they are needed later: a massive
# drop in crime scores 1.0 and a big spike scores 0.0.
crime_sources = {
    'norm_crime_rate_viol': '2024_CRIMERATE_VIOL',
    'norm_crime_rate_prop': '2024_CRIMERATE_PROP',
    'norm_crime_viol_trend': 'CHANGE_IN_CRIME_VIOL%',
    'norm_crime_prop_trend': 'CHANGE_IN_CRIME_PROP%',
}
for target_col, source_col in crime_sources.items():
    # The shared scaler is refit on each source column independently
    norm_df[target_col] = 1 - scaler.fit_transform(norm_df[[source_col]])

In [80]:
# With the normalized values, let's make a function to calculate the final score for
# each zip code (with the user's weight applied since they'll use a scaler in the final UI)
