In [None]:
# PURPOSE: determine which zip codes are most similar to one another in median household income and median age

In [129]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
sns.set()
# import math

In [130]:
# Load the per-zipcode external dataset (the file is already clean), index the rows
# by zip code, and keep only the two demographic columns compared in this notebook.
precinct_data = pd.read_csv("ExternalDataByZipcode.csv")
income_age_data = precinct_data.set_index("Zip Codesort column")[
    ["Median Household Income", "Median Age"]
]
income_age_data

Unnamed: 0_level_0,Median Household Income,Median Age
Zip Codesort column,Unnamed: 1_level_1,Unnamed: 2_level_1
77002,"$87,888",37.2
77003,"$86,707",36.1
77004,"$55,289",35.9
77005,"$185,974",37.8
77006,"$90,467",41.3
...,...,...
77562,"$64,515",36.1
77571,"$87,410",36.9
77586,"$110,253",41.3
77587,"$50,779",31.1


In [None]:
import re

# Build the pairwise income-dissimilarity matrix: entry (zip1, zip2) is the absolute
# difference in median household income between the two zip codes, divided by the
# largest pairwise difference so that every value lies in [0, 1].

# Incomes are stored as display strings such as "$87,888". Strip currency symbols and
# thousands separators, but keep the decimal point: removing "." as well would turn a
# value like "87,888.50" into 8788850 and silently inflate it a hundredfold.
non_numeric = re.compile(r"[^0-9.]")
income_values = pd.to_numeric(
    income_age_data["Median Household Income"]
    .astype(str)
    .str.replace(non_numeric, "", regex=True)
).to_numpy(dtype=float)

# Broadcasting a column vector against a row vector produces every pairwise
# difference in one step, replacing the cell-by-cell .loc loops.
income_diffs = np.abs(income_values[:, None] - income_values[None, :])

# Largest pairwise difference, kept for normalization (0.0 when the table is empty).
max_income_diff = float(income_diffs.max()) if income_diffs.size else 0.0

# Normalize. If every zip code has the same income the matrix is already all zeros,
# so skip the division instead of producing NaNs from 0 / 0.
if max_income_diff > 0:
    income_diffs = income_diffs / max_income_diff

incomes = pd.DataFrame(
    income_diffs,
    index=income_age_data.index,
    columns=income_age_data.index,
)
incomes

In [None]:
# Build the pairwise age-dissimilarity matrix: entry (zip1, zip2) is the absolute
# difference in median age between the two zip codes, divided by the largest pairwise
# difference so that every value lies in [0, 1].
age_values = pd.to_numeric(income_age_data["Median Age"]).to_numpy(dtype=float)

# Broadcasting a column vector against a row vector produces every pairwise
# difference in one step, replacing the cell-by-cell .loc loops.
age_diffs = np.abs(age_values[:, None] - age_values[None, :])

# Largest pairwise difference, kept for normalization (0.0 when the table is empty).
max_age_diff = float(age_diffs.max()) if age_diffs.size else 0.0

# Normalize. If every zip code has the same median age the matrix is already all
# zeros, so skip the division instead of producing NaNs from 0 / 0.
if max_age_diff > 0:
    age_diffs = age_diffs / max_age_diff

ages = pd.DataFrame(
    age_diffs,
    index=income_age_data.index,
    columns=income_age_data.index,
)
ages

In [None]:
# Stand-alone heat map of the normalized pairwise differences in median household income.
# robust=True asks seaborn to derive the colour limits from robust quantiles instead of
# the extreme values.
income_hmap = sns.heatmap(incomes, robust=True, cmap=sns.color_palette("mako", as_cmap=True))
# Title the plot so it can be told apart from the median-age heat map when skimmed.
income_hmap.set_title("Difference in Median Household Income")
# Render the figure explicitly rather than echoing the Axes repr as cell output.
plt.show()

In [None]:
# Stand-alone heat map of the normalized pairwise differences in median age.
# robust=True asks seaborn to derive the colour limits from robust quantiles instead of
# the extreme values.
age_hmap = sns.heatmap(ages, robust=True, cmap=sns.color_palette("mako", as_cmap=True))
# Title the plot so it can be told apart from the household-income heat map when skimmed.
age_hmap.set_title("Difference in Median Age")
# Render the figure explicitly rather than echoing the Axes repr as cell output.
plt.show()

In [None]:
# Side-by-side comparison figure: income differences on the left panel, age
# differences on the right, both drawn with the same "mako" colormap.
mako_cmap = sns.color_palette("mako", as_cmap=True)
hmaps, axes = plt.subplots(nrows=1, ncols=2, figsize=(28, 10))
income_ax, age_ax = axes
hmaps.suptitle("Comparison of Zipcodes by Demographic")
for panel_ax, panel_data in ((income_ax, incomes), (age_ax, ages)):
    sns.heatmap(data=panel_data, ax=panel_ax, robust=True, cmap=mako_cmap)
income_ax.set_title("Difference in Median Household Income")
age_ax.set_title("Difference in Median Age")