In [None]:
import collections
import functools
import IPython
import itertools
import matplotlib
import numpy as np
import os
import pandas as pd
import pathlib
import pypandoc
import requests
import string
import sys
import typing
import us
import zipfile

from datetime import datetime
from tqdm.notebook import tqdm_notebook

# Resolve the directory two levels above the working directory and add it to
# the import search path (once), presumably so the `data_pipeline` package
# can be imported from this notebook.
module_path = os.path.abspath("../..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_pipeline.utils import remove_all_from_dir, get_excel_column_name
from data_pipeline.etl.sources.census.etl_utils import get_state_information

# Register the tqdm notebook progress bar with pandas. Per the tqdm
# documentation this adds `progress_apply` (and related `progress_*` methods)
# to pandas objects; call those instead of plain `apply` to get a progress bar.
tqdm_notebook.pandas()

In [None]:
# Render floats with two fixed decimals instead of scientific notation
# (scientific notation otherwise shows up for census tract IDs).
pd.set_option("display.float_format", "{:.2f}".format)

# Global parameters shared by the cells below.
# Data lives in a `data` directory that is a sibling of the working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")

# Field-name constants.
GEOID_FIELD_NAME = "GEOID10"
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
GEOID_STATE_FIELD_NAME = "GEOID10_STATE"
COUNTRY_FIELD_NAME = "Country"
CENSUS_BLOCK_GROUP_POPULATION_FIELD = "Total population"

In [None]:
# Read the full CEJST score CSV from DATA_DIR/score/csv/full/usa.csv.
cejst_data_path = DATA_DIR.joinpath("score", "csv", "full", "usa.csv")

# Read the GEOID column as a string rather than letting pandas infer a
# numeric dtype for it.
geoid_dtypes = {GEOID_FIELD_NAME: "string"}
cejst_df = pd.read_csv(cejst_data_path, dtype=geoid_dtypes)

# Preview the first rows as the cell's output.
cejst_df.head()

In [None]:
# Columns of `cejst_df` that are candidates for a histogram.
columns_to_plot = [
    "Respiratory hazard index",
    "Particulate matter (PM2.5)",
    "Poverty (Less than 200% of federal poverty line)",
    "Percent individuals age 25 or over with less than high school degree",
    "Unemployed civilians (percent)",
    "Linguistic isolation (percent)",
]

# Only one column is plotted per run; change the index to view another.
column_to_plot = columns_to_plot[0]
print(f"Plotting {column_to_plot}")

# `Series.hist` returns the matplotlib Axes. Keep it so the figure can be
# labelled, rather than passing it to `print`, which only emitted the Axes
# repr as stray text next to the figure.
ax = cejst_df[column_to_plot].hist()

# The trailing semicolon stops the notebook from echoing the return value of
# the last expression.
ax.set(title=column_to_plot, xlabel=column_to_plot, ylabel="Number of rows");

In [None]:
# List every column label in the CEJST frame, one per line.
# Iterating a DataFrame directly yields its column labels.
for column_name in cejst_df:
    print(column_name)