In [1]:
import json
import requests

In [2]:
import geopandas as gpd
import pandas as pd

In [3]:
from tqdm.notebook import tqdm

## Open State data

In [4]:
# Read the state boundary shapefile into a GeoDataFrame. Later cells read the
# STATEFP, STUSPS, NAME and geometry columns from it.
# NOTE(review): the file name suggests a 2018, 1:500k state boundary file — confirm
# the source and vintage.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Get Population Data

In [5]:
# Load the population workbook. With sheet_name=None pandas returns a dict of
# DataFrames keyed by sheet name; the next cell picks the "NST-EST2024-POP" sheet.
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [6]:
# Reduce the population sheet to two columns: the area name and one population
# figure. The sheet's real header sits a few rows down, so pandas exposes the first
# column under the sheet's long descriptive title and the remaining columns as
# "Unnamed: N".
# NOTE(review): "Unnamed: 5" is presumably the most recent (2024) estimate — confirm
# against the workbook layout.
AREA_NAME_COLUMN = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
POPULATION_COLUMN = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2024-POP"][
    [AREA_NAME_COLUMN, POPULATION_COLUMN]
].rename(columns={AREA_NAME_COLUMN: "NAME", POPULATION_COLUMN: "POPULATION"})

# Per the column title, leading dots mark sub-parts (e.g. ".Alabama"). Strip only
# those dots: slicing off the first character unconditionally would also mangle any
# name that has no dot prefix, so it could never match a state NAME in the merge.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [7]:
# Attach the population figure to each state row by name. The left join keeps every
# row of states_df; only the columns needed downstream are retained.
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

In [8]:
# Drop every row that has a missing value in any column, e.g. areas whose NAME
# found no population match in the left join.
states_with_population_df = states_with_population_df.dropna(axis=0, how="any")

## Get Occupation Data (BLS OES)

In [9]:
# Endpoint that the occupation queries below are POSTed to.
url = "https://data.bls.gov/OESServices/resultsgeoocc"
# Request headers sent with every query. They declare a JSON body and otherwise
# mirror a browser request.
# NOTE(review): these look copied from a Chrome session; it is not established here
# which of them the service actually requires — confirm before trimming.
headers = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "en-US,en;q=0.9,ru;q=0.8,el;q=0.7",
    "cache-control": "no-cache",
    "content-type": "application/json;charset=UTF-8",
    "pragma": "no-cache",
    "sec-ch-ua": '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "Referer": "https://data.bls.gov/oes/",
}

In [10]:
# Sample query for a single area, used to inspect the response shape before looping
# over every state. The areaCode follows the pattern used in the per-state loop: a
# state FIPS code followed by "00000".
# NOTE(review): FIPS 17 is presumably Illinois — confirm.
body = {
    "areaTypeCode": "S",
    "areaCode": "1700000",
    "industryCode": "000000",
    "occupationCode": ["250000", "333051"],
    "datatype": ["01", "03", "08"],
    "releaseDateCode": ["2024A01", "2024A01"],
    "outputType": "H",
}
# Always pass a timeout: without one, requests waits indefinitely on a stalled
# connection and the cell never returns.
r = requests.post(url, headers=headers, data=json.dumps(body), timeout=30)
# Surface HTTP errors here rather than as a confusing JSON/KeyError in the next cell.
r.raise_for_status()

In [11]:
# Show the second occupation record of the first result in the sample response, to
# see which keys (occupationCode, occupationName, values) the loop can rely on.
r.json()["resultsGeoOccVO"][0]["occupations"][1]

{'occupationCode': '333051',
 'occupationName': "Police and Sheriff's Patrol Officers",
 'values': [{'dataTypeCode': '01', 'value': '       29790'},
  {'dataTypeCode': '03', 'value': '       44.24'},
  {'dataTypeCode': '08', 'value': '       48.81'}]}

In [15]:
# Query the OES endpoint once per row of states_df and collect, for every area that
# returns usable data, the employment count and the hourly mean/median wage of the
# two occupation groups compared in this notebook. Areas without usable data are
# reported by name and skipped, so no record is ever built from another area's
# leftover values.
TEACHER_OCC_CODE = "250000"  # occupation group this notebook treats as "teachers"
POLICE_OCC_CODE = "333051"  # "Police and Sheriff's Patrol Officers" in the response
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging forever on a stalled request

data = []
for i in tqdm(range(len(states_df)), desc="Parsing States"):
    state = states_df.iloc[i]
    body = {
        "areaTypeCode": "S",
        "areaCode": f"{state['STATEFP']}00000",
        "industryCode": "000000",
        "occupationCode": [TEACHER_OCC_CODE, POLICE_OCC_CODE],
        "datatype": ["01", "03", "08"],
        "releaseDateCode": ["2024A01", "2024A01"],
        "outputType": "H",
    }
    r = requests.post(
        url, headers=headers, data=json.dumps(body), timeout=REQUEST_TIMEOUT_SECONDS
    )
    json_resp = r.json()  # parse the body once and reuse it
    occupations = json_resp["resultsGeoOccVO"][0]["occupations"]
    if not occupations:
        # The service returned no occupation data for this area.
        print(state["NAME"])
        continue

    try:
        # Look records up by occupation code and datatype code instead of by list
        # position, so a missing or reordered entry cannot be misattributed.
        values_by_occupation = {
            occ["occupationCode"]: {
                item["dataTypeCode"]: item["value"] for item in occ["values"]
            }
            for occ in occupations
        }
        teacher_values = values_by_occupation[TEACHER_OCC_CODE]
        police_values = values_by_occupation[POLICE_OCC_CODE]
        # Values arrive as left-padded strings; int()/float() tolerate the padding.
        # Datatype "01" is used as the head count, "03" as the mean hourly wage and
        # "08" as the median hourly wage.
        record = {
            "STUSPS": state["STUSPS"],
            "teachers": int(teacher_values["01"]),
            "teacher_mean_wage": float(teacher_values["03"]),
            "teacher_median_wage": float(teacher_values["08"]),
            "police_officers": int(police_values["01"]),
            "police_mean_wage": float(police_values["03"]),
            "police_median_wage": float(police_values["08"]),
        }
    except KeyError:
        # One of the two occupations (or one of its values) is missing: report the
        # area and skip it rather than reuse data from a previous iteration.
        print(state["NAME"])
        continue

    data.append(record)

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

American Samoa
United States Virgin Islands
Guam
Commonwealth of the Northern Mariana Islands


In [16]:
# One row per area that returned data; columns come from the record dict keys.
occupation_df = pd.DataFrame(data)

In [18]:
# Teacher-minus-police hourly wage gap, for the mean and then the median wage.
# A positive value means the teacher figure is higher.
for stat in ("mean", "median"):
    occupation_df[f"hourly_{stat}_wage_diff"] = (
        occupation_df[f"teacher_{stat}_wage"] - occupation_df[f"police_{stat}_wage"]
    )

In [44]:
# NOTE(review): in the visible notebook, occupation_gdf is first assigned in the
# "Merge Data" section further down (and this cell's execution count is higher than
# that cell's), so on a fresh kernel this cell must run after the merge cell.
# Consider moving it below that section so Restart & Run All works.
#
# Head count per 10,000 residents for each occupation group, rounded to a whole
# number.
population_in_10k = occupation_gdf["POPULATION"] / 10_000
for rate_column, count_column in (
    ("teachers_per_10k", "teachers"),
    ("officers_per_10k", "police_officers"),
):
    occupation_gdf[rate_column] = (
        (occupation_gdf[count_column] / population_in_10k)
        .round(decimals=0)
        .astype(int)
    )

In [48]:
# Rank areas by teachers per 10,000 residents, highest first, showing only the
# state code and the two per-10k rates.
occupation_gdf.sort_values("teachers_per_10k", ascending=False)[
    ["STUSPS", "teachers_per_10k", "officers_per_10k"]
]

Unnamed: 0,STUSPS,teachers_per_10k,officers_per_10k
36,DC,472,70
30,VT,406,17
7,MA,357,24
25,NY,353,28
51,CT,349,18
24,WY,338,22
32,IA,332,16
37,NJ,311,23
47,ND,307,23
42,RI,302,16


## Merge Data

In [27]:
# Join geometry and population with the occupation figures on the state code. The
# inner join keeps only areas present on both sides; rows with any missing value
# are then dropped.
occupation_gdf = states_with_population_df.merge(
    occupation_df, how="inner", on="STUSPS"
).dropna(axis=0, how="any")

In [28]:
# Reproject to EPSG:9311 and write the result to disk for mapping.
# NOTE(review): EPSG:9311 is presumably an equal-area projection for the US —
# confirm it is the intended CRS for the output layer.
occupation_gdf = occupation_gdf.to_crs(epsg=9311)
occupation_gdf.to_file("data/Teachers_Vs_Police_Officers_Hourly_Wage.gpkg")