In [1]:
import json

In [2]:
import requests
import geopandas as gpd
import pandas as pd

In [3]:
from bs4 import BeautifulSoup
import country_converter

## Open GIS Data

In [4]:
# Location of the state-boundary shapefile, relative to the notebook.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"

# Read the boundaries into a GeoDataFrame; later cells use its STUSPS, NAME
# and geometry columns.
states_df = gpd.read_file(filename=file_path)

## Get Population Data

In [17]:
# sheet_name=None loads every sheet of the workbook and returns a dict of
# DataFrames keyed by sheet name; the sheet of interest is picked in the next cell.
state_populations = pd.read_excel(
    io="data/NST-EST2023-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [18]:
# The sheet is read without a real header row, so pandas labels column A with
# the sheet's descriptive caption and auto-names the others "Unnamed: N".
area_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
# NOTE(review): "Unnamed: 5" is presumably the most recent (2023) estimate
# column -- confirm against the workbook layout.
population_column = "Unnamed: 5"

state_populations_df = (
    state_populations["NST-EST2023-POP"][[area_column, population_column]]
    .rename(columns={area_column: "NAME", population_column: "POPULATION"})
    # Sub-part rows are written with a leading dot (per the caption above).
    # Strip only that dot: slicing with .str[1:] also chopped the first letter
    # off every row that has no dot, so those names could never match the
    # NAME column of the shapefile in the merge that follows.
    .assign(NAME=lambda frame: frame["NAME"].str.lstrip("."))
)

In [19]:
# Left-join the population figures onto the state geometries by state name,
# then keep just the postal code, the population and the geometry.
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[["STUSPS", "POPULATION", "geometry"]]

In [20]:
# Discard every row that has a missing value in any column (e.g. geometries
# for which the left join above found no population figure).
states_with_population_df = states_with_population_df.dropna(axis=0, how="any")

## Get Russian Population Data

In [9]:
# Send a browser-like User-Agent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
r = requests.get(
    "https://worldpopulationreview.com/state-rankings/russian-population-by-state",
    headers=headers,
    # requests has no default timeout; without one an unresponsive server
    # would block the notebook indefinitely.
    timeout=30,
)
# Fail here with requests.HTTPError on a 4xx/5xx response instead of letting
# an error page flow into the HTML parsing cells below.
r.raise_for_status()

In [10]:
# Parse the downloaded page with the standard-library HTML parser.
soup = BeautifulSoup(markup=r.text, features="html.parser")

# Collect every <tr> element carrying the "table-row" CSS class.
table_rows = soup.find_all("tr", class_="table-row")

In [11]:
# CSS class string used to select the value cells of a table row.
value_cell_class = "z-40 px-3 py-1.5 text-sm text-wpr-table_text md:px-4"

# One record per table row: state name, Russian head count, Russian percentage.
state_list = []
for table_row in table_rows:
    # The state name is the text of the first link in the row.
    state = table_row.find_all("a")[0].text
    tds = table_row.find_all("td", {"class": value_cell_class})

    record = {"NAME": state}
    # First value cell: count with thousands separators, e.g. "12,345".
    record["RUSSIANS"] = int(tds[0].text.replace(",", ""))
    # Second value cell: percentage with a trailing "%" sign.
    record["RUSSIAN_PERCENT"] = float(tds[1].text.replace("%", ""))
    state_list.append(record)

In [12]:
# Turn the scraped records into a DataFrame with the columns
# NAME, RUSSIANS and RUSSIAN_PERCENT (one row per record).
russian_pop_df = pd.DataFrame(data=state_list)

## Merge Data

In [34]:
# Lookup table from postal abbreviation (STUSPS) to full state name (NAME),
# used below to re-attach the names that the scraped table is keyed on.
# (A stray `states_with_population_df` token after the subscript made this
# cell a SyntaxError; it has been removed.)
states_abv_df = states_df[["STUSPS", "NAME"]]

In [41]:
# Re-attach the full state name to each row via the postal abbreviation,
# keeping every row of the population/geometry frame (left join).
states_with_pop_gdf = states_with_population_df.merge(
    right=states_abv_df,
    how="left",
    on="STUSPS",
)

In [42]:
# Inner join on the state name: only states present in both the
# population/geometry frame and the scraped table are kept.
russians_per_state_df = states_with_pop_gdf.merge(
    russian_pop_df,
    how="inner",
    on="NAME",
)

In [43]:
# Derive population-normalised measures from the scraped RUSSIANS count and
# the POPULATION figure taken from the spreadsheet.
russians = russians_per_state_df["RUSSIANS"]
population = russians_per_state_df["POPULATION"]

# RUSSIANS per N inhabitants, for several values of N.
for rate_column, per_people in (
    ("per_100k", 100_000),
    ("per_500k", 500_000),
    ("per_1m", 1_000_000),
):
    russians_per_state_df[rate_column] = russians / (population / per_people)

# Share of the population expressed in percent (0-100). The scraped
# RUSSIAN_PERCENT column is in percent units (its "%" sign is stripped before
# parsing), so this column is scaled by 100 to be comparable; previously it
# held a 0-1 fraction despite its name.
russians_per_state_df["RUSSIAN_PERCENT_2023"] = russians / population * 100

In [44]:
# Columns to export: state code, geometry, the scraped figures and the
# derived per-capita measures. NAME and POPULATION are dropped here.
export_columns = [
    "STUSPS",
    "geometry",
    "RUSSIANS",
    "RUSSIAN_PERCENT",
    "RUSSIAN_PERCENT_2023",
    "per_100k",
    "per_500k",
    "per_1m",
]
russians_per_state_df = russians_per_state_df[export_columns]

In [45]:
# Reproject the geometries to EPSG:9311.
# NOTE(review): presumably the US National Atlas equal-area projection used
# for nationwide maps -- confirm against the EPSG registry.
russians_per_state_df = russians_per_state_df.to_crs(epsg=9311)

In [46]:
# Persist the final GeoDataFrame to disk (output path ends in .gpkg).
russians_per_state_df.to_file(filename="data/russians.gpkg")