In [2]:
import os

import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
import pandas as pd
from scipy.integrate import simpson
from tqdm.notebook import tqdm

from lib import merge_in_geometry

In [3]:
from IPython.display import display, HTML

# Inject a CSS override so the notebook's `.container` element spans the full browser width.
full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

In [4]:
# Use matplotlib's built-in dark theme for every figure drawn below (for cool points).
matplotlib.style.use("dark_background")

In [26]:
# Notebook configuration: analysis period, resampling resolution, input tables and cache paths.
years = list(range(2017, 2023))  # years we have POUS data for (2017 to 2022 inclusive)
resample_freq = "1D"  # resample raw hourly data to this resolution, then check for outage state

root_dir = "data"
# state lookup table, indexed by its state_fips_code column
states = pd.read_csv(os.path.join(root_dir, "raw/states/state_codes.csv")).set_index("state_fips_code")
# county outlines used for the maps
county_boundaries: gpd.GeoDataFrame = gpd.read_file(os.path.join(root_dir, "raw/counties/cb_2018_us_county_500k.shp"))
# Cache files are placed under root_dir (rather than a repeated "data" literal) so that
# they follow the data directory if root_dir is ever changed.
outage_integrals_path = os.path.join(root_dir, "processed", "outage", f"{resample_freq}_county_outage_integrals.csv")
yearly_outage_integrals_path = os.path.join(root_dir, "processed", "outage", "yearly_county_outage_integrals.csv")

In [6]:
# Load the processed outage table for every configured year, keyed by year.
# OutageFraction is bounded to the interval [0, 1] on load.
data_by_year = {}
for year in years:
    yearly_path = os.path.join(root_dir, f"processed/outage/{year}.parquet")
    frame = pd.read_parquet(yearly_path)
    frame["OutageFraction"] = frame["OutageFraction"].clip(lower=0, upper=1)
    data_by_year[year] = frame

# Sorted list of every county code that appears in at least one year
# (taken from the "CountyFIPS" level of each table's index).
seen_counties = set()
for frame in data_by_year.values():
    seen_counties.update(frame.index.get_level_values("CountyFIPS"))
counties = sorted(seen_counties)

In [12]:
# construct complete timeseries
# resample to resample_freq and take mean of OutageFraction
# save to disk as cache
#
# Both branches leave `df` with RecordDateTime and CountyFIPS as ordinary columns, so
# later cells can use `df.RecordDateTime` whether or not the cache file already existed.
# RecordDateTime holds text when read back from the CSV and datetimes when freshly
# computed; pass it through pd.to_datetime before relying on its dtype.

if os.path.exists(outage_integrals_path):
    df = pd.read_csv(outage_integrals_path, dtype={"CountyFIPS": str})

else:
    resampled_data_by_year = []
    for county_code in tqdm(counties):

        # whole timeseries April-October for single county
        for year in years:
            year_data = data_by_year[year]
            try:
                data = year_data.loc(axis=0)[:, county_code].reset_index(level="CountyFIPS")
            except KeyError:
                # this county has no records in this year
                continue

            # Put the county on a gap-free hourly index; hours with no record are filled with 0.
            # NOTE(review): the range ends at Oct 31 00:00, so the final day contributes a
            # single hourly sample to its resampled mean -- confirm whether
            # f"{year}-10-31 23:00" was intended.
            complete_index = pd.date_range(f"{year}-04-01", f"{year}-10-31", freq="1H")
            data = data.reindex(index=complete_index, fill_value=0)
            data.index.name = "RecordDateTime"

            # CountyFIPS is dropped before averaging and re-attached afterwards
            data = data.drop(columns=["CountyFIPS"]).resample(resample_freq).mean()
            data["CountyFIPS"] = county_code
            resampled_data_by_year.append(data)

    # Move RecordDateTime from the index into a column so the in-memory frame has the
    # same layout as the one read back from the cache; the CSV written is unchanged in shape.
    df = pd.concat(resampled_data_by_year).reset_index()
    df.to_csv(outage_integrals_path, index=False)

In [37]:
# Draw one county choropleth map per outage threshold, coloured by the number of
# `resample_freq` periods in which a county's OutageFraction exceeded that threshold.
# Each map is written as a PNG under plots/outage_propensity.

# --- threshold-independent preparation, done once instead of on every iteration ---

# RecordDateTime is a column when `df` was read from the CSV cache, but may be the index
# when `df` was built in memory; normalise to a column before using it.
outage_source = df if "RecordDateTime" in df.columns else df.reset_index()
outage_timeseries = (
    outage_source
    .set_index(pd.to_datetime(outage_source.RecordDateTime))
    .drop("RecordDateTime", axis=1)
    .drop(["CustomersTracked", "CustomersOut"], axis=1)
)

# county outlines keyed by GEOID, to be joined against CountyFIPS codes
county_geometry = county_boundaries.set_index("GEOID").loc[:, "geometry"]

plot_dir = os.path.join("plots", "outage_propensity")
os.makedirs(plot_dir, exist_ok=True)

for outage_threshold in [0.01, 0.05, 0.1, 0.2, 0.5, 0.75, 0.9, 0.95, 0.99]:

    # filter to outages
    outages = outage_timeseries[outage_timeseries.OutageFraction > outage_threshold]
    if outages.empty:
        # nothing to map; the min/max below would otherwise fail on an empty selection
        print(f"No {resample_freq} periods over {outage_threshold} OutageFraction, skipping map")
        continue

    # Count periods over threshold per county and attach the county outlines.
    # Naming the Series directly avoids depending on the column name value_counts()
    # produces, which differs between pandas versions.
    periods_over_threshold = outages.CountyFIPS.value_counts().rename("periods_over_threshold")
    bad_counties_outage_period_counts = gpd.GeoDataFrame(
        periods_over_threshold.to_frame().join(county_geometry)
    )
    min_count = bad_counties_outage_period_counts.periods_over_threshold.min()
    max_count = bad_counties_outage_period_counts.periods_over_threshold.max()

    f, ax = plt.subplots(figsize=(16, 10))

    # colour bar in its own axes to the right of the map, labelled with whole-number ticks
    cmap = "Blues_r"
    norm = colors.Normalize(vmin=min_count, vmax=max_count)
    mappable = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="3%", pad=-1)
    cbar = f.colorbar(mappable, cax=cax, format=lambda x, _: f"{int(x):d}")
    cbar.set_label(f"Count of {resample_freq} periods over {outage_threshold} OutageFraction", labelpad=25)

    # pass the same norm as the colour bar so map colours and bar always agree
    bad_counties_outage_period_counts.plot("periods_over_threshold", ax=ax, cmap=cmap, norm=norm)

    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title(f"Most outage prone US counties, {min(years)}-{max(years)}")
    ax.grid(alpha=0.2)
    ax.set_frame_on(False)
    # fixed longitude / latitude window for every map
    ax.set_xlim(-128, -63)
    ax.set_ylim(22, 51)
    plt.subplots_adjust(left=0.05, right=0.9)

    f.savefig(os.path.join(plot_dir, f"resample_freq_{resample_freq}_threshold_{outage_threshold:.2f}.png"))
    # close each figure so they do not accumulate in memory across iterations
    plt.close(f)