# Markov Chain Outlier Analysis

### Imports

In [1]:
import time

from metrics import *
from RedistrictingMarkovChain import RedistrictingMarkovChain

from gerrychain import Graph
import matplotlib.pyplot as plt

### Load Data

Estimated time to load graph (MacBook Pro Intel chip):  
~9 minutes

In [2]:
# Build the graph from the New York shapefile and report how long it took
# (this is by far the slowest setup step in the notebook).
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by a system clock adjustment during the multi-minute load.
start_time = time.perf_counter()
print("Starting graph load")
ny_graph = Graph.from_file("./NY-lab/NY.shp")
print("Graph loaded")
end_time = time.perf_counter()

print(f"Graph load time: {(end_time - start_time) / 60} minutes")

Starting graph load



  areas = df.geometry.area.to_dict()


Graph loaded
Graph load time: 8.667707415421804 minutes


### Create Markov Chain

In [3]:
# Chain lengths (in steps) that the full analysis iterates over.
STEPS = [1_000, 5_000, 10_000, 20_000, 50_000, 100_000]
# Election identifier plus its "dem"/"rep" field names (presumably shapefile
# vote-count columns -- confirm against the data).
election = dict(name="G20PRE", dem="G20PRED", rep="G20PRER")
# Handed to the chain as `pop_tolerance` and embedded in output file names.
pop_tol = 0.03

In [4]:
# Short alias used when building histogram file names. Indexing (rather than
# .get) raises KeyError right here if "name" is missing, instead of silently
# writing "None" into every output file name; it also matches how the
# election dict is read when the chain is constructed.
elec_name = election["name"]

In [5]:
# Build the chain wrapper for New York. The arguments are positional; the
# meanings noted below are inferred from the values themselves -- confirm
# them against RedistrictingMarkovChain's signature.
ny_markov_chain = RedistrictingMarkovChain(
    ny_graph,
    26,        # presumably the number of districts
    "CD",      # presumably the district-assignment column
    election["name"],
    election["dem"],
    election["rep"],
    "TOTPOP",  # presumably the total-population column
    "HISP",    # presumably the Hispanic-population column
    "BVAP",    # presumably the Black voting-age-population column
    pop_tolerance=pop_tol,
)

### Define Helping Functions

This function computes the value of each metric on the initial (current) map.

In [6]:
def get_initial_vals(initial_partition, election_name="G20PRE", party="Democratic",
                     hisp_col="HISP", black_col="BVAP"):
    """Evaluate every tracked metric on a single partition.

    The election, party and demographic columns are keyword parameters so the
    reference values can be computed for the same configuration as the chain;
    the defaults reproduce the values that were previously hard-coded.

    Parameters
    ----------
    initial_partition
        The partition to score (callers in this notebook pass the chain's
        ``initial_state``).
    election_name : str, default "G20PRE"
        Election identifier forwarded to ``pd``, ``mm``, ``eg`` and ``pb``.
    party : str, default "Democratic"
        Party label forwarded to ``pd``, ``mm`` and ``pb``.
    hisp_col, black_col : str, defaults "HISP" and "BVAP"
        Column names forwarded to ``md``.

    Returns
    -------
    tuple
        Seven values, in this order: ``ce``, ``md`` (Hispanic column), ``md``
        (Black column), ``pd``, ``mm``, ``eg``, ``pb``. The notebook's
        histograms label them Cut Edges, Hispanic Majority Districts, Black
        Majority Districts, Democratic-Won Districts, Mean-Median Difference,
        Efficiency Gap and Partisan Bias.
    """
    cut_edges = ce(initial_partition)
    hisp_majority = md(initial_partition, hisp_col)
    black_majority = md(initial_partition, black_col)
    party_wins = pd(initial_partition, election_name, party)
    mean_median = mm(initial_partition, election_name, party)
    efficiency_gap = eg(initial_partition, election_name)
    partisan_bias = pb(initial_partition, election_name, party)

    return (
        cut_edges,
        hisp_majority,
        black_majority,
        party_wins,
        mean_median,
        efficiency_gap,
        partisan_bias,
    )

This function plots a histogram of one metric's ensemble, marks the initial map's value with a dashed red line, and saves the figure to a file.

In [7]:
def plot_histograms_nb(ensemble, filename, title, value, bins=10, format=False, xlabel=None, ylabel=None):
    """Save a histogram of ``ensemble`` with a reference line at ``value``.

    Parameters
    ----------
    ensemble : sequence of numbers
        The values to histogram.
    filename : str, path-like or file object
        Where the figure is saved. For a str/path-like destination, missing
        parent directories are created first.
    title : str
        Metric name; the figure title is ``"<title> Histogram"``.
    value : number
        Position of the dashed red vertical line (callers in this notebook
        pass the initial map's value for the metric).
    bins : int, str or iterable, default 10
        Forwarded to ``Axes.hist``. An iterable of explicit edges is drawn
        with bars centred on the left edges, so integer edges put each bar
        on its integer value. A bin count or strategy name uses the default
        ``"mid"`` alignment so bars stay on the data they represent and line
        up with the reference line.
    format : bool, default False
        When true, x tick labels are rotated 45 degrees and right-aligned.
        (The name shadows the built-in ``format``; it is kept because callers
        pass it by keyword.)
    xlabel, ylabel : str, optional
        Axis labels. Each one is applied whenever it is given.

    Returns
    -------
    None
        The figure is written to ``filename`` and always closed, even if
        drawing or saving raises.
    """
    import os  # local import: only needed to prepare the output directory

    # Explicit edges (e.g. a range of integers) keep the left alignment;
    # anything else would be shifted half a bin away from the reference line.
    explicit_edges = hasattr(bins, "__iter__") and not isinstance(bins, str)

    fig, ax = plt.subplots()
    try:
        ax.set_title(f"{title} Histogram")
        ax.axvline(x=value, color="red", linestyle="--", linewidth=2, alpha=0.7)
        if xlabel is not None:
            ax.set_xlabel(xlabel)
        if ylabel is not None:
            ax.set_ylabel(ylabel)
        ax.hist(ensemble, bins=bins, alpha=0.6,
                align="left" if explicit_edges else "mid")
        if format:
            # Applied after plotting so the final tick labels are the ones rotated.
            plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
        if isinstance(filename, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(filename))
            if parent:
                os.makedirs(parent, exist_ok=True)
        fig.savefig(filename)
    finally:
        # Close unconditionally so repeated calls in a loop never accumulate figures.
        plt.close(fig)

### Proof of Concept

Initialize Markov Chain with 200 steps to test functionality

In [8]:
# Set up a short 200-step chain, then score its initial state on every metric
# so the histograms can mark where the starting map falls.
ny_init_state = ny_markov_chain.init_markov_chain(steps=200)
(
    ce_init,
    hisp_init,
    black_init,
    dem_init,
    mm_init,
    eg_init,
    pb_init,
) = get_initial_vals(ny_init_state.initial_state)

Analyze the Markov Chain

In [9]:
# Time the proof-of-concept walk together with histogram rendering.
# perf_counter() is monotonic, so the interval is immune to clock adjustments.
start_time = time.perf_counter()

(
    cutedge_ensemble,
    lmaj_ensemble,
    bmaj_ensemble,
    dem_win_ensemble,
    mmd_ensemble,
    eg_ensemble,
    pb_ensemble,
) = ny_markov_chain.walk_the_run()

# One row per metric:
#   (file-name key, ensemble, plot title, initial-map value, kind, x-axis label)
# kind selects the plotting options below:
#   "default"    -> default bins, labelled axes
#   "integer"    -> one bin per integer value (with padding), labelled axes
#   "continuous" -> 30 bins, rotated tick labels, no axis labels
test_histogram_specs = [
    ("cutedge", cutedge_ensemble, "Cut Edges", ce_init, "default", "Cut edges"),
    ("lmaj", lmaj_ensemble, "Hispanic Majority Districts", hisp_init, "integer",
     "Hispanic-Majority Districts"),
    # The reference line here must be black_init (not hisp_init), so the
    # Black-majority histogram is compared against the Black-majority value.
    ("bmaj", bmaj_ensemble, "Black Majority Districts", black_init, "integer",
     "Black-Majority Districts"),
    ("dem_win", dem_win_ensemble, "Democratic-Won Districts", dem_init, "integer",
     "Democratic-Won Districts"),
    ("mmd", mmd_ensemble, "Mean-Median Difference", mm_init, "continuous", None),
    ("eg", eg_ensemble, "Efficiency Gap", eg_init, "continuous", None),
    ("pb", pb_ensemble, "Partisan Bias", pb_init, "continuous", None),
]

# Save test histograms
for key, ensemble, title, initial_value, kind, xlabel in test_histogram_specs:
    if kind == "integer":
        plot_options = {
            "bins": range(int(min(ensemble)) - 3, int(max(ensemble)) + 5),
            "xlabel": xlabel,
            "ylabel": "Frequency",
        }
    elif kind == "continuous":
        plot_options = {"bins": 30, "format": True}
    else:
        plot_options = {"xlabel": xlabel, "ylabel": "Frequency"}

    plot_histograms_nb(
        ensemble,
        f"test_histograms/test_{key}_ensemble_200_{pop_tol}_{elec_name}.png",
        title,
        initial_value,
        **plot_options,
    )

end_time = time.perf_counter()
print(f"Markov Chain time (steps=200): {(end_time - start_time) / 60} minutes")

Walking the ensemble
Walk complete
Markov Chain time (steps=200): 0.08124196529388428 minutes


# Markov Chain Analysis

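Using the configuration defined under "Create Markov Chain" (`STEPS`, `election`, `pop_tol`), run the Markov chain once for each step count in `STEPS` and save a histogram of every metric for each run.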

Estimated time of completion per steps (MacBook Pro with Intel chip):  
    - 1k: ~1 minute  
    - 5k: ~2 minutes  
    - 10k: ~3 minutes  
    - 20k: ~7 minutes  
    - 50k: ~17 minutes  
    - 100k: ~38 minutes  

In [13]:
# Run the full analysis once per configured chain length, saving one histogram
# per metric for each run and reporting the elapsed time.
for step in STEPS:
    # perf_counter() is monotonic, so long runs are timed reliably even if the
    # system clock is adjusted in the meantime.
    start_time = time.perf_counter()

    ny_init_state = ny_markov_chain.init_markov_chain(steps=step)
    (
        ce_init,
        hisp_init,
        black_init,
        dem_init,
        mm_init,
        eg_init,
        pb_init,
    ) = get_initial_vals(ny_init_state.initial_state)
    (
        cutedge_ensemble,
        lmaj_ensemble,
        bmaj_ensemble,
        dem_win_ensemble,
        mmd_ensemble,
        eg_ensemble,
        pb_ensemble,
    ) = ny_markov_chain.walk_the_run()

    # One row per metric:
    #   (file-name key, ensemble, plot title, initial-map value, kind, x-axis label)
    # kind selects the plotting options below:
    #   "default"    -> default bins, labelled axes
    #   "integer"    -> one bin per integer value (with padding), labelled axes
    #   "continuous" -> 30 bins, rotated tick labels, no axis labels
    histogram_specs = [
        ("cutedge", cutedge_ensemble, "Cut Edges", ce_init, "default", "Cut edges"),
        ("lmaj", lmaj_ensemble, "Hispanic Majority Districts", hisp_init, "integer",
         "Hispanic-Majority Districts"),
        # The reference line here must be black_init (not hisp_init), so the
        # Black-majority histogram is compared against the Black-majority value.
        ("bmaj", bmaj_ensemble, "Black Majority Districts", black_init, "integer",
         "Black-Majority Districts"),
        ("dem_win", dem_win_ensemble, "Democratic-Won Districts", dem_init, "integer",
         "Democratic-Won Districts"),
        ("mmd", mmd_ensemble, "Mean-Median Difference", mm_init, "continuous", None),
        ("eg", eg_ensemble, "Efficiency Gap", eg_init, "continuous", None),
        ("pb", pb_ensemble, "Partisan Bias", pb_init, "continuous", None),
    ]

    # Save histograms
    for key, ensemble, title, initial_value, kind, xlabel in histogram_specs:
        if kind == "integer":
            plot_options = {
                "bins": range(int(min(ensemble)) - 3, int(max(ensemble)) + 5),
                "xlabel": xlabel,
                "ylabel": "Frequency",
            }
        elif kind == "continuous":
            plot_options = {"bins": 30, "format": True}
        else:
            plot_options = {"xlabel": xlabel, "ylabel": "Frequency"}

        plot_histograms_nb(
            ensemble,
            f"histograms/random_flip_{key}_ensemble_{step}_{pop_tol}_{elec_name}.png",
            title,
            initial_value,
            **plot_options,
        )

    end_time = time.perf_counter()
    print(f"Markov Chain time (steps={step}): {(end_time - start_time) / 60} minutes\n")

Walking the ensemble
Walk complete
Markov Chain time (steps=1000): 0.45683473348617554 minutes

Walking the ensemble
Walk complete
Markov Chain time (steps=5000): 1.9451512018839519 minutes

Walking the ensemble
Walk complete
Markov Chain time (steps=10000): 3.705683883031209 minutes

Walking the ensemble
Walk complete
Markov Chain time (steps=20000): 7.393579097588857 minutes

Walking the ensemble
Walk complete
Markov Chain time (steps=50000): 21.956915350755057 minutes

Walking the ensemble
Walk complete
Markov Chain time (steps=100000): 34.13705266714096 minutes

