In [1]:
import pandas as pd
import numpy as np
import plotnine as p9
from plotnine import ggplot, aes, facet_grid, labs, geom_point, geom_smooth, coord_flip, scale_color_manual
from sklearn.linear_model import LinearRegression as lm
import statsmodels.tools.tools as sm
from statsmodels.discrete.discrete_model import Probit
from statsmodels.discrete.discrete_model import Logit
import matplotlib.pyplot as plt
import scipy
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
import requests
import os
import glob
warnings.filterwarnings('ignore')
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# Load every district CSV in the directory into a single dataframe.
dist_path = "/Users/xavier/Desktop/DSPP/solo_projects/redistricting_project/clean_data/full_districts"
all_files = sorted(glob.glob(dist_path + "/*.csv"))

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the directory is missing or empty.
if not all_files:
    raise FileNotFoundError(f"No district CSV files found in {dist_path}")

# A comprehension keeps the loop variables out of the notebook's global namespace.
li = [pd.read_csv(filename) for filename in all_files]

# ignore_index=True gives the combined frame one unique 0..n-1 index; keeping
# each file's own index would repeat the same labels once per file.
districts = pd.concat(li, axis=0, ignore_index=True)

In [3]:
# Drop the 2022 rows. The explicit .copy() makes `districts` an independent
# frame, so the column assignments in later cells do not go through pandas'
# SettingWithCopy path (whose warning is silenced by the global filter).
districts = districts[districts["year"] != 2022].copy()

In [4]:
# Sanity check: the row count divided by 435 (rows per cycle) should equal
# the number of Congressional cycles in the data (expected: 9).
len(districts) / 435

9.0

In [5]:
# Tag every row with the wave of its election year; years that appear in
# neither list are labelled "Neutral".
red = [2010,2014]
blu = [2006,2008,2018]
in_red_year = districts["year"].isin(red)
in_blu_year = districts["year"].isin(blu)
# The Blue test is the outer one, so it wins if a year were ever in both lists.
districts["wave"] = np.where(in_blu_year, 'Blue', np.where(in_red_year, 'Red', "Neutral"))

## First, Recreate the Binned Model from Previous Work

In [6]:
# Assign every row to a labelled bin according to its metric value.
# pd.cut uses its default right-closed intervals: (0, .24], (.24, .42], ...
bin_edges = [
    0, .24, .42, .44, .45, .46, .47, .48, .49, .5,
    .51, .52, .53, .54, .57, .7, 1,
]
bin_labels = [
    "D+26+", "D+25 to D+8", "D+7 to D+6", "D+5", "D+4", "D+3", "D+2", "D+1",
    "EVEN", "R+1", "R+2", "R+3", "R+4", "R+5 to R+7", "R+8 to R+20", "R+21+",
]
districts["bin"] = pd.cut(districts['metric'], bin_edges, labels=bin_labels)

In [7]:
def agg_probs_bin(df):
    '''Aggregate probabilities of GOP representation by bin.

    Args:
        df: a dataframe of historical election results with "bin", "metric"
            and "is_GOP" columns
    Returns:
        a dataframe with one row per bin, ordered by the bin's mean metric,
        holding the bin label ("bin") and the mean of is_GOP ("prob_GOP")
    '''
    # Average only the two columns that are needed: a frame-wide .mean()
    # raises TypeError on text columns in pandas >= 2.0.
    # observed=False states the long-standing default explicitly, so every
    # category of a categorical "bin" is listed (empty ones as NaN).
    bin_means = df.groupby("bin", observed=False)[["metric", "is_GOP"]].mean()
    bin_means = bin_means.sort_values(by="metric")
    return bin_means[["is_GOP"]].reset_index().rename(columns={"is_GOP": "prob_GOP"})

In [8]:
# Aggregate the historical data into per-bin probabilities of GOP representation.
prob_gop_bin = agg_probs_bin(df=districts)

In [9]:
# Display the per-bin probabilities computed in the previous cell.
prob_gop_bin

Unnamed: 0,bin,prob_GOP
0,D+26+,0.0
1,D+25 to D+8,0.004449
2,D+7 to D+6,0.044586
3,D+5,0.117647
4,D+4,0.1875
5,D+3,0.253521
6,D+2,0.33871
7,D+1,0.370968
8,EVEN,0.402439
9,R+1,0.591837


In [10]:
# Read the conversion file used for the binned data.
converter_path = "/Users/xavier/Desktop/DSPP/solo_projects/redistricting_project/processed/metric_converter.csv"
converter = pd.read_csv(converter_path)

In [11]:
# Relabel the converter's columns (by position) so the probability column is
# explicitly marked as the estimate from the binned model.
converter.columns = [
    "metric",
    "pvi_range",
    "prob_GOP_bin",
]

## Next, include raw metrics just for comparison

In [12]:
def agg_probs(df):
    '''Aggregate probabilities of GOP representation by metric.

    Args:
        df: a dataframe of historical election results with "metric" and
            "is_GOP" columns
    Returns:
        a dataframe with one row per distinct metric value, in ascending
        metric order, holding the value ("metric") and the mean of is_GOP
        ("prob_GOP_raw")
    '''
    # Average only is_GOP: a frame-wide .mean() raises TypeError on text or
    # categorical columns in pandas >= 2.0.
    raw_means = df.groupby("metric")["is_GOP"].mean().sort_index()
    return raw_means.rename("prob_GOP_raw").reset_index()

In [13]:
# Per-metric probabilities of GOP representation from the historical data.
prob_gops_metric = agg_probs(df=districts)
# Attach them to the converter. No key is given, so the merge joins on the
# columns the two frames have in common.
converter = converter.merge(prob_gops_metric)

In [15]:
# Reorder the columns so the raw estimate sits next to the binned one.
converter = converter.loc[:, ["metric", "pvi_range", "prob_GOP_raw", "prob_GOP_bin"]]