In [1]:
import pandas as pd
import numpy as np
import plotnine as p9
from plotnine import ggplot, aes, facet_grid, labs, geom_point, geom_smooth, coord_flip, scale_color_manual
from sklearn.linear_model import LinearRegression as lm
import statsmodels.tools.tools as sm
from statsmodels.discrete.discrete_model import Probit
from statsmodels.discrete.discrete_model import Logit
import matplotlib.pyplot as plt
import scipy
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
import requests
import os
import glob
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings('ignore')
pd.set_option("display.max_rows", None, "display.max_columns", None)
import seaborn as sns

In [2]:
#load the district files into a single dataframe
dist_path = "/Users/xavier/Desktop/DSPP/solo_projects/redistricting_project/clean_data/full_districts"
all_files = sorted(glob.glob(dist_path + "/*.csv"))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

districts = pd.concat(li, axis=0, ignore_index=False)

In [3]:
districts = districts[districts["year"] != 2022]

In [4]:
#confirm we have data from 9 Congressional Cycles
districts.shape[0] / 435

9.0

In [5]:
red = [2010,2014]
blu = [2006,2008,2018]
districts["wave"] = "Neutral"
districts["wave"] = np.where(districts["year"].isin(red), 'Red', districts["wave"])
districts["wave"] = np.where(districts["year"].isin(blu), 'Blue', districts["wave"])

## First, Recreate in the Binned Model from Previous Work

In [6]:
#split the dataframe into the smoothed bins
districts["bin"] = pd.cut(districts['metric'], 
    [0, .24, .42, .44, .45, .46, .47, .48, .49, .5,
        .51, .52, .53, .54, .57, .7, 1],
    labels=["D+26+","D+25 to D+8","D+7 to D+6","D+5","D+4","D+3","D+2","D+1",
        "EVEN","R+1","R+2","R+3","R+4","R+5 to R+7","R+8 to R+20","R+21+"])

In [7]:
def agg_probs_bin(df):
    '''Aggregate probailites of GOP represenation by bin
    Args:
        a datdistricts with historical election results
    Returns:
        a dataframe aggregating the probabilities of GOP representation
    '''
    sort = df.groupby("bin").mean()
    sort = sort.sort_values(by="metric")
    return pd.DataFrame(sort.is_GOP).reset_index().rename(columns={"is_GOP":"prob_GOP"})

In [8]:
#aggregate probabilites of historical data
prob_gop_bin = agg_probs_bin(districts)

In [9]:
prob_gop_bin

Unnamed: 0,bin,prob_GOP
0,D+26+,0.0
1,D+25 to D+8,0.004449
2,D+7 to D+6,0.044586
3,D+5,0.117647
4,D+4,0.1875
5,D+3,0.253521
6,D+2,0.33871
7,D+1,0.370968
8,EVEN,0.402439
9,R+1,0.591837


In [10]:
#load in the conversion datafile for binned data
converter = pd.read_csv("/Users/xavier/Desktop/DSPP/solo_projects/redistricting_project/processed/metric_converter.csv")

In [11]:
#add in rows to even the distribution 
d = {'metric' : [.85, .86, .87, .88, .89, .9, .91, .92, .93, .94]}
add = pd.DataFrame(data=d)
add["pvi_range"] = "R+21+"
add["prob_GOP"] = 1
converter = pd.concat([converter, add])

In [12]:
#clarify that the estimates are from binned
converter.columns = ["metric","pvi_range","prob_GOP_bin"]

## Next, include raw metrics just for comparison

In [13]:
def agg_probs(df):
    '''Aggregate probailites of GOP represenation by metric
    Args:
        a datdistricts with historical election results
    Returns:
        a dataframe aggregating the probabilities of GOP representation
    '''
    sort = df.groupby("metric").mean()
    sort = sort.sort_values(by="metric")
    return pd.DataFrame(sort.is_GOP).reset_index().rename(columns={"is_GOP":"prob_GOP_all"})

In [14]:
#aggregate probabilites of historical data
prob_gops_metric = agg_probs(districts)
#aggregate on wave years
prob_gops_red = agg_probs(districts[districts["wave"] == "Red"])
prob_gops_blue = agg_probs(districts[districts["wave"] == "Blue"])
#rename columns
prob_gops_red = prob_gops_red.rename(columns={"prob_GOP_raw":"prob_GOP_red"})
prob_gops_blue = prob_gops_blue.rename(columns={"prob_GOP_raw":"prob_GOP_blue"})
#attach associated probabilities
converter = pd.merge(converter,prob_gops_metric,how='left')
converter = pd.merge(converter,prob_gops_red,how='left')
converter = pd.merge(converter,prob_gops_blue,how='left')

In [15]:
#fill in blanks
converter[converter["metric"] < .25] = converter[converter["metric"] < .25].fillna(0)
converter[converter["metric"] > .75] = converter[converter["metric"] > .75].fillna(1)

In [16]:
converter.head()

Unnamed: 0,metric,pvi_range,prob_GOP_bin,prob_GOP_all
0,0.06,D+26+,0.0,0.0
1,0.07,D+26+,0.0,0.0
2,0.08,D+26+,0.0,0.0
3,0.09,D+26+,0.0,0.0
4,0.1,D+26+,0.0,0.0


Now the converter has numeric values for the binned groupings and raw probabilities for each type of wave

# Begin Smoothing Models

Becuase the bins still don't result in smooth relationships between pvi and representation, it's worth creating a smoothed model that will account for the regular variation

In [17]:
X = districts["metric"].to_numpy().reshape(-1, 1)
y = districts["is_GOP"]
X_test = converter["metric"].to_numpy().reshape(-1, 1)

In [18]:
#halt probit

#fit the model with probit
#model = Probit(y, sm.add_constant(X).astype(float))
#prob_mod = model.fit()
#pred_y = prob_mod.predict(sm.add_constant(X_test))
#converter["probit_all"] = pred_y.round(2)
#prob_mod.summary()

In [19]:
#fit the model with logit
model = Logit(y, sm.add_constant(X).astype(float))
log_mod = model.fit()
pred_y = log_mod.predict(sm.add_constant(X_test))
converter["logit_all"] = pred_y.round(2)
#log_mod.summary()

Optimization terminated successfully.
         Current function value: 0.245845
         Iterations 8


In [20]:
#repeat for red waves
df = districts[districts["wave"] == "Red"]
X = df["metric"].to_numpy().reshape(-1, 1)
y = df["is_GOP"]
X_test = converter["metric"].to_numpy().reshape(-1, 1)
#fit the model with logit
model = Logit(y, sm.add_constant(X).astype(float))
log_mod = model.fit()
pred_y = log_mod.predict(sm.add_constant(X_test))
converter["logit_red"] = pred_y.round(2)

Optimization terminated successfully.
         Current function value: 0.181792
         Iterations 9


In [21]:
#repeat for blue waves
df = districts[districts["wave"] == "Red"]
X = df["metric"].to_numpy().reshape(-1, 1)
y = df["is_GOP"]
X_test = converter["metric"].to_numpy().reshape(-1, 1)
#fit the model with logit
model = Logit(y, sm.add_constant(X).astype(float))
log_mod = model.fit()
pred_y = log_mod.predict(sm.add_constant(X_test))
converter["logit_blue"] = pred_y.round(2)

Optimization terminated successfully.
         Current function value: 0.181792
         Iterations 9


In [22]:
converter

Unnamed: 0,metric,pvi_range,prob_GOP_bin,prob_GOP_all,logit_all,logit_red,logit_blue
0,0.06,D+26+,0.0,0.0,0.0,0.0,0.0
1,0.07,D+26+,0.0,0.0,0.0,0.0,0.0
2,0.08,D+26+,0.0,0.0,0.0,0.0,0.0
3,0.09,D+26+,0.0,0.0,0.0,0.0,0.0
4,0.1,D+26+,0.0,0.0,0.0,0.0,0.0
5,0.11,D+26+,0.0,0.0,0.0,0.0,0.0
6,0.12,D+26+,0.0,0.0,0.0,0.0,0.0
7,0.13,D+26+,0.0,0.0,0.0,0.0,0.0
8,0.14,D+26+,0.0,0.0,0.0,0.0,0.0
9,0.15,D+26+,0.0,0.0,0.0,0.0,0.0


In [23]:
converter.to_csv("/Users/xavier/Desktop/DSPP/solo_projects/redistricting_project/processed/converter.csv", index = False)