In [302]:
import pandas as pd

In [303]:
with open("codebook-us-house-1976–2020.md") as f:
    lines = f.readlines()

In [304]:
# Quick-and-dirty way to get the column names, since they seem to be misaligned
columns = [l.replace("###", "").strip() for l in lines if l.startswith("###")]

df = pd.read_csv("1976-2020-house.tab", sep="\t", header=None)

# Choose whatever columns you want to keep; I made these choices with manual inspection
df = df.iloc[:, [0, 1, 2, 7, 12, 15, 16]]
# Sets the name of each column to the ith column in the metadata
df.columns = [columns[i] for i in [0, 1, 2, 7, 11, 14, 15]]

In [305]:
# Only keep the past two elections
df = df[df["year"] >= 2018]
# Only look at the results for Democrats (ignore minor parties & assume R's share is 1-(D fraction))

In [306]:
# This surpresses a warning--maybe this was covered in the bootcamp
df = df[df["party"] == "DEMOCRAT"].copy()

In [307]:
# Drop the total number of votes & D votes, and just keep the fraction of D votes
df["fraction"] = df.loc[:, "candidatevotes"] / df.loc[:, "totalvotes"]
df = df.drop(columns=["candidatevotes", "totalvotes"])
df.reset_index(inplace=True, drop=True)

In [308]:
df_pivoted = pd.pivot_table(
    df,
    index=["state", "state_po", "district", "party"],
    columns="year",
    values="fraction"
)

In [309]:
df_pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,year,2018,2020
state,state_po,district,party,Unnamed: 4_level_1,Unnamed: 5_level_1
ALABAMA,AL,1,DEMOCRAT,0.367765,0.355387
ALABAMA,AL,2,DEMOCRAT,0.384259,0.346827
ALABAMA,AL,3,DEMOCRAT,0.362184,0.324593
ALABAMA,AL,4,DEMOCRAT,0.201291,0.176830
ALABAMA,AL,5,DEMOCRAT,0.388947,
...,...,...,...,...,...
WISCONSIN,WI,5,DEMOCRAT,0.379878,0.398330
WISCONSIN,WI,6,DEMOCRAT,0.444637,0.407204
WISCONSIN,WI,7,DEMOCRAT,0.385042,0.392140
WISCONSIN,WI,8,DEMOCRAT,0.362757,0.357933


In [313]:
# Some districts didn't have elections in both years; drop those & reset the index
df_final = df_pivoted.reset_index().dropna()

In [314]:
df_final

year,state,state_po,district,party,2018,2020
0,ALABAMA,AL,1,DEMOCRAT,0.367765,0.355387
1,ALABAMA,AL,2,DEMOCRAT,0.384259,0.346827
2,ALABAMA,AL,3,DEMOCRAT,0.362184,0.324593
3,ALABAMA,AL,4,DEMOCRAT,0.201291,0.176830
6,ALABAMA,AL,7,DEMOCRAT,0.978045,0.971640
...,...,...,...,...,...,...
427,WISCONSIN,WI,5,DEMOCRAT,0.379878,0.398330
428,WISCONSIN,WI,6,DEMOCRAT,0.444637,0.407204
429,WISCONSIN,WI,7,DEMOCRAT,0.385042,0.392140
430,WISCONSIN,WI,8,DEMOCRAT,0.362757,0.357933


In [318]:
# For example... what can we say about the average, etc. share of D votes in each state?
df_final.groupby("state")[2018].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ALABAMA,5.0,0.458709,0.299627,0.201291,0.362184,0.367765,0.384259,0.978045
ALASKA,1.0,0.464971,,0.464971,0.464971,0.464971,0.464971,0.464971
ARIZONA,9.0,0.532831,0.160023,0.305091,0.44534,0.538316,0.610856,0.856063
ARKANSAS,3.0,0.365595,0.080513,0.312282,0.319287,0.326293,0.392252,0.458211
CALIFORNIA,52.0,0.643074,0.129673,0.36283,0.532267,0.649776,0.742222,0.890755
COLORADO,7.0,0.529838,0.129406,0.39315,0.414677,0.541032,0.603594,0.738142
CONNECTICUT,5.0,0.586003,0.034686,0.527965,0.579906,0.603951,0.606095,0.6121
DELAWARE,1.0,0.64454,,0.64454,0.64454,0.64454,0.64454,0.64454
FLORIDA,25.0,0.576856,0.234482,0.323544,0.396646,0.508746,0.620237,1.0
GEORGIA,13.0,0.501298,0.230577,0.204945,0.37108,0.422613,0.596493,1.0
