# What is the likelihood a U.S. House seat will flip to Democratic?

In [1]:
import pandas as pd
import time
# Input is a start time and an end time
# Output is a print statement giving the time elapsed
def time_elapsed(start, end):
    # Compute time elapsed in seconds
    total_time_seconds = end-start 
    if total_time_seconds < 60:
        print("Total time elapsed =",total_time_seconds, "seconds")
    else:
        # In minutes 
        total_time_minutes = total_time_seconds/60 
        if total_time_minutes < 60: 
            print("Total time elapsed =", total_time_minutes, "minutes") 
        else: 
            # In hours
            total_time_hours = total_time_minutes/60 
            # Print the time elapsed in hours
            print("Total time elapsed =", total_time_hours, "hours")

## Data cleaning

In [2]:
with open("data\\codebook-us-house-1976–2020.md") as f:
    lines = f.readlines()

In [3]:
# Quick-and-dirty way to get the column names, since they seem to be misaligned
columns = [l.replace("###", "").strip() for l in lines if l.startswith("###")]
#print(columns) # for debugging

df = pd.read_csv("data\\1976-2020-house.tab", sep="\t", header=None)

# Choose whatever columns you want to keep; I made these choices with manual inspection
df = df.iloc[:, [0, 1, 7, 12, 15, 16]]
# Sets the name of each column to the ith column in the metadata
df.columns = [columns[i] for i in [0, 1, 7, 11, 14, 15]]
#print(df.columns) # for debugging
df # for debugging

Unnamed: 0,year,state,district,party,candidatevotes,totalvotes
0,1976,ALABAMA,1,DEMOCRAT,58906,157170
1,1976,ALABAMA,1,REPUBLICAN,98257,157170
2,1976,ALABAMA,1,,7,157170
3,1976,ALABAMA,2,DEMOCRAT,66288,156362
4,1976,ALABAMA,2,REPUBLICAN,90069,156362
...,...,...,...,...,...,...
31098,2020,WYOMING,0,DEMOCRAT,66576,278503
31099,2020,WYOMING,0,,1274,278503
31100,2020,WYOMING,0,LIBERTARIAN,10154,278503
31101,2020,WYOMING,0,,6337,278503


In [4]:
# Only keep the elections using Census 2010 districts
df = df[df["year"] >= 2012]
df # for debugging

Unnamed: 0,year,state,district,party,candidatevotes,totalvotes
24053,2012,ALABAMA,1,REPUBLICAN,196374,200676
24054,2012,ALABAMA,1,,4302,200676
24055,2012,ALABAMA,2,REPUBLICAN,180591,283953
24056,2012,ALABAMA,2,DEMOCRAT,103092,283953
24057,2012,ALABAMA,2,,270,283953
...,...,...,...,...,...,...
31098,2020,WYOMING,0,DEMOCRAT,66576,278503
31099,2020,WYOMING,0,,1274,278503
31100,2020,WYOMING,0,LIBERTARIAN,10154,278503
31101,2020,WYOMING,0,,6337,278503


In [5]:
# Add a column for the winner of each race
df["winner"] =  0
#print(df.winner.values.tolist()) # for debugging

# Make lists of the states and years
states = []
[states.append(state) for state in df.state.values.tolist() if state not in states]
years = []
[years.append(year) for year in df.year.values.tolist() if year not in years]

# Function to get the districts for each state
def districts(state):
    district_numbers_repeats = df.loc[(df.year==2012) & (df.state==state)]["district"].values.tolist()
    district_numbers = []
    [district_numbers.append(num) for num in district_numbers_repeats if num not in district_numbers]
    return district_numbers

for state in states:
    #print(state) # for debugging
    #start = time.time() # for debugging
    district_numbers = districts(state)
    #end = time.time() # for debugging
    #time_elapsed(start, end) # for debugging
    for year in years:
        #print(year) # for debugging
        #start = time.time() # for debugging
        for district in district_numbers:
            max_votes = df.loc[(df.year==year) & (df.state==state) & (df.district == district)]["candidatevotes"].max()
            #print("Max votes for", state, year,"district", district, "is", max_votes) # for debugging
            df.loc[(df.year==year) & (df.state==state) & (df.district==district) & (df.candidatevotes==max_votes), "winner"] = 1
        #end = time.time() # for debugging
        #time_elapsed(start, end) # for debugging 
df # for debugging        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["winner"] =  0


Unnamed: 0,year,state,district,party,candidatevotes,totalvotes,winner
24053,2012,ALABAMA,1,REPUBLICAN,196374,200676,1
24054,2012,ALABAMA,1,,4302,200676,0
24055,2012,ALABAMA,2,REPUBLICAN,180591,283953,1
24056,2012,ALABAMA,2,DEMOCRAT,103092,283953,0
24057,2012,ALABAMA,2,,270,283953,0
...,...,...,...,...,...,...,...
31098,2020,WYOMING,0,DEMOCRAT,66576,278503,0
31099,2020,WYOMING,0,,1274,278503,0
31100,2020,WYOMING,0,LIBERTARIAN,10154,278503,0
31101,2020,WYOMING,0,,6337,278503,0


In [6]:
# Need to include elections in which there was no 
# Democrat running before executing the next block
for state in states:
    district_numbers = districts(state)
    for year in years:
       for district in district_numbers:
            if "DEMOCRAT" not in df.loc[(df.year==year) & (df.state==state) & (df.district==district)]["party"].values.tolist():
                total_votes_rows = df.loc[(df.year==year) & (df.state==state) & (df.district==district)]["totalvotes"]
                total_votes = total_votes_rows.iloc[0]
                #print(total_votes) # for debugging
                new_row = {"year":year, "state":state, "district":district, "party":"DEMOCRAT", "candidatevotes":0, "totalvotes":total_votes, "winner":0}
                df.loc[len(df)] = new_row
df = df.sort_values(by=["year", "state", "district"])
df # for debugging

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFram

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFram

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFram

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFram

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df)] = new_row
A value is trying to be set on a copy of a slice from a DataFram

Unnamed: 0,year,state,district,party,candidatevotes,totalvotes,winner
24053,2012,ALABAMA,1,REPUBLICAN,196374,200676,1
24054,2012,ALABAMA,1,,4302,200676,0
7050,2012,ALABAMA,1,DEMOCRAT,0,200676,0
24055,2012,ALABAMA,2,REPUBLICAN,180591,283953,1
24056,2012,ALABAMA,2,DEMOCRAT,103092,283953,0
...,...,...,...,...,...,...,...
31098,2020,WYOMING,0,DEMOCRAT,66576,278503,0
31099,2020,WYOMING,0,,1274,278503,0
31100,2020,WYOMING,0,LIBERTARIAN,10154,278503,0
31101,2020,WYOMING,0,,6337,278503,0


In [7]:
# Only look at the results for Democrats (ignore minor parties & assume R's share is 1-(D fraction))
# This surpresses a warning--maybe this was covered in the bootcamp
df = df[df["party"] == "DEMOCRAT"].copy()
df # for debugging

Unnamed: 0,year,state,district,party,candidatevotes,totalvotes,winner
7050,2012,ALABAMA,1,DEMOCRAT,0,200676,0
24056,2012,ALABAMA,2,DEMOCRAT,103092,283953,0
24058,2012,ALABAMA,3,DEMOCRAT,98141,273930,0
24061,2012,ALABAMA,4,DEMOCRAT,69706,269118,0
24064,2012,ALABAMA,5,DEMOCRAT,101772,291293,0
...,...,...,...,...,...,...,...
31085,2020,WISCONSIN,5,DEMOCRAT,175902,441599,0
31088,2020,WISCONSIN,6,DEMOCRAT,164239,403333,0
31091,2020,WISCONSIN,7,DEMOCRAT,162741,415007,0
31093,2020,WISCONSIN,8,DEMOCRAT,149558,417838,0


In [8]:
# Drop the total number of votes & D votes, and just keep the fraction of D votes
df["fraction"] = df.loc[:, "candidatevotes"] / df.loc[:, "totalvotes"]
df = df.drop(columns=["candidatevotes", "totalvotes"])
df.reset_index(inplace=True, drop=True)
df # for debugging

Unnamed: 0,year,state,district,party,winner,fraction
0,2012,ALABAMA,1,DEMOCRAT,0,0.000000
1,2012,ALABAMA,2,DEMOCRAT,0,0.363060
2,2012,ALABAMA,3,DEMOCRAT,0,0.358270
3,2012,ALABAMA,4,DEMOCRAT,0,0.259016
4,2012,ALABAMA,5,DEMOCRAT,0,0.349380
...,...,...,...,...,...,...
2228,2020,WISCONSIN,5,DEMOCRAT,0,0.398330
2229,2020,WISCONSIN,6,DEMOCRAT,0,0.407204
2230,2020,WISCONSIN,7,DEMOCRAT,0,0.392140
2231,2020,WISCONSIN,8,DEMOCRAT,0,0.357933


In [None]:
# This code chunk and below hasn't been executed yet

# Add a column indicating whether the seat flipped
df["flipped"] = 0
for i in range(len(years)):
    if i >= 1:
        for state in states:
            district_numbers_repeats = df.loc[(df.year==2012) & (df.state==state)]["district"].values.tolist()
            district_numbers = []
            [district_numbers.append(num) for num in district_numbers_repeats if num not in district_numbers]
            for district in district_numbers:
                print(years[i], state, district) # for debugging
                current_outcome = df.loc[(df.year==years[i]) & (df.state==state) & (df.district==district)]["winner"]
                previous_outcome = df.loc[(df.year==years[i-1]) & (df.state==state) & (df.district==district)]["winner"] 
                print("In", years[i], "win was:", current_outcome) # for debugging
                print("Last election, win was:", previous_outcome) # for debugging

In [None]:
## Data exploration

In [None]:
df_pivoted = pd.pivot_table(
    df,
    index=["state", "district", "party"],
    columns="year",
    values="fraction"
)

In [None]:
df_pivoted

In [None]:
# Some districts didn't have elections in both years; drop those & reset the index
df_final = df_pivoted.reset_index().dropna()

In [None]:
df_final

In [None]:
# Ashley: I don't think we need this, 
# but maybe we should find something to include in the data exploration

# For example... what can we say about the average, etc. share of D votes in each state?
#df_final.groupby("state")[2018].describe()