In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path  

In [2]:
# Load the raw candidate table from dem_candidates.csv.
dem_candidates = pd.read_csv(filepath_or_buffer='../datasets/dem_candidates.csv')

In [115]:
#see shape of initial uncleaned data
#dem_candidates.shape

(811, 32)

# cleaning the datasets to prepare for merging

In [2]:
# Keep only House races (rows whose Office Type is 'Representative').
# .copy() gives `house` its own data, so the column assignments in the
# following cells modify an independent frame rather than a slice of
# dem_candidates (the resulting SettingWithCopyWarning is otherwise hidden
# by the warnings filter set at the top of the notebook).
house = dem_candidates[dem_candidates['Office Type'] == 'Representative'].copy()
#house.shape

In [147]:
#seeing how many rows were NAs and using the README to determine the meaning.
#house.isna().sum()

In [3]:
# Columns describing candidate attributes. A null in any of these means no
# website was found for the candidate.
qualities = ['Veteran?', 'LGBTQ?', 'STEM?', 'Race','Obama Alum?', 'Self-Funder?', 'Elected Official?']

# Checked whether rows without a website follow a systematic pattern
# (many of them are in Ohio):
# house[house[qualities].isnull().any(axis=1)]

# Dropping those rows would remove too much of the dataset, so they are kept
# and their missing values are encoded as 0 instead.
#house[house[qualities].isnull().any(axis=1)]['District'].value_counts()

# Binarize: No -> 0, Yes -> 1, then White -> 0, Nonwhite -> 1.
# NaNs become 0 after each mapping step.
house[qualities] = (
    house[qualities]
    .replace({'No': 0, 'Yes': 1})
    .fillna(0)
    .replace({'White': 0, 'Nonwhite': 1})
    .fillna(0)
)


In [119]:
#now dealing with un-updated data for Runoffs. 
#seeing 22 NaNs in General Status and investigating those. Found 22 such rows
#house['General Status'].value_counts(dropna=False)
#house[house['General Status'].isnull()].shape

(22, 32)

In [4]:
"""
Of the resulting 22 rows, 8 are NaN because the runoff election had not happened yet.
"""
# Hand-fill the runoff results that were still missing in the source data.
runoff_winners = ['Kendra Horn','Jason Nichols', 'Tim Gilpin', 'Mary Brannon']
runoff_losers = ['Tom Guild', 'Clay Padgett','Amanda Douglas','Fred Gipson']

is_runoff_winner = house['Candidate'].isin(runoff_winners)
is_runoff_loser = house['Candidate'].isin(runoff_losers)

house.loc[is_runoff_winner, 'General Status'] = 'On the Ballot'
house.loc[is_runoff_winner, 'Primary Runoff Status'] = 'Advanced'
house.loc[is_runoff_loser, 'General Status'] = 'None'

# The remaining NaNs (14 rows) can, upon investigation, be treated as 'None'.
# Use the same 'None' sentinel as above so the column holds a single kind of
# value until it is binarized below.
house['General Status'] = house['General Status'].fillna('None')

# Fail loudly if any missing value survived. A bare expression in the middle
# of a cell is neither displayed nor checked.
assert house['General Status'].notna().all(), "General Status still contains missing values"

# Now that the column is clean, binarize it: 'None' -> 0, 'On the Ballot' -> 1.
house['General Status'] = house['General Status'].replace({'None': 0, 'On the Ballot': 1})

# Drop all special elections. .copy() keeps the later column assignments on an
# independent frame instead of a filtered slice.
house = house[house['Race Type'] != 'Special'].copy()
house.shape

(675, 32)

In [5]:
# Preparing for merging: add a "District Abbrev" column built as
# <State>-<trailing digits of District>.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being an invalid string escape. expand=False returns a Series for the single
# capture group.
# Rows whose District does not end in digits get NaN in dist_num, and therefore
# NaN in District Abbrev.
house['dist_num'] = house['District'].str.extract(r'(\d+)$', expand=False)
house['District Abbrev'] = house['State'] + '-' + house['dist_num']

In [6]:
brookings = pd.read_csv('brookings.csv')

# Keep rows that satisfy all three conditions:
#   - the party label contains 'Democrat' (na=False treats a missing party as
#     "no match" instead of putting NaN into the boolean mask),
#   - Incumbent is null (presumably non-incumbents; confirm against the codebook),
#   - the primary outcome is either 'Winner' or 'Loser'.
keep_row = (
    brookings['Candidate.Party'].str.contains('Democrat', na=False)
    & brookings['Incumbent'].isnull()
    & brookings['Primary.Outcome'].isin(['Winner', 'Loser'])
)
# .copy() so the new columns below are added to an independent frame.
brookings = brookings[keep_row].copy()

# Build the join keys: full candidate name and "<State>-<District>".
brookings['Candidate'] = brookings['Candidate.First.Name'] + ' ' + brookings['Candidate.Last.Name']
brookings['District'] = brookings['Candidate.State'] + '-' + brookings['Candidate.District'].astype(str)
brookings = brookings.drop('Unnamed: 0', axis=1)

### Adding new columns

In [7]:
def count_candidates_by_district(df, district_column, new_column_name):
    """Return a copy of ``df`` with a per-district row count added.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per candidate.
    district_column : str
        Name of the column that identifies the district.
    new_column_name : str
        Name of the count column to add.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) produced by an inner merge of the
        rows with the per-district counts, so the index is reset. Rows whose
        district is NaN do not appear in the grouped counts and are therefore
        dropped by the merge.

    Notes
    -----
    If ``new_column_name`` already exists in ``df`` (for example because the
    notebook cell was run twice), the stale column is replaced. Without this,
    the merge would emit ``<name>_x`` / ``<name>_y`` columns and later lookups
    of ``new_column_name`` would fail.
    """
    # Discard a previous count column so the function is safe to re-run.
    base = df.drop(columns=[new_column_name], errors='ignore')

    # Number of rows (candidates) in each district.
    district_counts = (
        base.groupby(district_column)
        .size()
        .reset_index(name=new_column_name)
    )

    # Attach the counts back onto every candidate row.
    return base.merge(district_counts, on=district_column)


# Add 'total_runners': the number of candidates in each House race.
house = count_candidates_by_district(
    df=house,
    district_column='District',
    new_column_name='total_runners',
)


In [8]:
# Keep only contested races (more than one candidate), then save the result.
has_opponent = house['total_runners'].gt(1)
house = house.loc[has_opponent]
house.to_csv(Path('house_cleaned.csv'))

In [17]:
# Dimensions of `house` after dropping single-candidate races.
house.shape

(633, 35)

In [10]:
# Attach the per-district candidate count to the Brookings frame, then keep
# only districts where more than one candidate ran.
brookings = count_candidates_by_district(
    df=brookings,
    district_column='District',
    new_column_name='total_runners',
)
brookings = brookings.loc[brookings['total_runners'].gt(1)]

In [11]:
# Snapshot of the filtered Brookings frame. The same file is written again in a
# later cell, after Primary.Outcome is binarized and Candidate is lowercased.
brookings.to_csv(Path('brookings_to_join.csv'))

In [12]:
# Inspect the column names of the filtered Brookings frame.
brookings.columns

Index(['Candidate.First.Name', 'Candidate.Last.Name', 'Candidate.State',
       'Candidate.District', 'Candidate.Party', 'Incumbent', 'Freshman.Member',
       'Candidate.Website.URL', 'Candidate.Gender', 'Listed.military.service.',
       'Education', 'Marital.Status', 'Previous.Electoral.Experience',
       'Position.on.Affordable.Care.Act..ObamaCare.',
       'Position.on.Minimum.Wage', 'Position.on.Federal.Taxes',
       'Position.on.Business.Regulations', 'Position.on.National.Debt.Deficit',
       'Position.on.Social.Security', 'Position.on.Gun.Control',
       'Position.on.Immigration', 'Position.on.Abortion',
       'Position.on.Same.Sex.Marriage', 'Position.on.Criminal.Justice.Reform',
       'Position.on.Federal.K.12.Education.Policy',
       'Position.on.Climate.Change', 'Position.on.Campaign.Finance.Reform',
       'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
       'Position.on.Defense.Spending', 'Position.on.Handling.Terrorism.Abroad',
       'Positio

Binarizing Brookings Primary Outcome

In [18]:
# Encode the primary result as an integer flag: Loser -> 0, Winner -> 1.
outcome_codes = {'Loser': 0, 'Winner': 1}
brookings['Primary.Outcome'] = brookings['Primary.Outcome'].replace(outcome_codes)

In [19]:
# Class balance of the binarized primary outcome (0 = Loser, 1 = Winner).
brookings['Primary.Outcome'].value_counts()

0    627
1    183
Name: Primary.Outcome, dtype: int64

In [16]:
# Class balance of General Status; dropna=False would surface any NaN left over.
house['General Status'].value_counts(dropna=False)

0    463
1    170
Name: General Status, dtype: int64

In [23]:
def lowercase_column(df, column_name):
    """Lowercase the values of ``column_name`` in ``df`` and return ``df``.

    The column is overwritten on the frame that was passed in, and that same
    object is returned. Non-null values are converted to ``str`` and
    lowercased. Missing values stay missing instead of becoming the literal
    string ``'nan'``, which would otherwise look like a real (and mutually
    matching) name in a later name-based join.

    If ``column_name`` is not a column of ``df``, a message is printed and
    ``df`` is returned unchanged.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in DataFrame.")
        return df

    original = df[column_name]
    # where() keeps the lowercased text only where the source value was present.
    df[column_name] = original.astype(str).str.lower().where(original.notna())
    return df
# Lowercase the candidate names in both frames (Brookings first, then House).
brookings, house = (
    lowercase_column(frame, 'Candidate') for frame in (brookings, house)
)

# we can merge on 
- brookings['Primary.Outcome'] == house['General Status']
- brookings['District'] == house['District Abbrev']
- brookings['Candidate'] fuzzy-matched against house['Candidate'] at a 60 percent similarity threshold (this should be enough as long as the other two conditions also match)


Exporting both frames to CSV so they can be merged. The merging code is in Merging Datasets.

In [24]:
# Write both prepared frames to disk for the merge step (Brookings first).
for frame, filename in (
    (brookings, 'brookings_to_join.csv'),
    (house, 'house_to_join.csv'),
):
    frame.to_csv(Path(filename))

In [12]:
# brookings = pd.read_csv('candidatesdatafinal2018.csv')
# brookings['Candidate'] = brookings['Candidate.First.Name'] + ' ' + brookings['Candidate.Last.Name']
# brookings['District'] = brookings['Candidate.State'] + '-' + brookings['Candidate.District'].astype(str)
# brookings

In [32]:
# merged_df = pd.merge(brookings, house, left_on=['Candidate', 'District'], right_on=['Candidate', 'District Abbrev'], how='inner')
# merged_df
#merged_df.drop(columns =['Unnamed: 0_x', 'Candidate.First.Name', 'Candidate.Last.Name','Candidate.State', 'Candidate.District', 'Candidate.Gender','Listed.military.service.', 'Party.Category.1','Incumbency',  'Democrat', 'Republican', 'District_x','Unnamed: 0_y', 'State', 'District_y','dist_num'])

In [33]:
# # Assuming you have a list of endorsement column names
# endorsement_columns = endorsement_cols

# def endorsement_effect(data):
#     # Calculating the 'Partisan Lean' percentiles and bins
#     percentiles = data['Partisan Lean'].quantile([0.25, 0.5, 0.75]).tolist()
#     data['Partisan Lean Percentile Bins'] = pd.cut(data['Partisan Lean'], bins=[-np.inf] + percentiles + [np.inf], labels=['0-25%', '26-50%', '51-75%', '76-100%'])

#     # Looping through each endorsement column
#     for endorsement in endorsement_columns:
#         plt.figure(figsize=(12, 8))
#         for i, bin in enumerate(data['Partisan Lean Percentile Bins'].cat.categories, start=1):
#             bin_data = data[data['Partisan Lean Percentile Bins'] == bin]

#             # Calculating success rate for each endorsement status within the bin
#             success_rate_endorsed = bin_data[bin_data[endorsement] == 1]['General Status'].mean()
#             success_rate_not_endorsed = bin_data[bin_data[endorsement] == 0]['General Status'].mean()

#             plt.subplot(2, 2, i)
#             sns.barplot(x=[f'{endorsement[:-1]}', f'Not {endorsement[:-1]}'], y=[success_rate_endorsed, success_rate_not_endorsed])
#             plt.title(f'Success Rate in Partisan Lean Percentile Bin: {bin}')
#             plt.ylabel('Success Rate')

#         plt.suptitle(f'Success Rate Analysis for {endorsement}')
#         plt.tight_layout()
#         plt.show()

In [17]:
# endorsement_effect(house)