# preprocessing.ipynb

This notebook is primarily concerned with the merging of three different datasets:

**brookings.csv**

**dem_candidates.csv**

**fec.csv**

In [48]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error


import warnings
warnings.filterwarnings("ignore")

In [2]:
dems = pd.read_csv("data/dem_candidates.csv")

In [3]:
brookings = pd.read_csv("data/brookings.csv")

### How many states were involved in the 2018 primary elections for the House and Senate?

In [4]:
dems["State"].unique() #correct number of states involved without including incumbents

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'GA', 'IA', 'ID', 'IL', 'IN', 'KS',
       'KY', 'MD', 'ME', 'MI', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NJ',
       'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'WA', 'WV'], dtype=object)

In [5]:
brookings["Candidate.State"].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'MA', 'MD', 'ME', 'MI', 'MN',
       'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
       'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
       'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [6]:
len(brookings), len(dems)

(2280, 811)

In [7]:
brookings = brookings[brookings["Candidate.Party"] == "Democrat"]
brookings = brookings[brookings["Incumbency"] == 0]

In [8]:
len(brookings), len(dems)

(973, 811)

In [9]:
brookings.describe()

Unnamed: 0.1,Unnamed: 0,Candidate.District,Incumbency,Female,Democrat,Republican
count,973.0,973.0,973.0,973.0,973.0,973.0
mean,1147.210689,8.892086,0.0,0.329908,1.0,0.0
std,623.668112,10.005845,0.0,0.470421,0.0,0.0
min,1.0,0.0,0.0,0.0,1.0,0.0
25%,642.0,2.0,0.0,0.0,1.0,0.0
50%,1152.0,5.0,0.0,0.0,1.0,0.0
75%,1661.0,12.0,0.0,1.0,1.0,0.0
max,2277.0,50.0,0.0,1.0,1.0,0.0


In [10]:
dems["Candidate"][0:10]

0    Anthony White (Alabama)
1     Christopher Countryman
2      Doug "New Blue" Smith
3            James C. Fields
4              Sue Bell Cobb
5                Walt Maddox
6    Lizzetta Hill McConnell
7         Robert Kennedy Jr.
8       Audri Scott Williams
9              Tabitha Isner
Name: Candidate, dtype: object

### Regularize the name of candidates

In [11]:
def extract_first_last(name):
    name = re.sub(r'\(.*?\)', '', name)
    name = re.sub(r'\".*?\"', '', name)
    parts = name.split()

    if len(parts) >= 2:
        return parts[0] + ' ' + parts[-1]
    elif len(parts) == 1:
        return parts[0]
    else:
        return ''

dems['Candidate'] = dems['Candidate'].apply(extract_first_last)

dems['Candidate']

0               Anthony White
1      Christopher Countryman
2                  Doug Smith
3                James Fields
4                    Sue Cobb
                ...          
806            Talley Sergent
807           Janice Hagerman
808                Paul Davis
809             Richard Ojeda
810              Shirley Love
Name: Candidate, Length: 811, dtype: object

In [12]:
dems['Candidate'] = dems['Candidate'].str.upper()

dems[['Candidate First Name', 'Candidate Last Name']] = dems['Candidate'].str.split(' ', n = 1, expand=True)
dems.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?,No Labels Support?,Candidate First Name,Candidate Last Name
0,ANTHONY WHITE,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,ANTHONY,WHITE
1,CHRISTOPHER COUNTRYMAN,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,CHRISTOPHER,COUNTRYMAN
2,DOUG SMITH,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,DOUG,SMITH
3,JAMES FIELDS,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,JAMES,FIELDS
4,SUE COBB,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,SUE,COBB


In [13]:
brookings['Candidate.First.Name'] = brookings['Candidate.First.Name'].str.upper()
brookings['Candidate.Last.Name'] = brookings['Candidate.Last.Name'].str.upper()

In [14]:
dems_candidates = set(dems['Candidate First Name'] + ' ' + dems['Candidate Last Name'])
brookings_candidates = set(brookings['Candidate.First.Name'] + ' ' + brookings['Candidate.Last.Name'])

missing_in_brookings = dems_candidates - brookings_candidates

missing_in_dems = brookings_candidates - dems_candidates

print(f"Candidates in 'dems' but not in 'brookings': {missing_in_brookings}")
print(f"Candidates in 'brookings' but not in 'dems': {missing_in_dems}")

Candidates in 'dems' but not in 'brookings': {'ANDY MCGUIRE', 'TAWANA CADIEN', 'GRETCHEN WHITMER', 'BOB KRIST', 'JONATHAN EBEL', 'JUAN BRIBIESCA', 'JESSICA KING', 'ALEC ROSS', 'JOHN CHIANG', 'MIKE BARKLEY', 'JOE HOEFFEL', 'VINCENT JENNINGS', 'JOHN NORRIS', 'STEVE SISOLAK', 'LIZZETTA MCCONNELL', 'NATHAN KLEINMAN', 'SID ZELLER', 'MICHAEL JR.', 'RICK TREVI̱O', 'JOSEPH SCHIAVONI', 'THOMAS CARES', 'STEVE LOUGH', 'ALBERT MEZZETTI', 'CARL BREWER', 'ABDUL EL-SAYED', 'DENISE ADAMS', 'DONNA DION', 'K. LAVE', 'KARL DEAN', 'CRAIG FITZHUGH', 'RICHARD CORDRAY', 'PAUL RAY', 'MARGUERITE WILLIS', 'MARGE DOYLE', 'JAMES FIELDS', 'WALT MADDOX', 'CLINT KOBLE', 'PETER DILL', 'AKINYEMI AGBEDE', 'M.J. HEGAR', 'JEFFREY PAYNE', 'ANDREW KIM', 'DOUG SMITH', 'BOBBY MAHENDRA', 'KLEMENT TINAJ', 'LIUBA SHIRLEY', 'TIO HARDIMAN', 'ADAM COTE', 'JOSHUA MAHONY', 'NATE MCMURRAY', 'LAURA KELLY', 'DELAINE EASTIN', 'J.B. PRITZKER', 'ANTHONY WHITE', 'COLEMAN II', 'JOHN ROBERSON', 'JACK JR.', 'RALPH JAFFE', 'GAVIN NEWSOM', 'JOS

### Merge the datasets

*I did an inner and outer merge, but I opted to keep the inner-merged dataset for the sake of having sufficient enough data and not having too many* `NaN` *entries.*

In [15]:
dems['Candidate First Name'] = dems['Candidate First Name'].str.upper()
dems['Candidate Last Name'] = dems['Candidate Last Name'].str.upper()
brookings['Candidate.First.Name'] = brookings['Candidate.First.Name'].str.upper()
brookings['Candidate.Last.Name'] = brookings['Candidate.Last.Name'].str.upper()

#outer merge (combine the datasets even if candidates not present in both
merged_df = pd.merge(dems, brookings, 
                     left_on=['Candidate First Name', 'Candidate Last Name'], 
                     right_on=['Candidate.First.Name', 'Candidate.Last.Name'], 
                     how='outer')

In [16]:
merged_df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Republican,Trump.Mention,Obama.Mention,Sanders.Mention,Clinton.Mention,Special.Counsel.Mention,Travel.Ban.Mention,SinglePayer,Primary.Outcome,Primary.Runoff.Outcome
0,ANTHONY WHITE,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
1,CHRISTOPHER COUNTRYMAN,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
2,DOUG SMITH,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
3,JAMES FIELDS,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
4,SUE COBB,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,


In [17]:
def fill_candidate(row):
    if pd.isna(row['Candidate']):
        if not pd.isna(row['Candidate.First.Name']) and not pd.isna(row['Candidate.Last.Name']):
            return row['Candidate.First.Name'] + ' ' + row['Candidate.Last.Name']
    return row['Candidate']

merged_df['Candidate'] = merged_df.apply(fill_candidate, axis=1)

def fill_first_last_names(row):
    if pd.isna(row['Candidate.First.Name']) or pd.isna(row['Candidate.Last.Name']):
        if not pd.isna(row['Candidate']):
            names = row['Candidate'].split(' ', 1)
            if len(names) == 2:
                row['Candidate.First.Name'], row['Candidate.Last.Name'] = names
            elif len(names) == 1:
                row['Candidate.First.Name'] = names[0]
                row['Candidate.Last.Name'] = ''
    return row

merged_df = merged_df.apply(fill_first_last_names, axis=1)

merged_df.head(3)

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Republican,Trump.Mention,Obama.Mention,Sanders.Mention,Clinton.Mention,Special.Counsel.Mention,Travel.Ban.Mention,SinglePayer,Primary.Outcome,Primary.Runoff.Outcome
0,ANTHONY WHITE,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
1,CHRISTOPHER COUNTRYMAN,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
2,DOUG SMITH,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,


In [18]:
merged_df.columns

Index(['Candidate', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'Primary Runoff Status',
       'General Status', 'Partisan Lean', 'Primary %', 'Won Primary', 'Race',
       'Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?',
       'Obama Alum?', 'Party Support?', 'Emily Endorsed?',
       'Guns Sense Candidate?', 'Biden Endorsed?', 'Warren Endorsed? ',
       'Sanders Endorsed?', 'Our Revolution Endorsed?',
       'Justice Dems Endorsed?', 'PCCC Endorsed?', 'Indivisible Endorsed?',
       'WFP Endorsed?', 'VoteVets Endorsed?', 'No Labels Support?',
       'Candidate First Name', 'Candidate Last Name', 'Unnamed: 0',
       'Candidate.First.Name', 'Candidate.Last.Name', 'Candidate.State',
       'Candidate.District', 'Candidate.Party', 'Incumbent', 'Freshman.Member',
       'Candidate.Website.URL', 'Candidate.Gender', 'Listed.military.service.',
       'Education', 'Marital.Status', 'Previous.Electoral.Experien

**nasty.**

In [19]:
#inner merged dataset (include only entries present in both datasets
inner_merged_df = pd.merge(dems, brookings, 
                           left_on=['Candidate First Name', 'Candidate Last Name'], 
                           right_on=['Candidate.First.Name', 'Candidate.Last.Name'], 
                           how='inner')

inner_merged_df.head(3)

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Republican,Trump.Mention,Obama.Mention,Sanders.Mention,Clinton.Mention,Special.Counsel.Mention,Travel.Ban.Mention,SinglePayer,Primary.Outcome,Primary.Runoff.Outcome
0,AUDRI WILLIAMS,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Lost,,,-33.080002,...,0,NEGATIVE MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,"Yes, candidate supports universal healthcare r...",Loser,
1,TABITHA ISNER,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Advanced,,On the Ballot,-33.080002,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,"Yes, candidate supports universal healthcare r...",Winner,
2,ADIA WINFREY,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Lost,,,-33.66,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Loser,


**Inner merge is much more wholesome.**

### Inner merge FEC data onto Brookings-FiveThirtyEight

In [20]:
fec = pd.read_csv("data/fec.csv")
fec["CAND_NAME"][0:10]

0                SHEIN, DIMITRI
1               YOUNG, DONALD E
2           NELSON, THOMAS JOHN
3                 GALVIN, ALYSE
4           KENNEDY, ROBERT JR.
5      MCCONNELL, LIZZETTA HILL
6        BYRNE, BRADLEY ROBERTS
7    WILLIAMS, AUDRI SCOTT 1955
8            ISNER, TABITHA KAY
9                  ROBY, MARTHA
Name: CAND_NAME, dtype: object

In [21]:
def extract_and_reorder_name(name):
    parts = name.split()[:2] 
    if len(parts) == 2:
        last_name, first_name = parts
        last_name = last_name.replace(',', '')
        return f"{first_name} {last_name}"
    return name  

fec['CAND_NAME'] = fec['CAND_NAME'].apply(extract_and_reorder_name)
fec['CAND_NAME']

0         DIMITRI SHEIN
1          DONALD YOUNG
2         THOMAS NELSON
3          ALYSE GALVIN
4        ROBERT KENNEDY
             ...       
2674    DON BLANKENSHIP
2675       GARY TRAUNER
2676      CHARLES HARDY
2677      JOHN BARRASSO
2678       DAVID DODSON
Name: CAND_NAME, Length: 2679, dtype: object

In [22]:
#filter out incumbents
fec = fec[fec['CAND_ICI'] != 'I']

In [23]:
fec.head()

Unnamed: 0,CAND_ID,CAND_NAME,CAND_ICI,PTY_CD,CAND_PTY_AFFILIATION,TTL_RECEIPTS,TRANS_FROM_AUTH,TTL_DISB,TRANS_TO_AUTH,COH_BOP,...,SPEC_ELECTION,PRIM_ELECTION,RUN_ELECTION,GEN_ELECTION,GEN_ELECTION_PRECENT,OTHER_POL_CMTE_CONTRIB,POL_PTY_CONTRIB,CVG_END_DT,INDIV_REFUNDS,CMTE_REFUNDS
0,H8AK00132,DIMITRI SHEIN,C,1,DEM,209916.04,0.0,209574.16,0.0,0.0,...,,,,,,0.0,0.0,12/31/2018,0.0,0.0
2,H8AK01031,THOMAS NELSON,C,2,REP,9288.48,0.0,8821.97,0.0,0.0,...,,,,,,0.0,0.0,12/31/2018,600.0,0.0
3,H8AK00140,ALYSE GALVIN,C,3,IND,1949643.68,154.7,1943398.59,0.0,0.0,...,,,,,,114833.97,0.0,12/31/2018,8166.36,0.0
4,H8AL01066,ROBERT KENNEDY,C,1,DEM,166845.21,0.0,166845.21,0.0,0.0,...,,,,,,7750.0,0.0,12/31/2018,0.0,0.0
5,H8AL01082,LIZZETTA MCCONNELL,C,1,DEM,5127.0,0.0,6021.0,0.0,0.0,...,,,,,,0.0,0.0,06/30/2018,0.0,0.0


In [24]:
merged_final_df = pd.merge(inner_merged_df, fec, 
                           left_on='Candidate', 
                           right_on='CAND_NAME', 
                           how='inner')
merged_final_df.head(3)

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,SPEC_ELECTION,PRIM_ELECTION,RUN_ELECTION,GEN_ELECTION,GEN_ELECTION_PRECENT,OTHER_POL_CMTE_CONTRIB,POL_PTY_CONTRIB,CVG_END_DT,INDIV_REFUNDS,CMTE_REFUNDS
0,AUDRI WILLIAMS,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Lost,,,-33.080002,...,,,,,,0.0,0.0,06/30/2018,0.0,0.0
1,TABITHA ISNER,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Advanced,,On the Ballot,-33.080002,...,,,,,,8605.0,0.0,12/31/2018,4796.05,1000.0
2,ADIA WINFREY,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Lost,,,-33.66,...,,,,,,0.0,0.0,12/31/2018,203.6,0.0


✨*beautiful*✨

In [25]:
merged_final_df["State"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'GA', 'IA', 'ID', 'IL', 'IN', 'KS',
       'WV', 'KY', 'MD', 'ME', 'MI', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE',
       'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VA', 'WA'], dtype=object)

In [26]:
merged_final_df.columns

Index(['Candidate', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'Primary Runoff Status',
       'General Status', 'Partisan Lean',
       ...
       'SPEC_ELECTION', 'PRIM_ELECTION', 'RUN_ELECTION', 'GEN_ELECTION',
       'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB', 'POL_PTY_CONTRIB',
       'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS'],
      dtype='object', length=111)

In [27]:
merged_final_df.describe()

Unnamed: 0.1,Partisan Lean,Primary %,Unnamed: 0,Candidate.District,Incumbency,Female,Democrat,Republican,PTY_CD,TTL_RECEIPTS,...,CAND_OFFICE_DISTRICT,SPEC_ELECTION,PRIM_ELECTION,RUN_ELECTION,GEN_ELECTION,GEN_ELECTION_PRECENT,OTHER_POL_CMTE_CONTRIB,POL_PTY_CONTRIB,INDIV_REFUNDS,CMTE_REFUNDS
count,450.0,442.0,450.0,450.0,450.0,450.0,450.0,450.0,450.0,450.0,...,450.0,0.0,0.0,0.0,0.0,0.0,450.0,450.0,450.0,450.0
mean,-13.8024,33.834955,1273.515556,9.993333,0.0,0.406667,1.0,0.0,1.0,966679.5,...,9.962222,,,,,,69861.07,1347.047889,14967.993178,647.503267
std,21.227295,27.029356,611.536402,10.556981,0.0,0.491758,0.0,0.0,0.0,2075941.0,...,10.59335,,,,,,165459.0,3543.482181,40894.230152,2399.330667
min,-62.060001,1.04,11.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,,,,,,0.0,0.0,0.0,0.0
25%,-26.56,13.09,775.0,3.0,0.0,0.0,1.0,0.0,1.0,44069.8,...,3.0,,,,,,0.0,0.0,0.0,0.0
50%,-14.81,27.440001,1366.5,6.0,0.0,0.0,1.0,0.0,1.0,177806.6,...,6.0,,,,,,1184.825,0.0,250.0,0.0
75%,-4.18,46.239999,1797.75,13.0,0.0,1.0,1.0,0.0,1.0,869807.5,...,13.0,,,,,,24914.98,969.9375,12256.25,0.0
max,65.089996,100.0,2261.0,50.0,0.0,1.0,1.0,0.0,1.0,26242150.0,...,50.0,,,,,,1271588.0,48296.95,437310.08,22798.0


In [28]:
len(merged_final_df)

450

### Selecting features of significance

At this point, there were 112 total present columns (nasty) and we only want a select few. I cherry-picked the ones below.

In [29]:
filtered_columns = [
    "Candidate", "State", "District", "Office Type", "Race Type", 
    "Race Primary Election Date", "Primary Status", "General Status",
    "CAND_CONTRIB", "OTHER_POL_CMTE_CONTRIB", "POL_PTY_CONTRIB", 
    "INDIV_REFUNDS", "CMTE_REFUNDS", "CVG_END_DT"
]

endorsement_columns = [col for col in merged_final_df.columns if 'Endorsed' in col]
filtered_columns.extend(endorsement_columns)

merged_final_df = merged_final_df[filtered_columns]

In [30]:
merged_final_df.columns

Index(['Candidate', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'General Status',
       'CAND_CONTRIB', 'OTHER_POL_CMTE_CONTRIB', 'POL_PTY_CONTRIB',
       'INDIV_REFUNDS', 'CMTE_REFUNDS', 'CVG_END_DT', 'Emily Endorsed?',
       'Biden Endorsed?', 'Warren Endorsed? ', 'Sanders Endorsed?',
       'Our Revolution Endorsed?', 'Justice Dems Endorsed?', 'PCCC Endorsed?',
       'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?'],
      dtype='object')

In [31]:
merged_final_df.head(3)

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,General Status,CAND_CONTRIB,OTHER_POL_CMTE_CONTRIB,...,Emily Endorsed?,Biden Endorsed?,Warren Endorsed?,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?
0,AUDRI WILLIAMS,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Lost,,2200.0,0.0,...,,,,,,,,,,
1,TABITHA ISNER,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Advanced,On the Ballot,14285.73,8605.0,...,,,,,,,,,,
2,ADIA WINFREY,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Lost,,140.0,0.0,...,,,,,,,,,,


In [32]:
merged_final_df["Biden Endorsed?"].unique() #sanity check

array([nan, 'No', 'Yes'], dtype=object)

## Summary of Brookings-FiveThirtyEight-FEC Dataset

**Candidate:** The name of the candidate.

**State:** The state where the candidate is running.

**District:** The specific district for House candidates.

**Office Type:** Whether the candidate is running for Senate, House, or Governor.

**Race Type:** Type of race (e.g., regular, special).

**Race Primary Election Date:** The date of the primary election.

**Primary Status:** Whether the candidate won or lost the primary.

**General Status:** The candidate's status in the general election.

**INDIV_CONTRIB:** Individual contributions.

**CAND_CONTRIB:** Candidate contributions.

**OTHER_POL_CMTE_CONTRIB:** Other political committee contributions.

**POL_PTY_CONTRIB:** Political party contributions.

**INDIV_REFUNDS:** Refunds to individuals.

**MTE_REFUNDS:** Refunds to committees.

**GEN_ELECTION:** Indicates if the candidate reached the general election.

**GEN_ELECTION_PRECENT:** The percentage of votes in the general election.

*As well as columns related to **endorsements** by various political entities and figures (e.g.: 'Biden Endorsed?', 'Emily Endorsed?', etc.)*


In [70]:
#merged_final_df.to_csv("demsmerge.csv")