In [40]:
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

In [2]:
dems = pd.read_csv("dem_candidates.csv")

In [3]:
brookings = pd.read_csv("brookings.csv")

In [4]:
dems.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Biden Endorsed?,Warren Endorsed?,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?,No Labels Support?
0,Anthony White (Alabama),AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
1,Christopher Countryman,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
2,"Doug ""New Blue"" Smith",AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
3,James C. Fields,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,
4,Sue Bell Cobb,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,,


### How many states were involved in the 2018 primary elections for the House and Senate?

In [5]:
dems["State"].unique() #correct number of states involved without including incumbents

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'GA', 'IA', 'ID', 'IL', 'IN', 'KS',
       'KY', 'MD', 'ME', 'MI', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NJ',
       'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'WA', 'WV'], dtype=object)

In [None]:
brookings.head()

In [61]:
brookings["Candidate.State"].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'MA', 'MD', 'ME', 'MI', 'MN',
       'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
       'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
       'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [8]:
len(brookings), len(dems)

(2280, 811)

In [9]:
brookings = brookings[brookings["Candidate.Party"] == "Democrat"]
brookings = brookings[brookings["Incumbency"] == 0]

In [10]:
len(brookings), len(dems)

(973, 811)

In [11]:
brookings.describe()

Unnamed: 0.1,Unnamed: 0,Candidate.District,Incumbency,Female,Democrat,Republican
count,973.0,973.0,973.0,973.0,973.0,973.0
mean,1147.210689,8.892086,0.0,0.329908,1.0,0.0
std,623.668112,10.005845,0.0,0.470421,0.0,0.0
min,1.0,0.0,0.0,0.0,1.0,0.0
25%,642.0,2.0,0.0,0.0,1.0,0.0
50%,1152.0,5.0,0.0,0.0,1.0,0.0
75%,1661.0,12.0,0.0,1.0,1.0,0.0
max,2277.0,50.0,0.0,1.0,1.0,0.0


In [14]:
dems["Candidate"][0:30]

0     Anthony White (Alabama)
1      Christopher Countryman
2       Doug "New Blue" Smith
3             James C. Fields
4               Sue Bell Cobb
5                 Walt Maddox
6     Lizzetta Hill McConnell
7          Robert Kennedy Jr.
8        Audri Scott Williams
9               Tabitha Isner
10     Adia McClellan Winfrey
11              Mallory Hagan
12                  Lee Auman
13             Rick Neighbors
14             Peter Joffrion
15               Danner Kline
16            Jared Henderson
17            Leticia Sanders
18              Chintan Desai
19              Clarke Tucker
20                 Gwen Combs
21           Jonathan Dunkley
22               Paul Spencer
23              Joshua Mahony
24              Hayden Shamel
25          Brianna Westbrook
26            Hiral Tipirneni
27            Akinyemi Agbede
28     Albert Caesar Mezzetti
29            Amanda Renteria
Name: Candidate, dtype: object

### Regularize the name of candidates

In [15]:
def extract_first_last(name):
    name = re.sub(r'\(.*?\)', '', name)
    name = re.sub(r'\".*?\"', '', name)
    parts = name.split()

    if len(parts) >= 2:
        return parts[0] + ' ' + parts[-1]
    elif len(parts) == 1:
        return parts[0]
    else:
        return ''

dems['Candidate'] = dems['Candidate'].apply(extract_first_last)

dems['Candidate']

0               Anthony White
1      Christopher Countryman
2                  Doug Smith
3                James Fields
4                    Sue Cobb
                ...          
806            Talley Sergent
807           Janice Hagerman
808                Paul Davis
809             Richard Ojeda
810              Shirley Love
Name: Candidate, Length: 811, dtype: object

In [16]:
dems['Candidate'] = dems['Candidate'].str.upper()

dems[['Candidate First Name', 'Candidate Last Name']] = dems['Candidate'].str.split(' ', n = 1, expand=True)
dems

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?,No Labels Support?,Candidate First Name,Candidate Last Name
0,ANTHONY WHITE,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,ANTHONY,WHITE
1,CHRISTOPHER COUNTRYMAN,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,CHRISTOPHER,COUNTRYMAN
2,DOUG SMITH,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,DOUG,SMITH
3,JAMES FIELDS,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,JAMES,FIELDS
4,SUE COBB,AL,Governor of Alabama,Governor,Regular,6/5/18,Lost,,,-28.879999,...,,,,,,,,,SUE,COBB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,TALLEY SERGENT,WV,U.S. House West Virginia District 2,Representative,Regular,5/8/18,Advanced,,On the Ballot,-35.330002,...,,,,,,,No,,TALLEY,SERGENT
807,JANICE HAGERMAN,WV,U.S. House West Virginia District 3,Representative,Regular,5/8/18,Lost,,,-47.480000,...,,,,,,No,No,,JANICE,HAGERMAN
808,PAUL DAVIS,WV,U.S. House West Virginia District 3,Representative,Regular,5/8/18,Lost,,,-47.480000,...,,,,,,No,No,,PAUL,DAVIS
809,RICHARD OJEDA,WV,U.S. House West Virginia District 3,Representative,Regular,5/8/18,Advanced,,On the Ballot,-47.480000,...,,,,,,Yes,Yes,,RICHARD,OJEDA


In [17]:
brookings['Candidate.First.Name'] = brookings['Candidate.First.Name'].str.upper()
brookings['Candidate.Last.Name'] = brookings['Candidate.Last.Name'].str.upper()

In [None]:
dems_candidates = set(dems['Candidate First Name'] + ' ' + dems['Candidate Last Name'])
brookings_candidates = set(brookings['Candidate.First.Name'] + ' ' + brookings['Candidate.Last.Name'])

missing_in_brookings = dems_candidates - brookings_candidates

missing_in_dems = brookings_candidates - dems_candidates

print(f"Candidates in 'dems' but not in 'brookings': {missing_in_brookings}")
print(f"Candidates in 'brookings' but not in 'dems': {missing_in_dems}")

### Merge the datasets

*I did an inner and outer merge, but I opted to keep the inner-merged dataset for the sake of having enough data and not having too many* `NaN` *entries.*

In [19]:
dems['Candidate First Name'] = dems['Candidate First Name'].str.upper()
dems['Candidate Last Name'] = dems['Candidate Last Name'].str.upper()
brookings['Candidate.First.Name'] = brookings['Candidate.First.Name'].str.upper()
brookings['Candidate.Last.Name'] = brookings['Candidate.Last.Name'].str.upper()

#outer merge (combine the datasets even if candidates not present in both
merged_df = pd.merge(dems, brookings, 
                     left_on=['Candidate First Name', 'Candidate Last Name'], 
                     right_on=['Candidate.First.Name', 'Candidate.Last.Name'], 
                     how='outer')

In [None]:
merged_df

In [None]:
def fill_candidate(row):
    if pd.isna(row['Candidate']):
        if not pd.isna(row['Candidate.First.Name']) and not pd.isna(row['Candidate.Last.Name']):
            return row['Candidate.First.Name'] + ' ' + row['Candidate.Last.Name']
    return row['Candidate']

merged_df['Candidate'] = merged_df.apply(fill_candidate, axis=1)

def fill_first_last_names(row):
    if pd.isna(row['Candidate.First.Name']) or pd.isna(row['Candidate.Last.Name']):
        if not pd.isna(row['Candidate']):
            names = row['Candidate'].split(' ', 1)
            if len(names) == 2:
                row['Candidate.First.Name'], row['Candidate.Last.Name'] = names
            elif len(names) == 1:
                row['Candidate.First.Name'] = names[0]
                row['Candidate.Last.Name'] = ''
    return row

merged_df = merged_df.apply(fill_first_last_names, axis=1)

merged_df

In [22]:
merged_df.columns

Index(['Candidate', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'Primary Runoff Status',
       'General Status', 'Partisan Lean', 'Primary %', 'Won Primary', 'Race',
       'Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?',
       'Obama Alum?', 'Party Support?', 'Emily Endorsed?',
       'Guns Sense Candidate?', 'Biden Endorsed?', 'Warren Endorsed? ',
       'Sanders Endorsed?', 'Our Revolution Endorsed?',
       'Justice Dems Endorsed?', 'PCCC Endorsed?', 'Indivisible Endorsed?',
       'WFP Endorsed?', 'VoteVets Endorsed?', 'No Labels Support?',
       'Candidate First Name', 'Candidate Last Name', 'Unnamed: 0',
       'Candidate.First.Name', 'Candidate.Last.Name', 'Candidate.State',
       'Candidate.District', 'Candidate.Party', 'Incumbent', 'Freshman.Member',
       'Candidate.Website.URL', 'Candidate.Gender', 'Listed.military.service.',
       'Education', 'Marital.Status', 'Previous.Electoral.Experien

**nasty.**

In [23]:
#inner merged dataset (include only entries present in both datasets
inner_merged_df = pd.merge(dems, brookings, 
                           left_on=['Candidate First Name', 'Candidate Last Name'], 
                           right_on=['Candidate.First.Name', 'Candidate.Last.Name'], 
                           how='inner')

inner_merged_df

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Republican,Trump.Mention,Obama.Mention,Sanders.Mention,Clinton.Mention,Special.Counsel.Mention,Travel.Ban.Mention,SinglePayer,Primary.Outcome,Primary.Runoff.Outcome
0,AUDRI WILLIAMS,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Lost,,,-33.080002,...,0,NEGATIVE MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,"Yes, candidate supports universal healthcare r...",Loser,
1,TABITHA ISNER,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Advanced,,On the Ballot,-33.080002,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,"Yes, candidate supports universal healthcare r...",Winner,
2,ADIA WINFREY,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Lost,,,-33.660000,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Loser,
3,MALLORY HAGAN,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Advanced,,On the Ballot,-33.660000,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Winner,
4,LEE AUMAN,AL,U.S. House Alabama District 4,Representative,Regular,6/5/18,Advanced,,On the Ballot,-62.480000,...,0,NEUTRAL MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Winner,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,AARON SCHEINBERG,WV,U.S. House West Virginia District 2,Representative,Regular,5/8/18,Lost,,,-35.330002,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Loser,
628,TALLEY SERGENT,WV,U.S. House West Virginia District 2,Representative,Regular,5/8/18,Advanced,,On the Ballot,-35.330002,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Winner,
629,JANICE HAGERMAN,WV,U.S. House West Virginia District 3,Representative,Regular,5/8/18,Lost,,,-47.480000,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Loser,
630,RICHARD OJEDA,WV,U.S. House West Virginia District 3,Representative,Regular,5/8/18,Advanced,,On the Ballot,-47.480000,...,0,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,NO MENTION,,Winner,


**Inner merge is much more wholesome.**

### Inner merge FEC data onto Brookings-FiveThirtyEight

In [24]:
fec = pd.read_csv("fec.csv")
fec["CAND_NAME"][0:10]

0                SHEIN, DIMITRI
1               YOUNG, DONALD E
2           NELSON, THOMAS JOHN
3                 GALVIN, ALYSE
4           KENNEDY, ROBERT JR.
5      MCCONNELL, LIZZETTA HILL
6        BYRNE, BRADLEY ROBERTS
7    WILLIAMS, AUDRI SCOTT 1955
8            ISNER, TABITHA KAY
9                  ROBY, MARTHA
Name: CAND_NAME, dtype: object

In [25]:
def extract_and_reorder_name(name):
    parts = name.split()[:2] 
    if len(parts) == 2:
        last_name, first_name = parts
        last_name = last_name.replace(',', '')
        return f"{first_name} {last_name}"
    return name  

fec['CAND_NAME'] = fec['CAND_NAME'].apply(extract_and_reorder_name)
fec['CAND_NAME']

0         DIMITRI SHEIN
1          DONALD YOUNG
2         THOMAS NELSON
3          ALYSE GALVIN
4        ROBERT KENNEDY
             ...       
2674    DON BLANKENSHIP
2675       GARY TRAUNER
2676      CHARLES HARDY
2677      JOHN BARRASSO
2678       DAVID DODSON
Name: CAND_NAME, Length: 2679, dtype: object

In [26]:
#filter out incumbents
fec = fec[fec['CAND_ICI'] != 'I']

In [27]:
fec

Unnamed: 0,CAND_ID,CAND_NAME,CAND_ICI,PTY_CD,CAND_PTY_AFFILIATION,TTL_RECEIPTS,TRANS_FROM_AUTH,TTL_DISB,TRANS_TO_AUTH,COH_BOP,...,SPEC_ELECTION,PRIM_ELECTION,RUN_ELECTION,GEN_ELECTION,GEN_ELECTION_PRECENT,OTHER_POL_CMTE_CONTRIB,POL_PTY_CONTRIB,CVG_END_DT,INDIV_REFUNDS,CMTE_REFUNDS
0,H8AK00132,DIMITRI SHEIN,C,1,DEM,209916.04,0.00,209574.16,0.0,0.0,...,,,,,,0.00,0.00,12/31/2018,0.00,0.0
2,H8AK01031,THOMAS NELSON,C,2,REP,9288.48,0.00,8821.97,0.0,0.0,...,,,,,,0.00,0.00,12/31/2018,600.00,0.0
3,H8AK00140,ALYSE GALVIN,C,3,IND,1949643.68,154.70,1943398.59,0.0,0.0,...,,,,,,114833.97,0.00,12/31/2018,8166.36,0.0
4,H8AL01066,ROBERT KENNEDY,C,1,DEM,166845.21,0.00,166845.21,0.0,0.0,...,,,,,,7750.00,0.00,12/31/2018,0.00,0.0
5,H8AL01082,LIZZETTA MCCONNELL,C,1,DEM,5127.00,0.00,6021.00,0.0,0.0,...,,,,,,0.00,0.00,06/30/2018,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,S8WV00143,PATRICK MORRISEY,C,2,REP,6200876.51,568624.22,6153109.25,0.0,0.0,...,,,,,,670363.00,54477.64,12/31/2018,57800.00,0.0
2674,S8WV00150,DON BLANKENSHIP,C,3,CON,4165210.09,0.00,4165209.19,0.0,0.0,...,,,,,,0.00,0.00,10/17/2018,0.00,0.0
2675,S8WY00189,GARY TRAUNER,C,1,DEM,910722.70,0.00,910723.29,0.0,0.0,...,,,,,,46000.00,0.00,12/07/2018,10572.32,0.0
2676,S4WY00097,CHARLES HARDY,C,2,REP,9058.00,0.00,8891.00,0.0,0.0,...,,,,,,0.00,0.00,12/31/2018,0.00,0.0


In [28]:
merged_final_df = pd.merge(inner_merged_df, fec, 
                           left_on='Candidate', 
                           right_on='CAND_NAME', 
                           how='inner')
merged_final_df

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,SPEC_ELECTION,PRIM_ELECTION,RUN_ELECTION,GEN_ELECTION,GEN_ELECTION_PRECENT,OTHER_POL_CMTE_CONTRIB,POL_PTY_CONTRIB,CVG_END_DT,INDIV_REFUNDS,CMTE_REFUNDS
0,AUDRI WILLIAMS,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Lost,,,-33.080002,...,,,,,,0.00,0.0,06/30/2018,0.00,0.0
1,TABITHA ISNER,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Advanced,,On the Ballot,-33.080002,...,,,,,,8605.00,0.0,12/31/2018,4796.05,1000.0
2,ADIA WINFREY,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Lost,,,-33.660000,...,,,,,,0.00,0.0,12/31/2018,203.60,0.0
3,MALLORY HAGAN,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Advanced,,On the Ballot,-33.660000,...,,,,,,30500.00,500.0,12/31/2018,3935.43,0.0
4,PETER JOFFRION,AL,U.S. House Alabama District 5,Representative,Regular,6/5/18,Advanced,,On the Ballot,-34.830002,...,,,,,,29823.41,0.0,12/31/2018,5670.30,4000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,AARON SCHEINBERG,WV,U.S. House West Virginia District 2,Representative,Regular,5/8/18,Lost,,,-35.330002,...,,,,,,11794.18,0.0,12/31/2018,1500.00,0.0
446,TALLEY SERGENT,WV,U.S. House West Virginia District 2,Representative,Regular,5/8/18,Advanced,,On the Ballot,-35.330002,...,,,,,,90850.00,0.0,12/31/2018,9595.00,0.0
447,RICHARD OJEDA,WV,U.S. House West Virginia District 3,Representative,Regular,5/8/18,Advanced,,On the Ballot,-47.480000,...,,,,,,429886.46,5200.0,12/31/2018,500.00,1000.0
448,RICHARD OJEDA,WV,U.S. House West Virginia District 3,Representative,Regular,5/8/18,Advanced,,On the Ballot,-47.480000,...,,,,,,0.00,0.0,12/31/2018,230.00,0.0


✨*beautiful*✨

In [29]:
merged_final_df["State"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'GA', 'IA', 'ID', 'IL', 'IN', 'KS',
       'WV', 'KY', 'MD', 'ME', 'MI', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE',
       'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VA', 'WA'], dtype=object)

In [30]:
merged_final_df.columns

Index(['Candidate', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'Primary Runoff Status',
       'General Status', 'Partisan Lean',
       ...
       'SPEC_ELECTION', 'PRIM_ELECTION', 'RUN_ELECTION', 'GEN_ELECTION',
       'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB', 'POL_PTY_CONTRIB',
       'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS'],
      dtype='object', length=111)

In [31]:
merged_final_df.describe()

Unnamed: 0.1,Partisan Lean,Primary %,Unnamed: 0,Candidate.District,Incumbency,Female,Democrat,Republican,PTY_CD,TTL_RECEIPTS,...,CAND_OFFICE_DISTRICT,SPEC_ELECTION,PRIM_ELECTION,RUN_ELECTION,GEN_ELECTION,GEN_ELECTION_PRECENT,OTHER_POL_CMTE_CONTRIB,POL_PTY_CONTRIB,INDIV_REFUNDS,CMTE_REFUNDS
count,450.0,442.0,450.0,450.0,450.0,450.0,450.0,450.0,450.0,450.0,...,450.0,0.0,0.0,0.0,0.0,0.0,450.0,450.0,450.0,450.0
mean,-13.8024,33.834955,1273.515556,9.993333,0.0,0.406667,1.0,0.0,1.0,966679.5,...,9.962222,,,,,,69861.07,1347.047889,14967.993178,647.503267
std,21.227295,27.029356,611.536402,10.556981,0.0,0.491758,0.0,0.0,0.0,2075941.0,...,10.59335,,,,,,165459.0,3543.482181,40894.230152,2399.330667
min,-62.060001,1.04,11.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,,,,,,0.0,0.0,0.0,0.0
25%,-26.56,13.09,775.0,3.0,0.0,0.0,1.0,0.0,1.0,44069.8,...,3.0,,,,,,0.0,0.0,0.0,0.0
50%,-14.81,27.440001,1366.5,6.0,0.0,0.0,1.0,0.0,1.0,177806.6,...,6.0,,,,,,1184.825,0.0,250.0,0.0
75%,-4.18,46.239999,1797.75,13.0,0.0,1.0,1.0,0.0,1.0,869807.5,...,13.0,,,,,,24914.98,969.9375,12256.25,0.0
max,65.089996,100.0,2261.0,50.0,0.0,1.0,1.0,0.0,1.0,26242150.0,...,50.0,,,,,,1271588.0,48296.95,437310.08,22798.0


In [32]:
len(merged_final_df)

450

### Selecting features of significance

At this point, there were 112 total present columns (nasty) and we only want a select few. I cherry-picked the ones below.

In [33]:
filtered_columns = [
    "Candidate", "State", "District", "Office Type", "Race Type", 
    "Race Primary Election Date", "Primary Status", "General Status",
    "CAND_CONTRIB", "OTHER_POL_CMTE_CONTRIB", "POL_PTY_CONTRIB", 
    "INDIV_REFUNDS", "CMTE_REFUNDS", "CVG_END_DT"
]

endorsement_columns = [col for col in merged_final_df.columns if 'Endorsed' in col]
filtered_columns.extend(endorsement_columns)

merged_final_df = merged_final_df[filtered_columns]

In [34]:
merged_final_df.columns

Index(['Candidate', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'General Status',
       'CAND_CONTRIB', 'OTHER_POL_CMTE_CONTRIB', 'POL_PTY_CONTRIB',
       'INDIV_REFUNDS', 'CMTE_REFUNDS', 'CVG_END_DT', 'Emily Endorsed?',
       'Biden Endorsed?', 'Warren Endorsed? ', 'Sanders Endorsed?',
       'Our Revolution Endorsed?', 'Justice Dems Endorsed?', 'PCCC Endorsed?',
       'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?'],
      dtype='object')

In [35]:
merged_final_df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,General Status,CAND_CONTRIB,OTHER_POL_CMTE_CONTRIB,...,Emily Endorsed?,Biden Endorsed?,Warren Endorsed?,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?
0,AUDRI WILLIAMS,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Lost,,2200.0,0.0,...,,,,,,,,,,
1,TABITHA ISNER,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,Advanced,On the Ballot,14285.73,8605.0,...,,,,,,,,,,
2,ADIA WINFREY,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Lost,,140.0,0.0,...,,,,,,,,,,
3,MALLORY HAGAN,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,Advanced,On the Ballot,0.0,30500.0,...,,,,,,,,,,
4,PETER JOFFRION,AL,U.S. House Alabama District 5,Representative,Regular,6/5/18,Advanced,On the Ballot,1156.51,29823.41,...,,,,,,,,Yes,,


In [42]:
merged_final_df["Biden Endorsed?"].unique()

array([nan, 'No', 'Yes'], dtype=object)

## Summary of Brookings-FiveThirtyEight-FEC Dataset

**Candidate:** The name of the candidate.

**State:** The state where the candidate is running.

**District:** The specific district for House candidates.

**Office Type:** Whether the candidate is running for Senate, House, or Governor.

**Race Type:** Type of race (e.g., regular, special).

**Race Primary Election Date:** The date of the primary election.

**Primary Status:** Whether the candidate won or lost the primary.

**General Status:** The candidate's status in the general election.

**INDIV_CONTRIB:** Individual contributions.

**CAND_CONTRIB:** Candidate contributions.

**OTHER_POL_CMTE_CONTRIB:** Other political committee contributions.

**POL_PTY_CONTRIB:** Political party contributions.

**INDIV_REFUNDS:** Refunds to individuals.

**MTE_REFUNDS:** Refunds to committees.

**GEN_ELECTION:** Indicates if the candidate reached the general election.

**GEN_ELECTION_PRECENT:** The percentage of votes in the general election.

*As well as columns related to **endorsements** by various political entities and figures (e.g.: 'Biden Endorsed?', 'Emily Endorsed?', etc.)*


In [36]:
#merged_final_df.to_csv("demsmerge.csv")

### OHE Categorical Columns

In [None]:
target_variable = 'Primary Status'

categorical_columns = [col for col in merged_final_df.columns if merged_final_df[col].dtype == 'object' and col != target_variable]
numerical_columns = [col for col in merged_final_df.columns if merged_final_df[col].dtype != 'object' and col != target_variable]

#One-Hot Encoding to categorical columns
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(merged_final_df[categorical_columns])

encoded_columns = encoder.get_feature_names_out(categorical_columns)
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns)
df_prepared = pd.concat([merged_final_df[numerical_columns], df_encoded], axis=1)

#extract the target variable
y = merged_final_df[target_variable].apply(lambda x: 1 if x == 'Advanced' else 0) # Assuming 'Advanced' means success


### The GLM using Logistic Regression

I naively selected logistic reg. since the target variable is whether or not the candidate makes it past the primaries.

In [56]:
X_train, X_test, y_train, y_test = train_test_split(df_prepared, y, test_size=0.3, random_state=42)

In [57]:
#Logistic Regression model with a binomial family and logit link function
glm_model = LogisticRegression(max_iter=1000)
glm_model.fit(X_train, y_train)

In [58]:
#model eval
y_pred = glm_model.predict(X_test)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [59]:
conf_matrix

array([[76,  4],
       [23, 32]])

In [60]:
print(report)

              precision    recall  f1-score   support

           0       0.77      0.95      0.85        80
           1       0.89      0.58      0.70        55

    accuracy                           0.80       135
   macro avg       0.83      0.77      0.78       135
weighted avg       0.82      0.80      0.79       135



#### 80% accuracy ?????