In [507]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import statsmodels.api as sm
from fuzzywuzzy import fuzz

pip install fuzzywuzzy python-Levenshtein

In [508]:
dem_candidates = pd.read_csv('../datasets/dem_candidates.csv')

# cleaning the datasets to prepare for merging

In [531]:
def binarize_cols(df, nan_threshold=20):
    few_NaNs_cols = []
    for col in df.columns:
        if df[col].nunique() == 2:
            unique_vals = df[col].dropna().unique()  # Excludes NaN from mapping
            print(f"col:{col}\n0 means:{unique_vals[0]}\n1 means: {unique_vals[1]}\n")
            df[col] = df[col].replace({unique_vals[0]: 0, unique_vals[1]: 1})
            
        if set(df[col].dropna().unique()) == {0, 1} and (df[col].isna().sum() < nan_threshold) and (df[col].isna().sum() != 0):
            df[col].fillna(0, inplace=True)
            few_NaNs_cols.append(col)
    print(f"NaNs were turned to zeros for: {few_NaNs_cols}")         
    return df


def drop_columns(df, col_lst):
    for i in col_lst:
        if i in df.columns:
            df.drop(i, axis=1, inplace=True)
    return df

def lowercase_column(df, column_name):
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(str).str.lower()
    return df

def count_candidates_by_district(df, district_column, new_column_name):
    district_counts = df.groupby(district_column).size().reset_index(name=new_column_name)
    df_merged = df.merge(district_counts, on=district_column)
    df_merged = df_merged[df_merged[new_column_name]>1]
    return df_merged

def move_column_to_front(df, col_lst):
    for column_name in col_lst:
        if column_name in df.columns:
            df = df[[column_name] + [col for col in df.columns if col != column_name]]
    return df

def get_info(df):
    print(f"COL VALUE TYPES \n{df.dtypes} \n\ndf shape:{df.shape}\n\nall the columns:\n{df.columns}")
    
    
def rename_column_if_exists(df, old_name, new_name):
    if old_name in df.columns:
        df.rename(columns={old_name: new_name}, inplace=True)
    return df
    
def replace_values_fill_na(df, column_name, replace_dict):
    if column_name in df.columns:
        df[column_name] = df[column_name].replace(replace_dict).fillna(0)
    return df
    
    
def get_ohe_cols(df, unique_limit=16, columns_to_drop=['total_runners_house','total_runners_brookings','total_runners']):
    #object_cols = df.select_dtypes(include=['object']).columns
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')
    res = [col for col in df.columns if (df[col].nunique(dropna=False) <= unique_limit) and df[col].nunique(dropna=False)>2]
    return res


def convert_type(df, col_lst):
    for column_name in col_lst:
        if column_name in df.columns and df[column_name].dtype != 'int64':
            df[column_name] = df[column_name].astype(int)

            
def get_X_df(df, y_col):
    return df.drop(y_col, axis=1)

In [510]:
house = dem_candidates[dem_candidates['Office Type'] == 'Representative'] #filtering out non-house races
house = house[house['Race Type'] != 'Special']
house = rename_column_if_exists(house, 'Candidate', 'House Candidate')

house['dist_num'] = house['District'].str.extract('(\d+)$')
house['District Abbrev'] = house['State']+ '-' +house['dist_num']


In [511]:
qualities = ['Veteran?', 'LGBTQ?', 'STEM?', 'Race','Obama Alum?', 'Self-Funder?', 'Elected Official?'] #nulls in these columns mean no website for the candidate was found. 
# house[qualities] = house[qualities].replace({'No': 0, 'Yes': 1}).fillna(0)
# house[qualities] = house[qualities].replace({'White': 0, 'Nonwhite': 1}).fillna(0)
house = replace_values_fill_na(house,'General Status', {'None': 0, 'On the Ballot': 1})
house = house.reset_index(drop=True)

In [512]:
#of the resulting 22 rows, finding that 8 are NaNs due to the election not happenign yet. 
runoff_winners = ['Kendra Horn','Jason Nichols', 'Tim Gilpin', 'Mary Brannon']
house.loc[house['House Candidate'].isin(runoff_winners),'General Status'] = 'On the Ballot'
house.loc[house['House Candidate'].isin(runoff_winners), 'Primary Runoff Status'] = 'Advanced'
runoff_losers = ['Tom Guild', 'Clay Padgett','Amanda Douglas','Fred Gipson']
house.loc[house['House Candidate'].isin(runoff_losers),'General Status'] = 'None'

In [513]:
brookings = pd.read_csv('brookings.csv')

brookings = brookings[(brookings['Candidate.Party'].str.contains('Democrat')) & (brookings['Incumbent'].isnull())& (brookings['Primary.Outcome'].isin(['Winner','Loser']))]
brookings['Brookings Candidate'] = brookings['Candidate.First.Name'] + ' ' + brookings['Candidate.Last.Name']
brookings['District'] = brookings['Candidate.State'] + '-' + brookings['Candidate.District'].astype(str)
#brookings['Primary.Outcome'] = brookings['Primary.Outcome'].replace({'Loser': 0, 'Winner': 1})

replace_values_fill_na(brookings, 'Primary.Outcome', {'Loser': 0, 'Winner': 1})
brookings = drop_columns(brookings, ['Candidate.Party', 'Incumbent', 'Freshman.Member','Candidate.First.Name','Candidate.Last.Name','Republican',
                                     'Candidate.Website.URL', 'Primary.Runoff.Outcome', 'Party.Category.1','Candidate.State','Candidate.District',
                                     'Incumbency','Candidate.Gender','Democrat','Unnamed: 0'
                                    ])
brookings = move_column_to_front(brookings, ['Female','Brookings Candidate','District','Primary.Outcome'])
brookings = convert_to_int(brookings)
brookings = brookings.reset_index(drop=True)


In [514]:
house = drop_columns(house, ['dist_num', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'Primary Runoff Status','Primary %', 'Won Primary'])

house = move_column_to_front(house, ['House Candidate','District Abbrev','General Status'])

In [515]:
house = count_candidates_by_district(house, 'District Abbrev', 'total_runners_house')
brookings = count_candidates_by_district(brookings, 'District', 'total_runners_brookings')

In [516]:
brookings = lowercase_column(brookings, 'Brookings Candidate')
house = lowercase_column(house, 'House Candidate')

In [517]:
house = house.reset_index(drop=True)
brookings = brookings.reset_index(drop=True)

# we can merge on 
- brookings['Primary.Outcome'] == house['General Status']
- brookings['District'] == house['District Abbrev']
- brookings['Candidate'] fuzzy match with house['Candidate'] with 60 percent match (this should be enough so long as the other two conditions are also matched)


Exporting this to two csvs to merge. Merging code in Merging Datasets

In [518]:
def fuzzy_join(brookings, house, threshold=50):
    matched_pairs = []

    for index1, row1 in brookings.iterrows():
        for index2, row2 in house.iterrows():
            if (row1['District'] == row2['District Abbrev'] and row1['Primary.Outcome'] == row2['General Status']):
                similarity_score = fuzz.ratio(row1['Brookings Candidate'], row2['House Candidate'])
                if similarity_score > threshold:
                    matched_pairs.append((index1, index2))

    # Joining matched rows
    matched_df = pd.DataFrame(columns=list(brookings.columns) + list(house.columns))
    for index1, index2 in matched_pairs:
        matched_row = pd.concat([brookings.iloc[[index1]].reset_index(drop=True), 
                                 house.iloc[[index2]].reset_index(drop=True)], axis=1)
        matched_df = matched_df.append(matched_row, ignore_index=True)

    return matched_df


output = fuzzy_join(brookings, house, threshold=70)


In [532]:
new_df = binarize_cols(output.copy())

col:Primary.Outcome
0 means:0
1 means: 1

col:Female
0 means:1
1 means: 0

col:Listed.military.service.
0 means:No, the candidate does not include any information about serving in the military
1 means: Yes, the candidate serves or served in the military

col:Position.on.Same.Sex.Marriage
0 means:Candidate provides no information
1 means: Candidate supports marriage equality measures

col:General Status
0 means:0.0
1 means: 1.0

col:Race
0 means:Nonwhite
1 means: White

col:Veteran?
0 means:No
1 means: Yes

col:LGBTQ?
0 means:No
1 means: Yes

col:Elected Official?
0 means:No
1 means: Yes

col:Self-Funder?
0 means:No
1 means: Yes

col:STEM?
0 means:Yes
1 means: No

col:Obama Alum?
0 means:No
1 means: Yes

col:Party Support?
0 means:No
1 means: Yes

col:Emily Endorsed?
0 means:No
1 means: Yes

col:Guns Sense Candidate?
0 means:Yes
1 means: No

col:Biden Endorsed?
0 means:No
1 means: Yes

col:Warren Endorsed? 
0 means:No
1 means: Yes

col:Sanders Endorsed?
0 means:Yes
1 means: No

col:Our 

In [533]:
new_df = move_column_to_front(new_df, ['House Candidate','Brookings Candidate','General Outcome','Primary.Outcome'])

In [534]:
# valid = (ohe_df[nan_col]==1).sum()>50

In [536]:
ohe_cols = get_ohe_cols(new_df)
#df_of_ohe_cols = new_df[get_ohe_cols(new_df)]
ohe_cols

['Education',
 'Marital.Status',
 'Position.on.Affordable.Care.Act..ObamaCare.',
 'Position.on.Minimum.Wage',
 'Position.on.Federal.Taxes',
 'Position.on.Business.Regulations',
 'Position.on.National.Debt.Deficit',
 'Position.on.Social.Security',
 'Position.on.Gun.Control',
 'Position.on.Immigration',
 'Position.on.Abortion',
 'Position.on.Criminal.Justice.Reform',
 'Position.on.Federal.K.12.Education.Policy',
 'Position.on.Climate.Change',
 'Position.on.Campaign.Finance.Reform',
 'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
 'Position.on.Defense.Spending',
 'Position.on.Handling.Terrorism.Abroad',
 'Position.on.Russia',
 'Party.Category',
 'Trump.Mention',
 'Obama.Mention',
 'Sanders.Mention',
 'Clinton.Mention',
 'Special.Counsel.Mention',
 'Travel.Ban.Mention',
 'Race',
 'Party Support?',
 'Emily Endorsed?',
 'Guns Sense Candidate?',
 'Biden Endorsed?',
 'Warren Endorsed? ',
 'Sanders Endorsed?',
 'Our Revolution Endorsed?',
 'Justice Dems Endorsed?',
 'PCCC End

In [537]:
def ohe_and_avoid_multicollinearity(df, columns_to_ohe):
    nan_columns_dropped = []
    first_cols_dropped = []
    for col in columns_to_ohe:
        if col in df.columns:
            # Perform one-hot encoding
            ohe_df = pd.get_dummies(df[col], prefix=col, dummy_na=True)

            # Drop the original column
            df = df.drop(col, axis=1)

            # Drop the column for NaN values if it exists, otherwise drop the first column
           
            nan_col = col + '_nan'
            valid = (ohe_df[nan_col]==1).sum()>50
            if (nan_col in ohe_df.columns and valid):
                ohe_df = ohe_df.drop(nan_col, axis=1)
                nan_columns_dropped.append([col, nan_col])
            else:
                first_cols_dropped.append([col, ohe_df.iloc[:, 0]])
                ohe_df = ohe_df.iloc[:, 1:]

            # Merge the OHE columns back into the original dataframe
            df = pd.concat([df, ohe_df], axis=1)
            #print("\n".join(my_list))
    for i in nan_columns_dropped:
        print(f"col:  {i[0]}\ndrop: {i[1]}\n")
    for i in first_cols_dropped:
        print(f"col:  {i[0]}\ndrop: {i[1].name}\n")
    print(f"number of cols expected to OHE: {len(columns_to_ohe)}\nnumber of cols dropped: {len(nan_columns_dropped) + len(first_cols_dropped)}")    
    return df

In [538]:
new_df = ohe_and_avoid_multicollinearity(new_df.copy(), ohe_cols)

col:  Race
drop: Race_nan

col:  Party Support?
drop: Party Support?_nan

col:  Emily Endorsed?
drop: Emily Endorsed?_nan

col:  Guns Sense Candidate?
drop: Guns Sense Candidate?_nan

col:  Biden Endorsed?
drop: Biden Endorsed?_nan

col:  Warren Endorsed? 
drop: Warren Endorsed? _nan

col:  Sanders Endorsed?
drop: Sanders Endorsed?_nan

col:  Our Revolution Endorsed?
drop: Our Revolution Endorsed?_nan

col:  Justice Dems Endorsed?
drop: Justice Dems Endorsed?_nan

col:  PCCC Endorsed?
drop: PCCC Endorsed?_nan

col:  Indivisible Endorsed?
drop: Indivisible Endorsed?_nan

col:  WFP Endorsed?
drop: WFP Endorsed?_nan

col:  VoteVets Endorsed?
drop: VoteVets Endorsed?_nan

col:  No Labels Support?
drop: No Labels Support?_nan

col:  Education
drop: Education_Associate's Degree

col:  Marital.Status
drop: Marital.Status_Divorced

col:  Position.on.Affordable.Care.Act..ObamaCare.
drop: Position.on.Affordable.Care.Act..ObamaCare._Candidate explicitly supports ACA

col:  Position.on.Minimum.Wag

In [544]:
def drop_columns(df, col_lst):
    for i in col_lst:
        if i in df.columns:
            df.drop(i, axis=1, inplace=True)
    return df

new_df = drop_columns(new_df.copy(),['Primary.Outcome', 'District Abbrev','House Candidate','Brookings Candidate','total_runners_house','District','Previous.Electoral.Experience','SinglePayer'])
new_df = move_column_to_front(new_df, ['General Status'])


In [546]:
new_df.to_csv(Path('ready_for_modelling.csv'))

In [529]:
correlation_matrix = new_df.corr()
cols = correlation_matrix.columns
correlation_matrix

Unnamed: 0,General Status,Female,Listed.military.service.,Position.on.Same.Sex.Marriage,Partisan Lean,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,STEM?,...,PCCC Endorsed?_0.0,PCCC Endorsed?_1.0,Indivisible Endorsed?_0.0,Indivisible Endorsed?_1.0,WFP Endorsed?_0.0,WFP Endorsed?_1.0,VoteVets Endorsed?_0.0,VoteVets Endorsed?_1.0,No Labels Support?_0.0,No Labels Support?_1.0
General Status,1.000000,-0.262351,-0.011614,0.062630,-0.122069,-0.006360,-0.029316,0.090913,0.014290,0.104971,...,-0.096437,0.168076,-0.121164,0.191482,-0.084553,0.096968,-0.107379,0.119615,0.007024,-0.034099
Female,-0.262351,1.000000,0.102133,0.014886,-0.084285,0.124771,-0.056995,-0.081073,0.043787,-0.043662,...,-0.020387,-0.073702,-0.000242,-0.099271,0.024853,-0.058037,0.003065,0.001777,-0.007738,0.042046
Listed.military.service.,-0.011614,0.102133,1.000000,-0.045511,-0.037005,0.872525,0.015289,-0.075680,-0.026332,0.076103,...,0.067791,-0.059216,0.002888,-0.006401,0.020305,0.009707,-0.087146,0.466379,0.005886,0.052869
Position.on.Same.Sex.Marriage,0.062630,0.014886,-0.045511,1.000000,0.185649,-0.069655,0.179514,0.009618,0.138928,-0.056726,...,0.053642,0.070643,-0.026396,0.061764,0.134143,0.064314,0.035378,-0.025598,0.081817,0.007607
Partisan Lean,-0.122069,-0.084285,-0.037005,0.185649,1.000000,-0.060830,-0.005460,0.283610,0.139214,0.076425,...,0.280386,0.116806,-0.001172,-0.058844,0.101199,0.037799,0.102531,0.040721,0.172503,0.059043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WFP Endorsed?_1.0,0.096968,-0.058037,0.009707,0.064314,0.037799,0.014794,-0.036619,0.115449,-0.035876,0.085458,...,-0.038780,0.301120,-0.004081,0.005266,-0.050199,1.000000,0.038895,0.123465,0.130861,-0.009942
VoteVets Endorsed?_0.0,-0.107379,0.003065,-0.087146,0.035378,0.102531,-0.094174,-0.025429,-0.033017,0.164336,0.073877,...,-0.032680,0.035801,0.035341,0.032019,0.132023,0.038895,1.000000,-0.074729,0.149060,-0.021156
VoteVets Endorsed?_1.0,0.119615,0.001777,0.466379,-0.025598,0.040721,0.484662,-0.003330,0.042896,-0.001623,0.083902,...,0.071343,-0.027617,0.046009,0.025457,0.033267,0.123465,-0.074729,1.000000,-0.028990,0.135422
No Labels Support?_0.0,0.007024,-0.007738,0.005886,0.081817,0.172503,-0.024893,0.030863,0.035625,-0.029615,0.039131,...,-0.032012,0.175740,-0.054605,-0.033720,0.143873,0.130861,0.149060,-0.028990,1.000000,-0.008207


The smallest possible value for VIF is 1, which indicates the complete absence of collinearity. Typically in practice there is a small amount of collinearity among the predictors. As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity.In the Credit data, a regression of balance on age, rating, and limit indicates that the predictors have VIF values of 1.01, 160.67, and 160.59. As we suspected, there is considerable collinearity in the data!When faced with the problem of collinearity, there are two simple solu- tions. The first is to drop one of the problematic variables from the regres- sion. This can usually be done without much compromise to the regression fit, since the presence of collinearity implies that the information that this variable provides about the response is redundant in the presence of the other variables. For instance, if we regress balance onto age and limit, without the rating predictor, then the resulting VIF values are close to the minimum possible value of 1, and the R2 drops from 0.754 to 0.75. So dropping rating from the set of predictors has effectively solved the collinearity problem without compromising the fit.

In [530]:
#new_df

In [384]:
correlation_df = pd.DataFrame(correlation_matrix.iloc[0]).abs().sort_values("General Status",ascending=False)
correlation_df.head(3)
#print(f"columns:\n{correlation_df.index}")

Unnamed: 0,General Status
General Status,1.0
Party Support?_1.0,0.352727
Female,0.262351


In [502]:
#get_X_df(new_df, 'General Status').columns

Index(['District', 'Female', 'Listed.military.service.',
       'Previous.Electoral.Experience', 'Position.on.Same.Sex.Marriage',
       'SinglePayer', 'total_runners_brookings', 'District Abbrev',
       'Partisan Lean', 'Veteran?',
       ...
       'PCCC Endorsed?_0.0', 'PCCC Endorsed?_1.0', 'Indivisible Endorsed?_0.0',
       'Indivisible Endorsed?_1.0', 'WFP Endorsed?_0.0', 'WFP Endorsed?_1.0',
       'VoteVets Endorsed?_0.0', 'VoteVets Endorsed?_1.0',
       'No Labels Support?_0.0', 'No Labels Support?_1.0'],
      dtype='object', length=156)

In [505]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor 

# vif_data = pd.DataFrame() 
# X = get_X_df(new_df, 'General Status')
# vif_data["feature"] = X.columns

# # calculating VIF for each feature 
# vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
# vif_data

## Deprecated

In [353]:
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder(drop='first')
# encoder.fit(new_df[to_ohe])
# encoded_data = encoder.transform(new_df[to_ohe]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())
# final_df = pd.concat([new_df, encoded_df], axis=1)
# final_df.drop(to_ohe, axis=1, inplace=True)

In [33]:
# #new_df.columns

# new_df[['General Status', 'Partisan Lean','Candidate.Gender',
#        'Listed.military.service.', 'Education', 'Marital.Status',
#        'Previous.Electoral.Experience',
#        'Position.on.Affordable.Care.Act..ObamaCare.',
#        'Position.on.Minimum.Wage', 'Position.on.Federal.Taxes',
#        'Position.on.Business.Regulations', 'Position.on.National.Debt.Deficit',
#        'Position.on.Social.Security', 'Position.on.Gun.Control',
#        'Position.on.Immigration', 'Position.on.Abortion',
#        'Position.on.Same.Sex.Marriage', 'Position.on.Criminal.Justice.Reform',
#        'Position.on.Federal.K.12.Education.Policy',
#        'Position.on.Climate.Change', 'Position.on.Campaign.Finance.Reform',
#        'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
#        'Position.on.Defense.Spending', 'Position.on.Handling.Terrorism.Abroad',
#        'Position.on.Russia', 'Party.Category', 'Trump.Mention',
#        'Obama.Mention', 'Sanders.Mention', 'Clinton.Mention',
#        'Special.Counsel.Mention', 'Travel.Ban.Mention', 'SinglePayer',
#         'Race', 'Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?', 'Obama Alum?',
#        'Party Support?', 'Emily Endorsed?', 'Guns Sense Candidate?',
#        'Biden Endorsed?', 'Warren Endorsed? ', 'Sanders Endorsed?',
#        'Our Revolution Endorsed?', 'Justice Dems Endorsed?', 'PCCC Endorsed?',
#        'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?',
#        'No Labels Support?']]

Unnamed: 0,General Status,Partisan Lean,Candidate.Gender,Listed.military.service.,Education,Marital.Status,Previous.Electoral.Experience,Position.on.Affordable.Care.Act..ObamaCare.,Position.on.Minimum.Wage,Position.on.Federal.Taxes,...,Biden Endorsed?,Warren Endorsed?,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?,No Labels Support?
0,0.0,-30.680000,Female,"No, the candidate does not include any informa...",Other,Married,,Candidate provides no information,Candidate provides no information,Candidate provides no information,...,,,,,,,,,,
1,1.0,-30.680000,Male,"Yes, the candidate serves or served in the mil...",Master's Degree (includes MBA),Married,,Candidate provides complicated/complex/unclear...,Candidate provides no information,Candidate provides no information,...,,,,,,,,,,
2,0.0,-33.080002,Female,"Yes, the candidate serves or served in the mil...",No Education Information Listed,Other,,Candidate provides complicated/complex/unclear...,Candidate supports raising the minimum wage,Candidate provides no information,...,,,,,,,,,,
3,1.0,-33.080002,Female,"No, the candidate does not include any informa...",Master's Degree (includes MBA),Married,,Candidate provides complicated/complex/unclear...,Candidate provides no information,Candidate supports raising taxes on the wealth...,...,,,,,,,,,,
4,0.0,-33.660000,Female,"No, the candidate does not include any informa...",Psy.D.,No information,,Candidate provides no information,Candidate provides no information,Candidate provides no information,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,1.0,-35.330002,Female,"No, the candidate does not include any informa...",Bachelor's Degree,No information,,Candidate provides no information,Candidate provides no information,Candidate provides no information,...,,,,,,,,,No,
574,0.0,-47.480000,Male,"No, the candidate does not include any informa...",No Education Information Listed,Married,Candidate mentions previous elected office exp...,Candidate provides no information,Candidate provides no information,Candidate provides no information,...,,,,,,,,No,No,
575,0.0,-47.480000,Female,"No, the candidate does not include any informa...",Bachelor's Degree,No information,,Candidate provides no information,Candidate provides no information,Candidate provides no information,...,,,,,,,,No,No,
576,0.0,-47.480000,Male,"No, the candidate does not include any informa...",Master's Degree (includes MBA),Married,,Candidate provides no information,Candidate provides no information,Candidate provides no information,...,,,,,,,,No,No,


'total_runners'

In [146]:
# encoder = OneHotEncoder(drop='first')
# encoder.fit(new_df[to_ohe])
# encoded_data = encoder.transform(new_df[to_ohe]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())


# final_df = pd.concat([new_df, encoded_df], axis=1)
# final_df.drop(to_ohe, axis=1, inplace=True)

Index(['General Status_1.0', 'Partisan Lean_-62.060001',
       'Partisan Lean_-61.459999', 'Partisan Lean_-60.810001',
       'Partisan Lean_-55.029999', 'Partisan Lean_-54.009998',
       'Partisan Lean_-49.509998', 'Partisan Lean_-49.110001',
       'Partisan Lean_-49.009998', 'Partisan Lean_-48.73',
       ...
       'PCCC Endorsed?_Yes', 'PCCC Endorsed?_nan', 'Indivisible Endorsed?_Yes',
       'Indivisible Endorsed?_nan', 'WFP Endorsed?_Yes', 'WFP Endorsed?_nan',
       'VoteVets Endorsed?_Yes', 'VoteVets Endorsed?_nan',
       'No Labels Support?_Yes', 'No Labels Support?_nan'],
      dtype='object', length=315)

In [148]:
#correlation_df = pd.DataFrame(correlation_matrix.iloc[0]).abs().sort_values("General Status",ascending=False)
#correlation_df.head(30)

KeyError: 'General Status'

## Step 2: Drop one column from each OHE variable

## Step 3: Use forward selection to model select