In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import statsmodels.api as sm
from fuzzywuzzy import fuzz

pip install fuzzywuzzy python-Levenshtein

In [2]:
dem_candidates = pd.read_csv('../datasets/dem_candidates.csv')

In [216]:
get_info(new_df)

COL VALUE TYPES 
Primary.Outcome                                                     int64
District                                                           object
Brookings Candidate                                                object
Female                                                            float64
Listed.military.service.                                            int64
Education                                                          object
Marital.Status                                                     object
Previous.Electoral.Experience                                      object
Position.on.Affordable.Care.Act..ObamaCare.                        object
Position.on.Minimum.Wage                                           object
Position.on.Federal.Taxes                                          object
Position.on.Business.Regulations                                   object
Position.on.National.Debt.Deficit                                  object
Position.on.Social.Se

In [39]:
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)

list

In [214]:
new_df['District']


0      AL-1
1      AL-1
2      AL-2
3      AL-2
4      AL-3
       ... 
573    WV-2
574    WV-3
575    WV-3
576    WV-3
577    WV-3
Name: District, Length: 578, dtype: object

# cleaning the datasets to prepare for merging

In [229]:
def binarize_cols(df, nan_threshold=20):
    few_NaNs_cols = []
    for col in df.columns:
        if (df[col].nunique(dropna=False) == 2) or (df[col].nunique(dropna=True) == 2):
            unique_vals = df[col].unique()  # Excludes NaN from mapping
            print(f"col:{col}\n0 means:{unique_vals[0]}\n1 means: {unique_vals[1]}\n")
            df[col] = df[col].replace({unique_vals[0]: 0, unique_vals[1]: 1})
            
        if set(df[col]) == {0, 1} and (df[col].isna().sum() < nan_threshold) and (df[col].isna().sum() != 0):
            df[col].fillna(0, inplace=True)
            few_NaNs_cols.append(col)
    print(f"NaNs were turned to zeros for: {few_NaNs_cols}")         
    return df


def drop_columns(df, col_lst):
    for i in col_lst:
        if i in df.columns:
            df.drop(i, axis=1,inplace=True)
    return df

def lowercase_column(df, column_name):
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(str).str.lower()
    return df

def count_candidates_by_district(df, district_column, new_column_name):
    district_counts = df.groupby(district_column).size().reset_index(name=new_column_name)
    df_merged = df.merge(district_counts, on=district_column)
    df_merged = df_merged[df_merged[new_column_name]>1]
    return df_merged

def move_column_to_front(df, col_lst):
    for column_name in col_lst:
        if column_name in df.columns:
            df = df[[column_name] + [col for col in df.columns if col != column_name]]
    return df

def get_info(df):
    print(f"COL VALUE TYPES \n{df.dtypes} \n\ndf shape:{df.shape}\n\nall the columns:\n{df.columns}")
    
    
def rename_column_if_exists(df, old_name, new_name):
    if old_name in df.columns:
        df.rename(columns={old_name: new_name}, inplace=True)
    return df
    
def replace_values_fill_na(df, column_name, replace_dict):
    if column_name in df.columns:
        df[column_name] = df[column_name].replace(replace_dict).fillna(0)
    return df
    
    
def get_ohe_cols(df, unique_limit=16):
    #object_cols = df.select_dtypes(include=['object']).columns
    #df = drop_columns(df, [col for col in columns_to_drop if col in df.columns])
    res = [col for col in df.columns if (df[col].nunique(dropna=False) <= unique_limit) and df[col].nunique(dropna=False)>2]
    return res


def convert_type(df, col_lst):
    for column_name in col_lst:
        if column_name in df.columns and df[column_name].dtype != 'int64':
            df[column_name] = df[column_name].astype(int)

            
def get_X_df(df, y_col):
    return df.drop(y_col, axis=1)

In [86]:
house = dem_candidates[dem_candidates['Office Type'] == 'Representative'] #filtering out non-house races
house = house[house['Race Type'] != 'Special']
house = rename_column_if_exists(house, 'Candidate', 'House Candidate')
house['dist_num'] = house['District'].str.extract('(\d+)$')
house['District Abbrev'] = house['State']+ '-' +house['dist_num']


In [87]:
qualities = ['Veteran?', 'LGBTQ?', 'STEM?', 'Race','Obama Alum?', 'Self-Funder?', 'Elected Official?'] #nulls in these columns mean no website for the candidate was found. 
# house[qualities] = house[qualities].replace({'No': 0, 'Yes': 1}).fillna(0)
# house[qualities] = house[qualities].replace({'White': 0, 'Nonwhite': 1}).fillna(0)
house = replace_values_fill_na(house,'General Status', {'None': 0, 'On the Ballot': 1})
house = house.reset_index(drop=True)

In [88]:
#of the resulting 22 rows, finding that 8 are NaNs due to the election not happenign yet. 
runoff_winners = ['Kendra Horn','Jason Nichols', 'Tim Gilpin', 'Mary Brannon']
house.loc[house['House Candidate'].isin(runoff_winners),'General Status'] = 'On the Ballot'
house.loc[house['House Candidate'].isin(runoff_winners), 'Primary Runoff Status'] = 'Advanced'
runoff_losers = ['Tom Guild', 'Clay Padgett','Amanda Douglas','Fred Gipson']
house.loc[house['House Candidate'].isin(runoff_losers),'General Status'] = 'None'

In [89]:
brookings = pd.read_csv('brookings.csv')

brookings = brookings[(brookings['Candidate.Party'].str.contains('Democrat')) & (brookings['Incumbent'].isnull())& (brookings['Primary.Outcome'].isin(['Winner','Loser']))]
brookings['Brookings Candidate'] = brookings['Candidate.First.Name'] + ' ' + brookings['Candidate.Last.Name']
brookings['District'] = brookings['Candidate.State'] + '-' + brookings['Candidate.District'].astype(str)
#brookings['Primary.Outcome'] = brookings['Primary.Outcome'].replace({'Loser': 0, 'Winner': 1})

replace_values_fill_na(brookings, 'Primary.Outcome', {'Loser': 0, 'Winner': 1})
brookings = drop_columns(brookings, ['Candidate.Party', 'Incumbent', 'Freshman.Member','Candidate.First.Name','Candidate.Last.Name','Republican',
                                     'Candidate.Website.URL', 'Primary.Runoff.Outcome', 'Party.Category.1','Candidate.State','Candidate.District',
                                     'Incumbency','Candidate.Gender','Democrat','Unnamed: 0'
                                    ])
brookings = move_column_to_front(brookings, ['Female','Brookings Candidate','District','Primary.Outcome'])
brookings = brookings.reset_index(drop=True)


In [90]:
house = drop_columns(house, ['dist_num', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'Primary Runoff Status','Primary %', 'Won Primary'])

house = move_column_to_front(house, ['House Candidate','District Abbrev','General Status'])

In [91]:
house = count_candidates_by_district(house, 'District Abbrev', 'total_runners_house')
brookings = count_candidates_by_district(brookings, 'District', 'total_runners_brookings')

In [92]:
brookings = lowercase_column(brookings, 'Brookings Candidate')
house = lowercase_column(house, 'House Candidate')

In [93]:
house = house.reset_index(drop=True)
brookings = brookings.reset_index(drop=True)

# we can merge on 
- brookings['Primary.Outcome'] == house['General Status']
- brookings['District'] == house['District Abbrev']
- brookings['Candidate'] fuzzy match with house['Candidate'] with 60 percent match (this should be enough so long as the other two conditions are also matched)


Exporting this to two csvs to merge. Merging code in Merging Datasets

In [94]:
def fuzzy_join(brookings, house, threshold=50):
    matched_pairs = []

    for index1, row1 in brookings.iterrows():
        for index2, row2 in house.iterrows():
            if (row1['District'] == row2['District Abbrev'] and row1['Primary.Outcome'] == row2['General Status']):
                similarity_score = fuzz.ratio(row1['Brookings Candidate'], row2['House Candidate'])
                if similarity_score > threshold:
                    matched_pairs.append((index1, index2))

    # Joining matched rows
    matched_df = pd.DataFrame(columns=list(brookings.columns) + list(house.columns))
    for index1, index2 in matched_pairs:
        matched_row = pd.concat([brookings.iloc[[index1]].reset_index(drop=True), 
                                 house.iloc[[index2]].reset_index(drop=True)], axis=1)
        matched_df = matched_df.append(matched_row, ignore_index=True)

    return matched_df


output = fuzzy_join(brookings, house, threshold=70)


In [113]:
output.to_csv(Path('2:57pm.csv'))

In [230]:
new_df = binarize_cols(output.copy())

col:Primary.Outcome
0 means:0
1 means: 1

col:Female
0 means:1.0
1 means: 0.0

col:Listed.military.service.
0 means:No, the candidate does not include any information about serving in the military
1 means: Yes, the candidate serves or served in the military

col:Previous.Electoral.Experience
0 means:nan
1 means: Candidate mentions previous elected office experience

col:Position.on.Same.Sex.Marriage
0 means:Candidate provides no information
1 means: Candidate supports marriage equality measures

col:SinglePayer
0 means:nan
1 means: Yes, candidate supports universal healthcare reforms

col:General Status
0 means:0.0
1 means: 1.0

col:Race
0 means:Nonwhite
1 means: White

col:Veteran?
0 means:No
1 means: Yes

col:LGBTQ?
0 means:No
1 means: Yes

col:Elected Official?
0 means:No
1 means: Yes

col:Self-Funder?
0 means:No
1 means: Yes

col:STEM?
0 means:Yes
1 means: No

col:Obama Alum?
0 means:No
1 means: Yes

col:Party Support?
0 means:nan
1 means: No

col:Emily Endorsed?
0 means:nan
1 mean

In [231]:
new_df['SinglePayer'].value_counts(dropna=False)

1    308
0    270
Name: SinglePayer, dtype: int64

In [232]:
new_df = binarize_cols(output.copy())
#new_df = new_df.drop(['House Candidate','Brookings Candidate','General Status','Primary.Outcome', 'total_runners_house'])

col:Primary.Outcome
0 means:0
1 means: 1

col:Female
0 means:1.0
1 means: 0.0

col:Listed.military.service.
0 means:No, the candidate does not include any information about serving in the military
1 means: Yes, the candidate serves or served in the military

col:Previous.Electoral.Experience
0 means:nan
1 means: Candidate mentions previous elected office experience

col:Position.on.Same.Sex.Marriage
0 means:Candidate provides no information
1 means: Candidate supports marriage equality measures

col:SinglePayer
0 means:nan
1 means: Yes, candidate supports universal healthcare reforms

col:General Status
0 means:0.0
1 means: 1.0

col:Race
0 means:Nonwhite
1 means: White

col:Veteran?
0 means:No
1 means: Yes

col:LGBTQ?
0 means:No
1 means: Yes

col:Elected Official?
0 means:No
1 means: Yes

col:Self-Funder?
0 means:No
1 means: Yes

col:STEM?
0 means:Yes
1 means: No

col:Obama Alum?
0 means:No
1 means: Yes

col:Party Support?
0 means:nan
1 means: No

col:Emily Endorsed?
0 means:nan
1 mean

In [233]:
ohe_cols = get_ohe_cols(new_df)
#df_of_ohe_cols = new_df[get_ohe_cols(new_df)]
ohe_cols = ohe_cols[:-1]
ohe_cols = ohe_cols + ['Previous.Electoral.Experience']
ohe_cols

['Education',
 'Marital.Status',
 'Position.on.Affordable.Care.Act..ObamaCare.',
 'Position.on.Minimum.Wage',
 'Position.on.Federal.Taxes',
 'Position.on.Business.Regulations',
 'Position.on.National.Debt.Deficit',
 'Position.on.Social.Security',
 'Position.on.Gun.Control',
 'Position.on.Immigration',
 'Position.on.Abortion',
 'Position.on.Criminal.Justice.Reform',
 'Position.on.Federal.K.12.Education.Policy',
 'Position.on.Climate.Change',
 'Position.on.Campaign.Finance.Reform',
 'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
 'Position.on.Defense.Spending',
 'Position.on.Handling.Terrorism.Abroad',
 'Position.on.Russia',
 'Party.Category',
 'Trump.Mention',
 'Obama.Mention',
 'Sanders.Mention',
 'Clinton.Mention',
 'Special.Counsel.Mention',
 'Travel.Ban.Mention',
 'total_runners_brookings',
 'Race',
 'Veteran?',
 'LGBTQ?',
 'Elected Official?',
 'STEM?',
 'Obama Alum?',
 'Party Support?',
 'Emily Endorsed?',
 'Guns Sense Candidate?',
 'Biden Endorsed?',
 'Warren E

In [234]:
ohe_this = new_df.copy()
ohe_this

Unnamed: 0,Primary.Outcome,District,Brookings Candidate,Female,Listed.military.service.,Education,Marital.Status,Previous.Electoral.Experience,Position.on.Affordable.Care.Act..ObamaCare.,Position.on.Minimum.Wage,...,Warren Endorsed?,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?,No Labels Support?,total_runners_house
0,0,AL-1,lizetta mcconnell,0.0,0,Other,Married,0,Candidate provides no information,Candidate provides no information,...,0,0,0,0,0,0,0,0,0,2
1,1,AL-1,robert kennedy,1.0,1,Master's Degree (includes MBA),Married,0,Candidate provides complicated/complex/unclear...,Candidate provides no information,...,0,0,0,0,0,0,0,0,0,2
2,0,AL-2,audri williams,0.0,1,No Education Information Listed,Other,0,Candidate provides complicated/complex/unclear...,Candidate supports raising the minimum wage,...,0,0,0,0,0,0,0,0,0,2
3,1,AL-2,tabitha isner,0.0,0,Master's Degree (includes MBA),Married,0,Candidate provides complicated/complex/unclear...,Candidate provides no information,...,0,0,0,0,0,0,0,0,0,2
4,0,AL-3,adia winfrey,0.0,0,Psy.D.,No information,0,Candidate provides no information,Candidate provides no information,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,1,WV-2,talley sergent,0.0,0,Bachelor's Degree,No information,0,Candidate provides no information,Candidate provides no information,...,0,0,0,0,0,0,0,1,0,2
574,0,WV-3,shirley love,1.0,0,No Education Information Listed,Married,1,Candidate provides no information,Candidate provides no information,...,0,0,0,0,0,0,1,1,0,4
575,0,WV-3,janice hagerman,0.0,0,Bachelor's Degree,No information,0,Candidate provides no information,Candidate provides no information,...,0,0,0,0,0,0,1,1,0,4
576,0,WV-3,paul davis,1.0,0,Master's Degree (includes MBA),Married,0,Candidate provides no information,Candidate provides no information,...,0,0,0,0,0,0,1,1,0,4


In [193]:
ohe_this.columns

Index(['Primary.Outcome', 'District', 'Brookings Candidate', 'Female',
       'Listed.military.service.', 'Education', 'Marital.Status',
       'Previous.Electoral.Experience',
       'Position.on.Affordable.Care.Act..ObamaCare.',
       'Position.on.Minimum.Wage', 'Position.on.Federal.Taxes',
       'Position.on.Business.Regulations', 'Position.on.National.Debt.Deficit',
       'Position.on.Social.Security', 'Position.on.Gun.Control',
       'Position.on.Immigration', 'Position.on.Abortion',
       'Position.on.Same.Sex.Marriage', 'Position.on.Criminal.Justice.Reform',
       'Position.on.Federal.K.12.Education.Policy',
       'Position.on.Climate.Change', 'Position.on.Campaign.Finance.Reform',
       'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
       'Position.on.Defense.Spending', 'Position.on.Handling.Terrorism.Abroad',
       'Position.on.Russia', 'Party.Category', 'Trump.Mention',
       'Obama.Mention', 'Sanders.Mention', 'Clinton.Mention',
       'Special.

In [None]:
[
    'Primary.Outcome', 'District', 'Brookings Candidate', 'Female', 'Listed.military.service.', 'Education', 'Marital.Status', 'Previous.Electoral.Experience',
    'Position.on.Affordable.Care.Act..ObamaCare.', 'Position.on.Minimum.Wage', 'Position.on.Federal.Taxes', 'Position.on.Business.Regulations', 'Position.on.National.Debt.Deficit',
    'Position.on.Social.Security', 'Position.on.Gun.Control', 'Position.on.Immigration', 'Position.on.Abortion', 'Position.on.Same.Sex.Marriage', 'Position.on.Criminal.Justice.Reform',
    'Position.on.Federal.K.12.Education.Policy', 'Position.on.Climate.Change', 'Position.on.Campaign.Finance.Reform', 'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
    'Position.on.Defense.Spending', 'Position.on.Handling.Terrorism.Abroad', 'Position.on.Russia', 'Party.Category', 'Trump.Mention', 'Obama.Mention', 'Sanders.Mention', 'Clinton.Mention',
    'Special.Counsel.Mention', 'Travel.Ban.Mention', 'SinglePayer', 'total_runners_brookings', 'General Status', 'District Abbrev', 'House Candidate', 'Partisan Lean', 'Race', 'Veteran?', 'LGBTQ?',
    'Elected Official?', 'Self-Funder?', 'STEM?', 'Obama Alum?', 'Party Support?', 'Emily Endorsed?', 'Guns Sense Candidate?', 'Biden Endorsed?', 'Warren Endorsed? ', 'Sanders Endorsed?',
    'Our Revolution Endorsed?', 'Justice Dems Endorsed?', 'PCCC Endorsed?', 'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?', 'No Labels Support?', 'total_runners_house'
]


In [238]:
ohe_this = new_df.copy()
result = pd.get_dummies(ohe_this, columns=ohe_cols, dummy_na=True)
#result.drop(axis=1)
result = drop_columns(result, ohe_cols)
result

Unnamed: 0,Primary.Outcome,District,Brookings Candidate,Female,Listed.military.service.,Position.on.Same.Sex.Marriage,SinglePayer,General Status,District Abbrev,House Candidate,...,VoteVets Endorsed?_1,VoteVets Endorsed?_Yes,VoteVets Endorsed?_nan,No Labels Support?_0,No Labels Support?_1,No Labels Support?_Yes,No Labels Support?_nan,Previous.Electoral.Experience_0.0,Previous.Electoral.Experience_1.0,Previous.Electoral.Experience_nan
0,0,AL-1,lizetta mcconnell,0.0,0,0,0,0,AL-1,lizzetta hill mcconnell,...,0,0,0,1,0,0,0,1,0,0
1,1,AL-1,robert kennedy,1.0,1,0,0,1,AL-1,robert kennedy jr.,...,0,0,0,1,0,0,0,1,0,0
2,0,AL-2,audri williams,0.0,1,0,1,0,AL-2,audri scott williams,...,0,0,0,1,0,0,0,1,0,0
3,1,AL-2,tabitha isner,0.0,0,0,1,1,AL-2,tabitha isner,...,0,0,0,1,0,0,0,1,0,0
4,0,AL-3,adia winfrey,0.0,0,0,0,0,AL-3,adia mcclellan winfrey,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,1,WV-2,talley sergent,0.0,0,0,0,1,WV-2,talley sergent,...,1,0,0,1,0,0,0,1,0,0
574,0,WV-3,shirley love,1.0,0,0,0,0,WV-3,shirley love,...,1,0,0,1,0,0,0,0,1,0
575,0,WV-3,janice hagerman,0.0,0,0,0,0,WV-3,janice hagerman,...,1,0,0,1,0,0,0,1,0,0
576,0,WV-3,paul davis,1.0,0,0,0,0,WV-3,paul davis,...,1,0,0,1,0,0,0,1,0,0


In [239]:
def VIF_dropper(df):
    for col in df.columns:
        top_freq = df[col].value_counts(normalize=True).iloc[0]
        if top_freq >= 0.80:
            df.drop(col, axis=1, inplace=True)
    return df

In [247]:
output = VIF_dropper(result)
output = drop_columns(output, ['Primary.Outcome','District','Brookings Candidate','District Abbrev', 'House Candidate'])
output = move_column_to_front(output,['General Status'])

Index(['General Status', 'Female', 'Position.on.Same.Sex.Marriage',
       'SinglePayer', 'Partisan Lean', 'total_runners_house',
       'Education_Bachelor's Degree',
       'Education_Master's Degree (includes MBA)',
       'Education_No Education Information Listed', 'Marital.Status_Married',
       'Marital.Status_No information',
       'Position.on.Affordable.Care.Act..ObamaCare._Candidate provides complicated/complex/unclear position',
       'Position.on.Minimum.Wage_Candidate provides no information',
       'Position.on.Minimum.Wage_Candidate supports raising the minimum wage',
       'Position.on.Federal.Taxes_Candidate provides no information',
       'Position.on.Federal.Taxes_Candidate supports raising taxes on the wealthy/corporations',
       'Position.on.Business.Regulations_Candidate provides no information',
       'Position.on.Social.Security_Candidate provides no information',
       'Position.on.Social.Security_Candidate supports protecting the status quo Social S

In [263]:
output.to_csv(Path('4_50.csv'))


In [260]:
import statsmodels.api as sm

def aic(X, y):
    best_features = []
    best_aic = float('inf')

    for feature in X.columns:
        # Create a temporary DataFrame with the current set of best features plus the new feature
        X_temp = sm.add_constant(X[best_features + [feature]])

       
        model = sm.Logit(y, X_temp)  # Fit the logistic regression model
        result = model.fit()  # disp=0 suppresses the fit output

        # Check AIC and update if it is lower
        if result.aic < best_aic:
            best_aic = result.aic
            best_features.append(feature)

    # Fit the final model with the best features
    X_final = sm.add_constant(X[best_features])
    final_model = sm.Logit(y, X_final)
    final_result = final_model.fit(disp=0)

    # You might want to return the final model, its summary, or AIC
    print(f"{best_features}")
    return final_result.summary(), final_result.aic


In [261]:
X = output.drop('General Status',axis=1).dropna()
X = X.astype(float)
y = output['General Status']

aic(X, y)

Optimization terminated successfully.
         Current function value: 0.530165
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.527794
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.519054
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.504823
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.472044
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.469075
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.466413
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.462699
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.459621
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.459607
  



Optimization terminated successfully.
         Current function value: 0.442854
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.442513
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.441008
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.437038
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.436908
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.436794
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.436793
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.435657
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.435697
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.437015
  

(<class 'statsmodels.iolib.summary.Summary'>
 """
                            Logit Regression Results                           
 Dep. Variable:         General Status   No. Observations:                  578
 Model:                          Logit   Df Residuals:                      556
 Method:                           MLE   Df Model:                           21
 Date:                Sat, 09 Dec 2023   Pseudo R-squ.:                  0.2810
 Time:                        16:46:41   Log-Likelihood:                -234.10
 converged:                       True   LL-Null:                       -325.58
 Covariance Type:            nonrobust   LLR p-value:                 7.893e-28
                                                                                                                                         coef    std err          z      P>|z|      [0.025      0.975]
 --------------------------------------------------------------------------------------------------------------

In [287]:
new_predictors = [
    'Female', 
    'Position.on.Same.Sex.Marriage', 
    'SinglePayer', 
    'total_runners_house', 
    "Education_Bachelor's Degree", 
    'Position.on.Affordable.Care.Act..ObamaCare._Candidate provides complicated/complex/unclear position', 
    'Position.on.Immigration_Candidate provides no information',
    'Position.on.Federal.K.12.Education.Policy_Candidate supports federal proposals for major education reform (including common core)', 
    'Position.on.Climate.Change_Candidate supports regulations and measures to combat climate change', 
    'Position.on.Campaign.Finance.Reform_Candidate provides no information', 
    'Obama.Mention_NO MENTION', 
    'STEM?_1.0', 
    'Emily Endorsed?_1', 
    'Guns Sense Candidate?_1']


In [288]:
X = output.drop('General Status',axis=1).dropna()
X = X.astype(float)
y = output['General Status']

#aic(X, y)
output

Unnamed: 0,General Status,Female,Position.on.Same.Sex.Marriage,SinglePayer,Partisan Lean,total_runners_house,Education_Bachelor's Degree,Education_Master's Degree (includes MBA),Education_No Education Information Listed,Marital.Status_Married,...,Party Support?_0,Emily Endorsed?_0,Emily Endorsed?_1,Guns Sense Candidate?_0,Guns Sense Candidate?_1,Guns Sense Candidate?_No,Our Revolution Endorsed?_0,Our Revolution Endorsed?_1,Justice Dems Endorsed?_0,Justice Dems Endorsed?_1
0,0,0.0,0,0,-30.680000,2,0,0,0,1,...,1,1,0,1,0,0,1,0,1,0
1,1,1.0,0,0,-30.680000,2,0,1,0,1,...,1,1,0,1,0,0,1,0,1,0
2,0,0.0,0,1,-33.080002,2,0,0,1,0,...,1,1,0,1,0,0,1,0,1,0
3,1,0.0,0,1,-33.080002,2,0,1,0,1,...,1,1,0,1,0,0,1,0,1,0
4,0,0.0,0,0,-33.660000,2,0,0,0,0,...,1,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,1,0.0,0,0,-35.330002,2,1,0,0,0,...,1,1,0,1,0,0,1,0,1,0
574,0,1.0,0,0,-47.480000,4,0,0,1,1,...,1,1,0,1,0,0,1,0,1,0
575,0,0.0,0,0,-47.480000,4,1,0,0,0,...,1,1,0,1,0,0,1,0,1,0
576,0,1.0,0,0,-47.480000,4,0,1,0,1,...,1,1,0,1,0,0,1,0,1,0


In [289]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, balanced_accuracy_score, matthews_corrcoef

def do_regression(df, y_col, predictor_list): 
    train, test = train_test_split(df, test_size = .30, random_state = 101)
    X_train = train[predictor_list]
    y_train = train[y_col]
    X_test = test[predictor_list]
    y_test = test[y_col]
    logisticmodel = LogisticRegression(penalty='none', solver='lbfgs')
    logisticmodel.fit(X_test, y_test)
    probs = logisticmodel.predict_proba(X_test)[:, 1]
    y_hat = (probs > 0.5).astype(np.int64)
    accuracy = np.mean(y_test == y_hat)
    
    precision = precision_score(y_test, y_hat)
    recall = recall_score(y_test, y_hat)
    # Display the results
    print(f'Precision: {precision:.4f}')
    prevalence = np.mean(df[y_col] == 1)
    print(f"Prevalence: {prevalence}")
    print(f'Recall: {recall:.4f}')
    f1 = f1_score(y_test, y_hat)
    roc_auc = roc_auc_score(y_test, y_hat)
    average_precision = average_precision_score(y_test, y_hat)
    balanced_accuracy = balanced_accuracy_score(y_test, y_hat)
    print(f"Accuracy on test set: {accuracy}")


In [290]:
do_regression(output,'General Status',new_predictors)

Precision: 0.7200
Prevalence: 0.2508650519031142
Recall: 0.4286
Accuracy on test set: 0.8218390804597702


In [291]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, balanced_accuracy_score, matthews_corrcoef


def do_regression_cv(df, y_col, predictor_list):
    X = df[predictor_list]
    y = df[y_col]

    logisticmodel = LogisticRegression(penalty='none', solver='lbfgs')

    # Using cross-validation
    scoring = ['precision', 'recall', 'f1', 'roc_auc', 'average_precision', 'balanced_accuracy']
    cv_results = cross_validate(logisticmodel, X, y, cv=5, scoring=scoring)

    # Display the results
    for score in scoring:
        print(f"{score.capitalize()}: {np.mean(cv_results['test_' + score]):.4f}")

    prevalence = np.mean(y == 1)
    print(f"Prevalence: {prevalence}")

# Example usage
# do_regression(df, 'your_target_column', ['list', 'of', 'predictor', 'columns'])


In [292]:
do_regression_cv(output,'General Status',new_predictors)

Precision: 0.5612
Recall: 0.3931
F1: 0.4483
Roc_auc: 0.7977
Average_precision: 0.5770
Balanced_accuracy: 0.6458
Prevalence: 0.2508650519031142


In [272]:
from sklearn.metrics import confusion_matrix

In [256]:
# from sklearn.feature_selection import SequentialFeatureSelector
# feature_selector = SequentialFeatureSelector(
#         LogisticRegression(),
#         n_features_to_select="auto",
#         direction="forward"
#     )


In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.datasets import load_iris
# X, y = load_iris(return_X_y=True)
# knn = KNeighborsClassifier(n_neighbors=3)
# sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
# sfs.fit(X, y)
# sfs.get_support()
# sfs.transform(X).shape

In [141]:
# def ohe_and_avoid_multicollinearity(df, columns_to_ohe):
#     # nan_columns_dropped = []
#     # first_cols_dropped = []
#     my_last_fucking_straw = []
#     res = df.copy()
#     for col in columns_to_ohe:
#         ohe_df = pd.get_dummies(res[col], prefix=col,dummy_na=True)
#         res = res.drop(col, axis=1)
#         res = pd.concat([res, ohe_df], axis=1)
#         my_last_fucking_straw.append(col)
#             # nan_col = col + '_nan'
#             # if (nan_col in ohe_df.columns):
#             #     valid = (ohe_df[nan_col]==1).sum()>50
#             #     if valid:
#             #         ohe_df = ohe_df.drop(nan_col, axis=1)
#             #         nan_columns_dropped.append([col, nan_col])
#             # else:
#             #     first_cols_dropped.append([col, ohe_df.iloc[:, 0]])
#             #     ohe_df = ohe_df.iloc[:, 1:]
#             # # Merge the OHE columns back into the original dataframe

#             #print("\n".join(my_list))
#     # for i in nan_columns_dropped:
#     #     print(f"col:  {i[0]}\ndrop: {i[1]}\n")
#     # for i in first_cols_dropped:
#     #     print(f"col:  {i[0]}\ndrop: {i[1].name}\n")
#     # print(f"number of cols expected to OHE: {len(columns_to_ohe)}\nnumber of cols dropped: {len(nan_columns_dropped) + len(first_cols_dropped)}")    
#     print(f"{my_last_fucking_straw}")
#     return res

In [241]:
#new_df = ohe_and_avoid_multicollinearity(new_df, ohe_cols)

In [82]:
# def drop_columns(df, col_lst):
#     for i in col_lst:
#         if i in df.columns:
#             df.drop(i, axis=1)
#     return df

# new_df = drop_columns(new_df.copy(),['Primary.Outcome', 'District Abbrev','House Candidate','Brookings Candidate','total_runners_house','District','Previous.Electoral.Experience','SinglePayer'])
# new_df = move_column_to_front(new_df, ['General Status'])


In [83]:
new_df.to_csv(Path('ready_for_modelling.csv'))

In [84]:
# correlation_matrix = new_df.corr()
# cols = correlation_matrix.columns
# #correlation_matrix
# new_df = drop_columns(new_df,['Primary.Outcome', 'District Abbrev','House Candidate','Brookings Candidate','total_runners_house','District','Previous.Electoral.Experience','SinglePayer'])

# new_df

Unnamed: 0,General Status,Primary.Outcome,Brookings Candidate,House Candidate,District,Female,Listed.military.service.,Previous.Electoral.Experience,Position.on.Same.Sex.Marriage,SinglePayer,...,PCCC Endorsed?_0.0,PCCC Endorsed?_1.0,Indivisible Endorsed?_0.0,Indivisible Endorsed?_1.0,WFP Endorsed?_0.0,WFP Endorsed?_1.0,VoteVets Endorsed?_0.0,VoteVets Endorsed?_1.0,No Labels Support?_0.0,No Labels Support?_1.0
0,0,0,lizetta mcconnell,lizzetta hill mcconnell,AL-1,0.0,0,,0,,...,0,0,0,0,0,0,0,0,0,0
1,1,1,robert kennedy,robert kennedy jr.,AL-1,1.0,1,,0,,...,0,0,0,0,0,0,0,0,0,0
2,0,0,audri williams,audri scott williams,AL-2,0.0,1,,0,"Yes, candidate supports universal healthcare r...",...,0,0,0,0,0,0,0,0,0,0
3,1,1,tabitha isner,tabitha isner,AL-2,0.0,0,,0,"Yes, candidate supports universal healthcare r...",...,0,0,0,0,0,0,0,0,0,0
4,0,0,adia winfrey,adia mcclellan winfrey,AL-3,0.0,0,,0,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,1,1,talley sergent,talley sergent,WV-2,0.0,0,,0,,...,0,0,0,0,0,0,1,0,0,0
574,0,0,shirley love,shirley love,WV-3,1.0,0,Candidate mentions previous elected office exp...,0,,...,0,0,0,0,1,0,1,0,0,0
575,0,0,janice hagerman,janice hagerman,WV-3,0.0,0,,0,,...,0,0,0,0,1,0,1,0,0,0
576,0,0,paul davis,paul davis,WV-3,1.0,0,,0,,...,0,0,0,0,1,0,1,0,0,0


In [44]:
# y = new_df['General Status']# a series
# y

0      0
1      1
2      0
3      1
4      0
      ..
573    1
574    0
575    0
576    0
577    1
Name: General Status, Length: 578, dtype: int64

In [46]:
# X = drop_columns(new_df, ['General Status']) # a df
# #X = X.drop(['General Status'],axis=1)
# #drop_columns
# X.copy()

Unnamed: 0,Female,Listed.military.service.,Position.on.Same.Sex.Marriage,total_runners_brookings,Partisan Lean,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,STEM?,...,PCCC Endorsed?_0.0,PCCC Endorsed?_1.0,Indivisible Endorsed?_0.0,Indivisible Endorsed?_1.0,WFP Endorsed?_0.0,WFP Endorsed?_1.0,VoteVets Endorsed?_0.0,VoteVets Endorsed?_1.0,No Labels Support?_0.0,No Labels Support?_1.0
0,0.0,0,0,2,-30.680000,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1,0,2,-30.680000,1.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,1,0,2,-33.080002,1.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0,0,2,-33.080002,0.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0,0,2,-33.660000,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,0.0,0,0,2,-35.330002,0.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,1,0,0,0
574,1.0,0,0,4,-47.480000,0.0,0.0,0.0,0,1.0,...,0,0,0,0,1,0,1,0,0,0
575,0.0,0,0,4,-47.480000,0.0,0.0,0.0,0,1.0,...,0,0,0,0,1,0,1,0,0,0
576,1.0,0,0,4,-47.480000,0.0,0.0,0.0,0,1.0,...,0,0,0,0,1,0,1,0,0,0


In [None]:
X.columns

In [None]:
# import statsmodels.api as sm

# def aic(X, y):
#     best_features = []
#     best_aic = float('inf')

#     for feature in X.columns:
#         # Create a temporary DataFrame with the current set of best features plus the new feature
#         X_temp = sm.add_constant(X[best_features + [feature]])

       
#         model = sm.Logit(y, X_temp)  # Fit the logistic regression model
#         result = model.fit()  # disp=0 suppresses the fit output

#         # Check AIC and update if it is lower
#         if result.aic < best_aic:
#             best_aic = result.aic
#             best_features.append(feature)

#     # Fit the final model with the best features
#     X_final = sm.add_constant(X[best_features])
#     final_model = sm.Logit(y, X_final)
#     final_result = final_model.fit(disp=0)

#     # You might want to return the final model, its summary, or AIC
#     return final_result.summary(), final_result.aic


In [None]:
# def aic(X, y):
#     best_features = []
#     best_aic = float('inf') 

#     for feature in X.columns:
#         # Add a constant term and the current feature
#         X = sm.add_constant(X[best_features + [feature]])

#         # Fit the logistic regression model
#         model = sm.Logit(y, X)
#         result = model.fit()

#         # Check AIC and update if it is lower
#         if result.aic < best_aic:
#             best_aic = result.aic
#             best_features.append(feature)

#     # Fit the final model with the best features
#     X_final = sm.add_constant(X[best_features])
#     final_model = sm.Logit(y, X_final)
#     final_result = final_model.fit()

#     print(final_result.summary(), final_result.aic)

In [None]:
# aic(X.copy(),y.copy())

The smallest possible value for VIF is 1, which indicates the complete absence of collinearity. Typically in practice there is a small amount of collinearity among the predictors. As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity.In the Credit data, a regression of balance on age, rating, and limit indicates that the predictors have VIF values of 1.01, 160.67, and 160.59. As we suspected, there is considerable collinearity in the data!When faced with the problem of collinearity, there are two simple solu- tions. The first is to drop one of the problematic variables from the regres- sion. This can usually be done without much compromise to the regression fit, since the presence of collinearity implies that the information that this variable provides about the response is redundant in the presence of the other variables. For instance, if we regress balance onto age and limit, without the rating predictor, then the resulting VIF values are close to the minimum possible value of 1, and the R2 drops from 0.754 to 0.75. So dropping rating from the set of predictors has effectively solved the collinearity problem without compromising the fit.

In [None]:
# correlation_df = pd.DataFrame(correlation_matrix.iloc[0]).abs().sort_values("General Status",ascending=False)
# correlation_df.head(3)

## Deprecated

In [549]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor 

In [548]:
#get_X_df(new_df, 'General Status').columns

In [353]:
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder(drop='first')
# encoder.fit(new_df[to_ohe])
# encoded_data = encoder.transform(new_df[to_ohe]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())
# final_df = pd.concat([new_df, encoded_df], axis=1)
# final_df.drop(to_ohe, axis=1, inplace=True)

In [264]:
# #new_df.columns

# new_df[['General Status', 'Partisan Lean','Candidate.Gender',
#        'Listed.military.service.', 'Education', 'Marital.Status',
#        'Previous.Electoral.Experience',
#        'Position.on.Affordable.Care.Act..ObamaCare.',
#        'Position.on.Minimum.Wage', 'Position.on.Federal.Taxes',
#        'Position.on.Business.Regulations', 'Position.on.National.Debt.Deficit',
#        'Position.on.Social.Security', 'Position.on.Gun.Control',
#        'Position.on.Immigration', 'Position.on.Abortion',
#        'Position.on.Same.Sex.Marriage', 'Position.on.Criminal.Justice.Reform',
#        'Position.on.Federal.K.12.Education.Policy',
#        'Position.on.Climate.Change', 'Position.on.Campaign.Finance.Reform',
#        'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
#        'Position.on.Defense.Spending', 'Position.on.Handling.Terrorism.Abroad',
#        'Position.on.Russia', 'Party.Category', 'Trump.Mention',
#        'Obama.Mention', 'Sanders.Mention', 'Clinton.Mention',
#        'Special.Counsel.Mention', 'Travel.Ban.Mention', 'SinglePayer',
#         'Race', 'Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?', 'Obama Alum?',
#        'Party Support?', 'Emily Endorsed?', 'Guns Sense Candidate?',
#        'Biden Endorsed?', 'Warren Endorsed? ', 'Sanders Endorsed?',
#        'Our Revolution Endorsed?', 'Justice Dems Endorsed?', 'PCCC Endorsed?',
#        'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?',
#        'No Labels Support?']]

'total_runners'

In [146]:
# encoder = OneHotEncoder(drop='first')
# encoder.fit(new_df[to_ohe])
# encoded_data = encoder.transform(new_df[to_ohe]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())


# final_df = pd.concat([new_df, encoded_df], axis=1)
# final_df.drop(to_ohe, axis=1, inplace=True)

Index(['General Status_1.0', 'Partisan Lean_-62.060001',
       'Partisan Lean_-61.459999', 'Partisan Lean_-60.810001',
       'Partisan Lean_-55.029999', 'Partisan Lean_-54.009998',
       'Partisan Lean_-49.509998', 'Partisan Lean_-49.110001',
       'Partisan Lean_-49.009998', 'Partisan Lean_-48.73',
       ...
       'PCCC Endorsed?_Yes', 'PCCC Endorsed?_nan', 'Indivisible Endorsed?_Yes',
       'Indivisible Endorsed?_nan', 'WFP Endorsed?_Yes', 'WFP Endorsed?_nan',
       'VoteVets Endorsed?_Yes', 'VoteVets Endorsed?_nan',
       'No Labels Support?_Yes', 'No Labels Support?_nan'],
      dtype='object', length=315)

In [265]:
#correlation_df = pd.DataFrame(correlation_matrix.iloc[0]).abs().sort_values("General Status",ascending=False)
#correlation_df.head(30)/


## Step 2: Drop one column from each OHE variable

## Step 3: Use forward selection to model select