In [8]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import statsmodels.api as sm
from fuzzywuzzy import fuzz
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import random
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, balanced_accuracy_score, matthews_corrcoef

pip install fuzzywuzzy python-Levenshtein

In [9]:
dem_candidates = pd.read_csv('../datasets/dem_candidates.csv')

# cleaning the datasets to prepare for merging

In [98]:
def binarize_cols(df, nan_threshold=20):
    few_NaNs_cols = []
    for col in df.columns:
        if (df[col].nunique(dropna=False) == 2) or (df[col].nunique(dropna=True) == 2):
            unique_vals = df[col].unique()
            if (unique_vals[0] !=0 and unique_vals[1] !=0):
                print(f"col:{col}\n0 means:{unique_vals[0]}\n1 means: {unique_vals[1]}\n")
                df[col] = df[col].replace({unique_vals[0]: int(0), unique_vals[1]: int(1)})
            df[col].fillna(0, inplace=True) #now this will only make sense tho if we OHE first.    
    return df


def drop_columns(df, col_lst):
    for i in col_lst:
        if i in df.columns:
            df.drop(i, axis=1,inplace=True)
    return df

def lowercase_column(df, column_name):
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(str).str.lower()
    return df

def count_candidates_by_district(df, district_column, new_column_name):
    district_counts = df.groupby(district_column).size().reset_index(name=new_column_name)
    df_merged = df.merge(district_counts, on=district_column)
    df_merged = df_merged[df_merged[new_column_name]>1]
    return df_merged

def move_column_to_front(df, col_lst):
    for column_name in col_lst:
        if column_name in df.columns:
            df = df[[column_name] + [col for col in df.columns if col != column_name]]
    return df

def get_info(df):
    print(f"COL VALUE TYPES \n{df.dtypes} \n\ndf shape:{df.shape}\n\nall the columns:\n{df.columns}")
    
    
def rename_column_if_exists(df, old_name, new_name):
    if old_name in df.columns:
        df.rename(columns={old_name: new_name}, inplace=True)
    return df
    
def replace_values_fill_na(df, column_name, replace_dict):
    if column_name in df.columns:
        df[column_name] = df[column_name].replace(replace_dict).fillna(0)
    return df
    
    
def get_ohe_cols(df, unique_limit=16, exclude = ['total_runners_house','total_runners']):
    #object_cols = df.select_dtypes(include=['object']).columns
    res = [col for col in df.columns if ((df[col].nunique(dropna=False) <= unique_limit) and (df[col].nunique(dropna=False)>2) and (col != 'total_runners') and (col != 'General Status') and (col !='Total Endorsements'))]
    return res


def convert_type(df, col_lst):
    for column_name in col_lst:
        if column_name in df.columns and df[column_name].dtype != 'int64':
            df[column_name] = df[column_name].astype(int)

            
def get_X_df(df, y_col):
    return df.drop(y_col, axis=1)

def zero_dropper(df):
    for col in df.columns:
        if ((df[col]==0).mean() >= 0.90):
            df = df.drop(col, axis=1)
    return df

def drop_VIF_col(X, threshold=2):
    while True:
        vif_info = pd.DataFrame()
        vif_info['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
        vif_info['Column'] = X.columns
        vif_info = vif_info.sort_values('VIF', ascending=False)
        max_VIF = vif_info['VIF'].iloc[0]

        if max_VIF > threshold:
            to_drop = vif_info['Column'].iloc[0]
            print(f"Dropped: {to_drop} with VIF of {max_VIF}")
            X = X.drop(to_drop, axis=1)
        else:
            break

    return X  # Returning the modified DataFrame

def change_to_int(df, column):
    df[column] = df[column].astype(int)
    
    
def convert_to_int_if_possible(df, col_lst):
    for column_name in col_lst:
        temp_col = pd.to_numeric(df[column_name], errors='coerce')
        if temp_col.isna().sum() == 0:
            if all(temp_col.dropna() == temp_col.dropna().astype(int)):
                df[column_name] = temp_col.astype(int)
    return df

def aic(X, y):
    best_features = []
    best_aic = float('inf')

    for feature in X.columns:
        # Create a temporary DataFrame with the current set of best features plus the new feature
        X_temp = sm.add_constant(X[best_features + [feature]])

       
        model = sm.Logit(y, X_temp)  # Fit the logistic regression model
        result = model.fit()  # disp=0 suppresses the fit output

        # Check AIC and update if it is lower
        if result.aic < best_aic:
            best_aic = result.aic
            best_features.append(feature)

    # Fit the final model with the best features
    X_final = sm.add_constant(X[best_features])
    final_model = sm.Logit(y, X_final)
    final_result = final_model.fit()

    # You might want to return the final model, its summary, or AIC
    print(f"{best_features}")
    return final_result.summary(), final_result.aic

In [99]:
#All subsets of size k
def get_combinations(arr, k):
    if k == 0:
        return [[]]
    if not arr: 
        return []
    return get_combinations(arr[1:], k) + [x + [arr[0]] for x in get_combinations(arr[1:], k-1)] 

#Best Subsets Logisitc Regression using AIC

def best_aic_per_k_function(X, y):
    best_aic_per_k = {}
    
    for k in range(1, 4): # Looping through all possible sizes of predictor subsets
    
        best_aic_k = float("inf")    # Getting all combinations of predictors of size k without using itertools
        best_model_k = None
        best_variables_k = None
    
    for variables in get_combinations(list(X.columns), k):
        predictors = X[list(variables)]
        predictors = sm.add_constant(predictors)  # Add a constant (intercept) to the model
        
        # Fitting the model
        model = sm.Logit(y, predictors).fit()  # Fit the logistic regression model
        
        # Updating best AIC model for each k
        if model.aic < best_aic_k:
            best_aic_k = model.aic
            best_model_k = model
            best_variables_k = variables  # Storing variable names
    
    
    best_aic_per_k[k] = (best_aic_k, best_model_k, best_variables_k) # Storing the best AIC and variable names for each k in the dictionary

    for k, (aic, model, variables) in best_aic_per_k.items():  # Displaying the results
        print(f"\nBest AIC value for k={k} variables: {aic}")
        print(f"Variables in the model: {variables}")
        print(f"Model parameters:\n{model.params}")

In [100]:
house = dem_candidates[dem_candidates['Office Type'] == 'Representative'] #filtering out non-house races
house = house[house['Race Type'] != 'Special']
house = rename_column_if_exists(house, 'Candidate', 'House Candidate')
house['dist_num'] = house['District'].str.extract('(\d+)$')
house['District Abbrev'] = house['State']+ '-' +house['dist_num']

In [101]:
qualities = ['Veteran?', 'LGBTQ?', 'STEM?', 'Race','Obama Alum?', 'Self-Funder?', 'Elected Official?'] #nulls in these columns mean no website for the candidate was found. 
# house[qualities] = house[qualities].replace({'No': 0, 'Yes': 1}).fillna(0)
# house[qualities] = house[qualities].replace({'White': 0, 'Nonwhite': 1}).fillna(0)
house = replace_values_fill_na(house,'General Status', {'None': 0, 'On the Ballot': 1})
house = house.reset_index(drop=True)

In [102]:
#of the resulting 22 rows, finding that 8 are NaNs due to the election not happenign yet. 
runoff_winners = ['Kendra Horn','Jason Nichols', 'Tim Gilpin', 'Mary Brannon']
house.loc[house['House Candidate'].isin(runoff_winners),'General Status'] = 'On the Ballot'
house.loc[house['House Candidate'].isin(runoff_winners), 'Primary Runoff Status'] = 'Advanced'
runoff_losers = ['Tom Guild', 'Clay Padgett','Amanda Douglas','Fred Gipson']
house.loc[house['House Candidate'].isin(runoff_losers),'General Status'] = 'None'

In [103]:
brookings = pd.read_csv('brookings.csv')

brookings = brookings[(brookings['Candidate.Party'].str.contains('Democrat')) & (brookings['Incumbent'].isnull())& (brookings['Primary.Outcome'].isin(['Winner','Loser']))]
brookings['Brookings Candidate'] = brookings['Candidate.First.Name'] + ' ' + brookings['Candidate.Last.Name']
brookings['District'] = brookings['Candidate.State'] + '-' + brookings['Candidate.District'].astype(str)
#brookings['Primary.Outcome'] = brookings['Primary.Outcome'].replace({'Loser': 0, 'Winner': 1})

replace_values_fill_na(brookings, 'Primary.Outcome', {'Loser': 0, 'Winner': 1})
brookings = drop_columns(brookings, ['Candidate.Party', 'Incumbent', 'Freshman.Member','Candidate.First.Name','Candidate.Last.Name','Republican',
                                     'Candidate.Website.URL', 'Primary.Runoff.Outcome', 'Party.Category.1','Candidate.State','Candidate.District',
                                     'Incumbency','Candidate.Gender','Democrat','Unnamed: 0'
                                    ])
brookings = move_column_to_front(brookings, ['Female','Brookings Candidate','District','Primary.Outcome'])
brookings = brookings.reset_index(drop=True)


In [75]:
house = drop_columns(house, ['dist_num', 'State', 'District', 'Office Type', 'Race Type',
       'Race Primary Election Date', 'Primary Status', 'Primary Runoff Status','Primary %', 'Won Primary'])

house = move_column_to_front(house, ['House Candidate','District Abbrev','General Status'])

In [76]:
house = count_candidates_by_district(house, 'District Abbrev', 'total_runners_house')
brookings = count_candidates_by_district(brookings, 'District', 'total_runners_brookings')

In [77]:
brookings = drop_columns(brookings, 'total_runners_brookings')

In [78]:
brookings = lowercase_column(brookings, 'Brookings Candidate')
house = lowercase_column(house, 'House Candidate')

In [79]:
house = house.reset_index(drop=True)
brookings = brookings.reset_index(drop=True)

# we can merge on 
- brookings['Primary.Outcome'] == house['General Status']
- brookings['District'] == house['District Abbrev']
- brookings['Candidate'] fuzzy match with house['Candidate'] with 60 percent match (this should be enough so long as the other two conditions are also matched)


Exporting this to two csvs to merge. Merging code in Merging Datasets

In [80]:
def fuzzy_join(brookings, house, threshold=50):
    matched_pairs = []

    for index1, row1 in brookings.iterrows():
        for index2, row2 in house.iterrows():
            if (row1['District'] == row2['District Abbrev'] and row1['Primary.Outcome'] == row2['General Status']):
                similarity_score = fuzz.ratio(row1['Brookings Candidate'], row2['House Candidate'])
                if similarity_score > threshold:
                    matched_pairs.append((index1, index2))
    matched_df = pd.DataFrame(columns=list(brookings.columns) + list(house.columns))
    for index1, index2 in matched_pairs:
        matched_row = pd.concat([brookings.iloc[[index1]].reset_index(drop=True), 
                                 house.iloc[[index2]].reset_index(drop=True)], axis=1)
        matched_df = matched_df.append(matched_row, ignore_index=True)

    return matched_df

In [81]:
output = fuzzy_join(brookings, house, threshold=70)

In [82]:
output = move_column_to_front(output,['Female','total_runners_house','Primary.Outcome','General Status','Brookings Candidate','House Candidate'])

In [83]:
change_to_int(output, 'Female')
change_to_int(output, 'General Status')
change_to_int(output, 'total_runners_house')

In [84]:
#output.to_csv(Path('joined_dfs.csv'))

In [85]:
output = drop_columns(output, ['District','Brookings Candidate', 'District Abbrev','House Candidate','Primary.Outcome'])
output = move_column_to_front(output, ['total_runners_house','total_runners_house','General Status'])

In [86]:
ohe_this = output.copy()

In [104]:

FEC_merged = pd.read_csv('FEC_merged.csv')
FEC_merged = convert_to_int_if_possible(FEC_merged,list(FEC_merged.columns))

In [105]:
def do_all_the_stuff(df):
    ohe_this = df.copy()
    ohe_cols =  get_ohe_cols(ohe_this)
    
    ohe_result = pd.get_dummies(ohe_this, columns=ohe_cols)
    ohe_result = drop_columns(ohe_result, ohe_cols)
    
    to_binarize = ohe_result.copy()
    binarized = binarize_cols(to_binarize)
    
    to_resample = binarized.copy()
    X_to_oversample = to_resample.drop(['General Status'],axis=1)
    y_to_oversample = to_resample['General Status']
    
    ros = RandomOverSampler(sampling_strategy=0.6,random_state=101)
    ros.fit(X_to_oversample, y_to_oversample)
    
    X_resampled, y_resampled = ros.fit_resample(X_to_oversample, y_to_oversample)
    combined_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)
    
    # Shuffle the combined DataFrame
    shuffled_df = combined_df.sample(frac=1, random_state=101).reset_index(drop=True)
    
    # Split the DataFrame back into X and y
    X_resampled_shuffled = shuffled_df.iloc[:, :-1]
    y_resampled_shuffled = shuffled_df.iloc[:, -1]
    shuffled_df = move_column_to_front(shuffled_df,['General Status'])
    total_runners_cols = [col for col in df.columns if "total_runners" in col]
    shuffled_df = drop_columns(shuffled_df, total_runners_cols)
    
    to_drop_sparse = shuffled_df.copy()
    dropped_sparse = zero_dropper(to_drop_sparse)
    X = dropped_sparse.drop('General Status', axis=1)
    best_X = drop_VIF_col(X)
    best_X_lst = list(best_X.columns)
    
    dropped_sparse = dropped_sparse.sample(frac=1, random_state=101).reset_index(drop=True)
    fwd_selection_X = dropped_sparse[best_X_lst]
    random.shuffle(list(fwd_selection_X.columns))
    fwd_selection_X = fwd_selection_X.astype(float)
    fwd_selection_y = dropped_sparse['General Status']
    
    #best_aic_per_k_function(X, y)
    
    
    #print(f"adityas function \n")
    #best_aic_per_k_function(fwd_selection_X,fwd_selection_y)
    
    return best_X_lst, dropped_sparse #RETURNS the best predicotrs to use, and the dataframe containing those columns and more (including "General status") ! 


In [106]:
X_lst, df = do_all_the_stuff(FEC_merged)

col:Listed.military.service.
0 means:No, the candidate does not include any information about serving in the military
1 means: Yes, the candidate serves or served in the military

col:Previous.Electoral.Experience
0 means:nan
1 means: Candidate mentions previous elected office experience

col:Position.on.Same.Sex.Marriage
0 means:Candidate provides no information
1 means: Candidate supports marriage equality measures

col:SinglePayer
0 means:nan
1 means: Yes, candidate supports universal healthcare reforms

Dropped: Position.on.Campaign.Finance.Reform_Candidate provides no information with VIF of 75.44409825586024
Dropped: Position.on.Criminal.Justice.Reform_Candidate provides no information with VIF of 63.0965066005967
Dropped: Position.on.Legalization.Decriminalization.of.Marijuana.Policy_Candidate provides no information with VIF of 47.33760243297395
Dropped: Position.on.Gun.Control_Candidate supports gun control measures with VIF of 43.4924403473346
Dropped: Position.on.National.De

In [112]:
exclude_cols = 
included_cols = [col for col in X_lst if col not in exclude_cols]


TypeError: bad operand type for unary ~: 'str'

In [109]:
aic(df[X_lst].astype(float),df['General Status'])

Optimization terminated successfully.
         Current function value: 0.518999
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.496274
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.496270
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.492693
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.491508
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.492004
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.458711
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.432286
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.425326
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.423501
  

(<class 'statsmodels.iolib.summary.Summary'>
 """
                            Logit Regression Results                           
 Dep. Variable:         General Status   No. Observations:                  707
 Model:                          Logit   Df Residuals:                      690
 Method:                           MLE   Df Model:                           16
 Date:                Sun, 10 Dec 2023   Pseudo R-squ.:                  0.4529
 Time:                        07:16:06   Log-Likelihood:                -255.86
 converged:                       True   LL-Null:                       -467.66
 Covariance Type:            nonrobust   LLR p-value:                 4.062e-80
                                                                                                                       coef    std err          z      P>|z|      [0.025      0.975]
 --------------------------------------------------------------------------------------------------------------------------------

Now we do regression

In [114]:
cols_to_use = ['Partisan Lean', 'receipts', 'Total Endorsements', 'Female', 'Race'
 , 'STEM?', 'SinglePayer'
 , 'Party Support?_0.0', 'Emily Endorsed?_0.0'
 , 'Justice Dems Endorsed?_0.0', 'Indivisible Endorsed?_0.0'
 , 'VoteVets Endorsed?_0.0', 'Education_J.D.'
 , 'Position.on.Affordable.Care.Act..ObamaCare._Candidate provides no information'
 , 'Position.on.National.Debt.Deficit_Candidate calls for lowering the national debt or calls for deficit reduction'
 , 'Position.on.Immigration_Candidate provides no information'
]


the_droppables = ['Listed.military.service.','Elected Official?','Gun Sense Candidate?_0.0','Our Revolution Endorsed?_0.0','Education_No Education Information Listed','Education_Master\'s Degree (includes MBA)']
#Our Revolution Endorsed?_0.0

In [93]:
#Precision: Measures the proportion of true positives among all predicted positives. Good for minimizing false positives.
#Recall (Sensitivity): Measures the proportion of actual positives correctly identified. Important when missing true positives is costly.

In [94]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, balanced_accuracy_score, matthews_corrcoef

def do_regression(df, y_col, predictor_list): 
    
    train, test = train_test_split(df, test_size = .30, random_state = 101)
    X_train = train[predictor_list]
    y_train = train[y_col]
    X_test = test[predictor_list]
    y_test = test[y_col]
    logisticmodel = LogisticRegression(penalty='none', solver='lbfgs')
    logisticmodel.fit(X_test, y_test)
    probs = logisticmodel.predict_proba(X_test)[:, 1]
    y_hat = (probs > 0.5).astype(np.int64)
    accuracy = np.mean(y_test == y_hat)
    
    precision = precision_score(y_test, y_hat)
    recall = recall_score(y_test, y_hat)
    # Display the results
    print(f'Precision: {precision}')
    prevalence = np.mean(df[y_col] == 1)
    print(f"Prevalence: {prevalence}")
    print(f'Recall: {recall:.4f}')
    f1 = f1_score(y_test, y_hat)
    roc_auc = roc_auc_score(y_test, y_hat)
    average_precision = average_precision_score(y_test, y_hat)
    balanced_accuracy = balanced_accuracy_score(y_test, y_hat)
    print(f"Accuracy on test set: {accuracy}")

In [95]:
do_regression(df, 'General Status', cols_to_use)

Precision: 0.3314917127071823
Prevalence: 0.37482319660537483
Recall: 0.9231
Accuracy on test set: 0.4084507042253521


In [97]:
# ohe_result = pd.get_dummies(ohe_this, columns=ohe_cols)

# ohe_result = drop_columns(ohe_result, ohe_cols)
# #ohe_result

In [None]:
ohe_result.columns

In [None]:
to_binarize = ohe_result.copy()
binarized = binarize_cols(to_binarize)
#binarized

In [None]:
binarized
to_resample = binarized.copy()
to_resample.columns

In [None]:
to_resample['No Labels Support?_No'].sum()

In [None]:
to_resample['No Labels Support?_Yes'].sum()

In [None]:
X_to_oversample = to_resample.drop(['General Status'],axis=1)
y_to_oversample = to_resample['General Status']

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy=0.4,random_state=101)

ros.fit(X_to_oversample, y_to_oversample)

X_resampled, y_resampled = ros.fit_resample(X_to_oversample, y_to_oversample)


In [None]:
combined_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)
# Shuffle the combined DataFrame
shuffled_df = combined_df.sample(frac=1, random_state=101).reset_index(drop=True)

# Split the DataFrame back into X and y
X_resampled_shuffled = shuffled_df.iloc[:, :-1]
y_resampled_shuffled = shuffled_df.iloc[:, -1]


In [None]:
shuffled_df.columns
shuffled_df = move_column_to_front(shuffled_df,['General Status'])

shuffled_df.columns
shuffled_df = drop_columns(shuffled_df, ['total_runners_brookings_2', 'Race_Nonwhite', 'Guns Sense Candidate?_No', 'Position.on.Legalization.Decriminalization.of.Marijuana.Policy_Candidate provides no information' ])
shuffled_df.shape



In [None]:
to_drop_sparse = shuffled_df.copy()
dropped_sparse = zero_dropper(to_drop_sparse)
dropped_sparse.columns

In [None]:
X = dropped_sparse.drop('General Status', axis=1)
best_X = drop_VIF_col(X)
best_X_lst = list(best_X.columns)

In [None]:
best_X_lst

In [None]:
y_best_X_lst = best_X_lst + ['General Status']

In [None]:
X = best_X.copy()
vif_info = pd.DataFrame()
vif_info['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_info['Column'] = X.columns
vif_info = vif_info.sort_values('VIF', ascending=False)

In [None]:
def aic(X, y):
    best_features = []
    best_aic = float('inf')

    for feature in X.columns:
        # Create a temporary DataFrame with the current set of best features plus the new feature
        X_temp = sm.add_constant(X[best_features + [feature]])

       
        model = sm.Logit(y, X_temp)  # Fit the logistic regression model
        result = model.fit()  # disp=0 suppresses the fit output

        # Check AIC and update if it is lower
        if result.aic < best_aic:
            best_aic = result.aic
            best_features.append(feature)

    # Fit the final model with the best features
    X_final = sm.add_constant(X[best_features])
    final_model = sm.Logit(y, X_final)
    final_result = final_model.fit(disp=0)

    # You might want to return the final model, its summary, or AIC
    print(f"{best_features}")
    return final_result.summary(), final_result.aic


In [None]:
shuffled_df = shuffled_df.sample(frac=1).reset_index(drop=True)
fwd_selection_X = shuffled_df.drop('General Status',axis=1).dropna()
fwd_selection_X = fwd_selection_X.astype(float)
fwd_selection_y = shuffled_df['General Status']

In [None]:
aic(fwd_selection_X, fwd_selection_y)

In [None]:
X = output.drop('General Status',axis=1).dropna()
X = X.astype(float)
y = output['General Status']

#aic(X, y)
output

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, balanced_accuracy_score, matthews_corrcoef

def do_regression(df, y_col, predictor_list): 
    train, test = train_test_split(df, test_size = .30, random_state = 101)
    X_train = train[predictor_list]
    y_train = train[y_col]
    X_test = test[predictor_list]
    y_test = test[y_col]
    logisticmodel = LogisticRegression(penalty='none', solver='lbfgs')
    logisticmodel.fit(X_test, y_test)
    probs = logisticmodel.predict_proba(X_test)[:, 1]
    y_hat = (probs > 0.5).astype(np.int64)
    accuracy = np.mean(y_test == y_hat)
    
    precision = precision_score(y_test, y_hat)
    recall = recall_score(y_test, y_hat)
    # Display the results
    print(f'Precision: {precision:.4f}')
    prevalence = np.mean(df[y_col] == 1)
    print(f"Prevalence: {prevalence}")
    print(f'Recall: {recall:.4f}')
    f1 = f1_score(y_test, y_hat)
    roc_auc = roc_auc_score(y_test, y_hat)
    average_precision = average_precision_score(y_test, y_hat)
    balanced_accuracy = balanced_accuracy_score(y_test, y_hat)
    print(f"Accuracy on test set: {accuracy}")


In [None]:
do_regression(output,'General Status',new_predictors)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, balanced_accuracy_score, matthews_corrcoef


def do_regression_cv(df, y_col, predictor_list):
    X = df[predictor_list]
    y = df[y_col]

    logisticmodel = LogisticRegression(penalty='none', solver='lbfgs')

    # Using cross-validation
    scoring = ['precision', 'recall', 'f1', 'roc_auc', 'average_precision', 'balanced_accuracy']
    cv_results = cross_validate(logisticmodel, X, y, cv=5, scoring=scoring)

    # Display the results
    for score in scoring:
        print(f"{score.capitalize()}: {np.mean(cv_results['test_' + score]):.4f}")

    prevalence = np.mean(y == 1)
    print(f"Prevalence: {prevalence}")

# Example usage
# do_regression(df, 'your_target_column', ['list', 'of', 'predictor', 'columns'])


In [None]:
do_regression_cv(output,'General Status',new_predictors)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector
# feature_selector = SequentialFeatureSelector(
#         LogisticRegression(),
#         n_features_to_select="auto",
#         direction="forward"
#     )


In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.datasets import load_iris
# X, y = load_iris(return_X_y=True)
# knn = KNeighborsClassifier(n_neighbors=3)
# sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
# sfs.fit(X, y)
# sfs.get_support()
# sfs.transform(X).shape

In [None]:
# def ohe_and_avoid_multicollinearity(df, columns_to_ohe):
#     # nan_columns_dropped = []
#     # first_cols_dropped = []
#     my_last_fucking_straw = []
#     res = df.copy()
#     for col in columns_to_ohe:
#         ohe_df = pd.get_dummies(res[col], prefix=col,dummy_na=True)
#         res = res.drop(col, axis=1)
#         res = pd.concat([res, ohe_df], axis=1)
#         my_last_fucking_straw.append(col)
#             # nan_col = col + '_nan'
#             # if (nan_col in ohe_df.columns):
#             #     valid = (ohe_df[nan_col]==1).sum()>50
#             #     if valid:
#             #         ohe_df = ohe_df.drop(nan_col, axis=1)
#             #         nan_columns_dropped.append([col, nan_col])
#             # else:
#             #     first_cols_dropped.append([col, ohe_df.iloc[:, 0]])
#             #     ohe_df = ohe_df.iloc[:, 1:]
#             # # Merge the OHE columns back into the original dataframe

#             #print("\n".join(my_list))
#     # for i in nan_columns_dropped:
#     #     print(f"col:  {i[0]}\ndrop: {i[1]}\n")
#     # for i in first_cols_dropped:
#     #     print(f"col:  {i[0]}\ndrop: {i[1].name}\n")
#     # print(f"number of cols expected to OHE: {len(columns_to_ohe)}\nnumber of cols dropped: {len(nan_columns_dropped) + len(first_cols_dropped)}")    
#     print(f"{my_last_fucking_straw}")
#     return res

In [None]:
#new_df = ohe_and_avoid_multicollinearity(new_df, ohe_cols)

In [None]:
# def drop_columns(df, col_lst):
#     for i in col_lst:
#         if i in df.columns:
#             df.drop(i, axis=1)
#     return df

# new_df = drop_columns(new_df.copy(),['Primary.Outcome', 'District Abbrev','House Candidate','Brookings Candidate','total_runners_house','District','Previous.Electoral.Experience','SinglePayer'])
# new_df = move_column_to_front(new_df, ['General Status'])


In [None]:
new_df.to_csv(Path('ready_for_modelling.csv'))

In [84]:
# correlation_matrix = new_df.corr()
# cols = correlation_matrix.columns
# #correlation_matrix
# new_df = drop_columns(new_df,['Primary.Outcome', 'District Abbrev','House Candidate','Brookings Candidate','total_runners_house','District','Previous.Electoral.Experience','SinglePayer'])

# new_df

Unnamed: 0,General Status,Primary.Outcome,Brookings Candidate,House Candidate,District,Female,Listed.military.service.,Previous.Electoral.Experience,Position.on.Same.Sex.Marriage,SinglePayer,...,PCCC Endorsed?_0.0,PCCC Endorsed?_1.0,Indivisible Endorsed?_0.0,Indivisible Endorsed?_1.0,WFP Endorsed?_0.0,WFP Endorsed?_1.0,VoteVets Endorsed?_0.0,VoteVets Endorsed?_1.0,No Labels Support?_0.0,No Labels Support?_1.0
0,0,0,lizetta mcconnell,lizzetta hill mcconnell,AL-1,0.0,0,,0,,...,0,0,0,0,0,0,0,0,0,0
1,1,1,robert kennedy,robert kennedy jr.,AL-1,1.0,1,,0,,...,0,0,0,0,0,0,0,0,0,0
2,0,0,audri williams,audri scott williams,AL-2,0.0,1,,0,"Yes, candidate supports universal healthcare r...",...,0,0,0,0,0,0,0,0,0,0
3,1,1,tabitha isner,tabitha isner,AL-2,0.0,0,,0,"Yes, candidate supports universal healthcare r...",...,0,0,0,0,0,0,0,0,0,0
4,0,0,adia winfrey,adia mcclellan winfrey,AL-3,0.0,0,,0,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,1,1,talley sergent,talley sergent,WV-2,0.0,0,,0,,...,0,0,0,0,0,0,1,0,0,0
574,0,0,shirley love,shirley love,WV-3,1.0,0,Candidate mentions previous elected office exp...,0,,...,0,0,0,0,1,0,1,0,0,0
575,0,0,janice hagerman,janice hagerman,WV-3,0.0,0,,0,,...,0,0,0,0,1,0,1,0,0,0
576,0,0,paul davis,paul davis,WV-3,1.0,0,,0,,...,0,0,0,0,1,0,1,0,0,0


In [44]:
# y = new_df['General Status']# a series
# y

0      0
1      1
2      0
3      1
4      0
      ..
573    1
574    0
575    0
576    0
577    1
Name: General Status, Length: 578, dtype: int64

In [46]:
# X = drop_columns(new_df, ['General Status']) # a df
# #X = X.drop(['General Status'],axis=1)
# #drop_columns
# X.copy()

Unnamed: 0,Female,Listed.military.service.,Position.on.Same.Sex.Marriage,total_runners_brookings,Partisan Lean,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,STEM?,...,PCCC Endorsed?_0.0,PCCC Endorsed?_1.0,Indivisible Endorsed?_0.0,Indivisible Endorsed?_1.0,WFP Endorsed?_0.0,WFP Endorsed?_1.0,VoteVets Endorsed?_0.0,VoteVets Endorsed?_1.0,No Labels Support?_0.0,No Labels Support?_1.0
0,0.0,0,0,2,-30.680000,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1,0,2,-30.680000,1.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,1,0,2,-33.080002,1.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0,0,2,-33.080002,0.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0,0,2,-33.660000,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573,0.0,0,0,2,-35.330002,0.0,0.0,0.0,0,1.0,...,0,0,0,0,0,0,1,0,0,0
574,1.0,0,0,4,-47.480000,0.0,0.0,0.0,0,1.0,...,0,0,0,0,1,0,1,0,0,0
575,0.0,0,0,4,-47.480000,0.0,0.0,0.0,0,1.0,...,0,0,0,0,1,0,1,0,0,0
576,1.0,0,0,4,-47.480000,0.0,0.0,0.0,0,1.0,...,0,0,0,0,1,0,1,0,0,0


In [None]:
X.columns

In [None]:
# import statsmodels.api as sm

# def aic(X, y):
#     best_features = []
#     best_aic = float('inf')

#     for feature in X.columns:
#         # Create a temporary DataFrame with the current set of best features plus the new feature
#         X_temp = sm.add_constant(X[best_features + [feature]])

       
#         model = sm.Logit(y, X_temp)  # Fit the logistic regression model
#         result = model.fit()  # disp=0 suppresses the fit output

#         # Check AIC and update if it is lower
#         if result.aic < best_aic:
#             best_aic = result.aic
#             best_features.append(feature)

#     # Fit the final model with the best features
#     X_final = sm.add_constant(X[best_features])
#     final_model = sm.Logit(y, X_final)
#     final_result = final_model.fit(disp=0)

#     # You might want to return the final model, its summary, or AIC
#     return final_result.summary(), final_result.aic


In [None]:
# aic(X.copy(),y.copy())

The smallest possible value for VIF is 1, which indicates the complete absence of collinearity. Typically in practice there is a small amount of collinearity among the predictors. As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity.In the Credit data, a regression of balance on age, rating, and limit indicates that the predictors have VIF values of 1.01, 160.67, and 160.59. As we suspected, there is considerable collinearity in the data!When faced with the problem of collinearity, there are two simple solu- tions. The first is to drop one of the problematic variables from the regres- sion. This can usually be done without much compromise to the regression fit, since the presence of collinearity implies that the information that this variable provides about the response is redundant in the presence of the other variables. For instance, if we regress balance onto age and limit, without the rating predictor, then the resulting VIF values are close to the minimum possible value of 1, and the R2 drops from 0.754 to 0.75. So dropping rating from the set of predictors has effectively solved the collinearity problem without compromising the fit.

In [None]:
# correlation_df = pd.DataFrame(correlation_matrix.iloc[0]).abs().sort_values("General Status",ascending=False)
# correlation_df.head(3)

## Deprecated

In [549]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor 

In [548]:
#get_X_df(new_df, 'General Status').columns

In [None]:
# def aic(X, y):
#     best_features = []
#     best_aic = float('inf') 

#     for feature in X.columns:
#         # Add a constant term and the current feature
#         X = sm.add_constant(X[best_features + [feature]])

#         # Fit the logistic regression model
#         model = sm.Logit(y, X)
#         result = model.fit()

#         # Check AIC and update if it is lower
#         if result.aic < best_aic:
#             best_aic = result.aic
#             best_features.append(feature)

#     # Fit the final model with the best features
#     X_final = sm.add_constant(X[best_features])
#     final_model = sm.Logit(y, X_final)
#     final_result = final_model.fit()

#     print(final_result.summary(), final_result.aic)

In [353]:
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder(drop='first')
# encoder.fit(new_df[to_ohe])
# encoded_data = encoder.transform(new_df[to_ohe]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())
# final_df = pd.concat([new_df, encoded_df], axis=1)
# final_df.drop(to_ohe, axis=1, inplace=True)

In [264]:
# #new_df.columns

# new_df[['General Status', 'Partisan Lean','Candidate.Gender',
#        'Listed.military.service.', 'Education', 'Marital.Status',
#        'Previous.Electoral.Experience',
#        'Position.on.Affordable.Care.Act..ObamaCare.',
#        'Position.on.Minimum.Wage', 'Position.on.Federal.Taxes',
#        'Position.on.Business.Regulations', 'Position.on.National.Debt.Deficit',
#        'Position.on.Social.Security', 'Position.on.Gun.Control',
#        'Position.on.Immigration', 'Position.on.Abortion',
#        'Position.on.Same.Sex.Marriage', 'Position.on.Criminal.Justice.Reform',
#        'Position.on.Federal.K.12.Education.Policy',
#        'Position.on.Climate.Change', 'Position.on.Campaign.Finance.Reform',
#        'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
#        'Position.on.Defense.Spending', 'Position.on.Handling.Terrorism.Abroad',
#        'Position.on.Russia', 'Party.Category', 'Trump.Mention',
#        'Obama.Mention', 'Sanders.Mention', 'Clinton.Mention',
#        'Special.Counsel.Mention', 'Travel.Ban.Mention', 'SinglePayer',
#         'Race', 'Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?', 'Obama Alum?',
#        'Party Support?', 'Emily Endorsed?', 'Guns Sense Candidate?',
#        'Biden Endorsed?', 'Warren Endorsed? ', 'Sanders Endorsed?',
#        'Our Revolution Endorsed?', 'Justice Dems Endorsed?', 'PCCC Endorsed?',
#        'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?',
#        'No Labels Support?']]

'total_runners'

In [146]:
# encoder = OneHotEncoder(drop='first')
# encoder.fit(new_df[to_ohe])
# encoded_data = encoder.transform(new_df[to_ohe]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())


# final_df = pd.concat([new_df, encoded_df], axis=1)
# final_df.drop(to_ohe, axis=1, inplace=True)

Index(['General Status_1.0', 'Partisan Lean_-62.060001',
       'Partisan Lean_-61.459999', 'Partisan Lean_-60.810001',
       'Partisan Lean_-55.029999', 'Partisan Lean_-54.009998',
       'Partisan Lean_-49.509998', 'Partisan Lean_-49.110001',
       'Partisan Lean_-49.009998', 'Partisan Lean_-48.73',
       ...
       'PCCC Endorsed?_Yes', 'PCCC Endorsed?_nan', 'Indivisible Endorsed?_Yes',
       'Indivisible Endorsed?_nan', 'WFP Endorsed?_Yes', 'WFP Endorsed?_nan',
       'VoteVets Endorsed?_Yes', 'VoteVets Endorsed?_nan',
       'No Labels Support?_Yes', 'No Labels Support?_nan'],
      dtype='object', length=315)

In [265]:
#correlation_df = pd.DataFrame(correlation_matrix.iloc[0]).abs().sort_values("General Status",ascending=False)
#correlation_df.head(30)/


## Step 2: Drop one column from each OHE variable

## Step 3: Use forward selection to model select