In [14]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from tabulate import tabulate
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# from stargazer.stargazer import Stargazer
import scipy.stats as stats
import seaborn as sns
import os

In [15]:
# File Paths
absolute_path = "C:\\Users\\tykun\\\OneDrive\\Documents\\SchoolDocs\VSCodeProjects\\connectedData\\board_analysis\\"
final_scripts = "final_scripts\\"
regression_stats = "regression\\regression_stats\\"

years = [ "2010", "2011", "2013", "2018"]


  absolute_path = "C:\\Users\\tykun\\\OneDrive\\Documents\\SchoolDocs\VSCodeProjects\\connectedData\\board_analysis\\"


In [16]:
university_stats_path = r"C:\Projects\connecteddatahub\data\statistics\regression_data_with_grants.csv"
university_board_statistics_df = pd.read_csv(university_stats_path)

In [17]:

dependent_var = "female_president"
year_var = "Year"


independent_vars = [
    "student.women", 
    "student.size", 
    "female_proportion",
    "billionaire_proportion",
    "total_members",
    "betweenness",
    "degree",
    "faculty.race_ethnicity.white",
    "poc_proportion",
    "board_turnover",
    "control",
    "faculty.women",
    "strength",
    "cost.tuition.out_of_state",
    "school.faculty_salary",
    "RD_expenditure",
    "clustering",
    # "Rank",
    "num_grants",
    "grant_amount"
]


cols_to_check = [dependent_var] + independent_vars + [year_var, "control"]
regression_data = university_board_statistics_df.dropna(subset=cols_to_check).copy()

regression_data[dependent_var] = regression_data[dependent_var].astype(int)

#one-hot encode Year and control.
# drop_first=True avoids the dummy variable trap.
regression_data = pd.get_dummies(
    regression_data,
    columns=[year_var, "control"],
    drop_first=True
)

#identify the dummy columns for Year and control.
year_dummies = [col for col in regression_data.columns if col.startswith(f"{year_var}_")]
control_dummies = [col for col in regression_data.columns if col.startswith("control_")]

#remove the original "control" from independent_vars and add our new dummy variables.
predictor_vars = [var for var in independent_vars if var not in ["control"]] + year_dummies + control_dummies

X = regression_data[predictor_vars]
y = regression_data[dependent_var]

#ensure all predictor columns are numeric.
X = X.apply(pd.to_numeric, errors='raise')

#normalize continuous predictors.
#exclude dummy variables (for year and control) from scaling.
continuous_vars = [
    "student.women", 
    "student.size", 
    "female_proportion",
    "billionaire_proportion",
    "total_members",
    "betweenness",
    "degree",
    "faculty.race_ethnicity.white",
    "poc_proportion",
    "board_turnover",
    "faculty.women",
    "strength",
    "cost.tuition.out_of_state",
    "school.faculty_salary",
    "RD_expenditure",
    "clustering",
    # "Rank",
    "num_grants",
    "grant_amount"
]

scaler = StandardScaler()
vars_to_scale = [var for var in continuous_vars if var in X.columns]
X_scaled = X.copy()
X_scaled[vars_to_scale] = scaler.fit_transform(X_scaled[vars_to_scale])

# Ensure the scaled data is numeric.
X_scaled = X_scaled.apply(pd.to_numeric, errors='raise')

# Add an intercept.
X_scaled = sm.add_constant(X_scaled)
# Force all columns to float to avoid object dtype errors.
X_scaled = X_scaled.astype(float)

# Fit the logistic regression model.
logit_model = sm.Logit(y, X_scaled)
result = logit_model.fit()


         Current function value: 0.295493
         Iterations: 35




In [18]:
# Print standard summary
print(result.summary())

# Pseudo R² (McFadden’s)
llf = result.llf       # log-likelihood of fitted model
llnull = result.llnull # log-likelihood of null model
pseudo_r2 = 1 - (llf / llnull)
print(f"\nMcFadden's pseudo R²: {pseudo_r2:.4f}")

# Coefficients, p-values, odds ratios, and 95% CI
summary_df = pd.DataFrame({
    "Coefficient": result.params,
    "Std.Err": result.bse,
    "z": result.tvalues,
    "p-value": result.pvalues,
    "Odds Ratio": np.exp(result.params),
    "CI Lower": np.exp(result.conf_int()[0]),
    "CI Upper": np.exp(result.conf_int()[1])
})

print("\nDetailed results with odds ratios:")
print(summary_df)


                           Logit Regression Results                           
Dep. Variable:       female_president   No. Observations:                  718
Model:                          Logit   Df Residuals:                      694
Method:                           MLE   Df Model:                           23
Date:                Sun, 28 Sep 2025   Pseudo R-squ.:                 0.09273
Time:                        20:14:08   Log-Likelihood:                -212.16
converged:                      False   LL-Null:                       -233.85
Covariance Type:            nonrobust   LLR p-value:                  0.006258
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                          -13.1155    312.265     -0.042      0.966    -625.144     598.913
student.women                   -0.1947      0.203     -0.960      0.337     