In [32]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from tabulate import tabulate
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
from scipy.stats import chi2
from sklearn.preprocessing import StandardScaler


In [33]:
# File paths
# Every directory string ends with a path separator because the paths below
# are built by plain string concatenation (f"{directory}{file_name}").
# All backslashes are escaped explicitly: the previous literal relied on the
# invalid escape sequences "\O" and "\V", which emit a SyntaxWarning on
# recent Python versions and left a doubled separator before "OneDrive".
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
college_matching = "college_matching\\"

# Absolute location of each project sub-directory.
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"


# Valid years (kept as strings)
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

# Created files
university_boards_statistics_path = f"{altered_dataframe_path}sample_board_statistics.csv"
university_admissions_path = f"{college_matching_path}university_admission_rate.csv"
university_demographics_path = f"{college_matching_path}university_student_demographics.csv"

In [34]:
# Load the three input tables (board statistics, admissions, demographics)
# from their CSV files, in that order.
(
    university_board_statistics_df,
    university_admissions_df,
    university_demographics_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        university_boards_statistics_path,
        university_admissions_path,
        university_demographics_path,
    )
)

In [35]:
def remove_non_samples(df):
    """Keep only the rows flagged as part of the primary sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``PrimarySample`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``PrimarySample`` value compares equal to
        ``True``; the original index labels are preserved and ``df`` itself
        is not modified.
    """
    is_primary_sample = df['PrimarySample'].eq(True)
    return df.loc[is_primary_sample]

In [36]:
# Normalize raw board counts into proportions for the regression.

def _proportion_or_zero(numerator, denominator):
    """Return ``numerator / denominator`` element-wise, or 0 where the denominator is not > 0.

    Vectorized replacement for a row-wise ``DataFrame.apply``: rows whose
    denominator is zero, negative or missing get 0 instead of inf/NaN.
    A missing numerator with a positive denominator still yields NaN.
    """
    return (numerator / denominator).where(denominator > 0, 0)


# Share of board members who are female.
university_board_statistics_df['female_proportion'] = _proportion_or_zero(
    university_board_statistics_df['female'],
    university_board_statistics_df['total_members'],
)

# Share of people of color; note the denominator is 'total_ethnicity',
# not 'total_members'.
university_board_statistics_df['poc_proportion'] = _proportion_or_zero(
    university_board_statistics_df['poc'],
    university_board_statistics_df['total_ethnicity'],
)

# Calculate the proportion of billionaires on the board.
# A zero board size is turned into NaN before dividing, and every NaN result
# (including one caused by a missing 'num_billionaires') is then set to 0.
university_board_statistics_df['billionaire_proportion'] = (
    university_board_statistics_df['num_billionaires'] /
    university_board_statistics_df['total_members'].replace(0, np.nan)
).fillna(0)

# Keep only the rows flagged as PrimarySample.
university_board_statistics_df = remove_non_samples(university_board_statistics_df)

# NOTE(review): this writes to the same path the frame was read from, so the
# non-sample rows dropped above are also removed from the CSV on disk.
# Consider writing to a separate output file to keep the input intact.
university_board_statistics_df.to_csv(university_boards_statistics_path, index=False)


In [None]:
# Logistic regression
# Models the binary outcome `female_president` on standardized board/school
# characteristics plus year dummy variables, then reports odds ratios and
# variance inflation factors (VIF) for the predictors.

dependent_var = "female_president"
year_var = "Year"
independent_vars = [
    "admissions.admission_rate.overall",
    "student.students_with_pell_grant",
    "female_proportion",
    "billionaire_proportion",
    "total_members",
    "eigenvector",
    "degree",
    "student.demographics.women",
    "poc_proportion",
    "board_turnover"
]

# Drop any rows with missing data (will remove years because of the incomplete carnegie data)
regression_data = university_board_statistics_df.dropna(
    subset=[dependent_var] + independent_vars + [year_var]
).copy()

# Ensure the dependent variable is stored as integers for the Logit model.
regression_data[dependent_var] = regression_data[dependent_var].astype(int)

# Make dummy variables for the year; drop_first=True leaves the first year
# out as the reference category.
regression_data = pd.get_dummies(
    regression_data,
    columns=[year_var],
    drop_first=True
)

# Type safety: cast the year dummies to int (0/1).
year_dummies = [col for col in regression_data.columns if col.startswith(f"{year_var}_")]
regression_data[year_dummies] = regression_data[year_dummies].astype(int)


all_predictors = independent_vars + year_dummies

X = regression_data[all_predictors]
y = regression_data[dependent_var]

# Normalize variables: every non-dummy predictor is standardized. The list is
# derived from `independent_vars` so the two cannot drift apart.
continuous_vars = list(independent_vars)

scaler = StandardScaler()
# Fit and transform only existing continuous variables
vars_to_scale = [var for var in continuous_vars if var in X.columns]
X_scaled = X.copy()
X_scaled[vars_to_scale] = scaler.fit_transform(X_scaled[vars_to_scale])


# Add intercept and fit
X_scaled = sm.add_constant(X_scaled)
logit_model = sm.Logit(y, X_scaled)
result = logit_model.fit()


# Coefficients are in log-odds; exponentiating gives odds ratios. Because the
# continuous predictors were standardized, each ratio is per one standard
# deviation of that predictor.
odds_ratios = pd.DataFrame({
    "Variable": X_scaled.columns,
    "Coefficient": result.params,
    "Odds Ratio": np.exp(result.params),
    "P-Value": result.pvalues
})


print(result.summary())
print("\nOdds Ratios and P-Values:")
# The frame's index repeats the variable names already in "Variable", so hide it.
print(tabulate(odds_ratios, headers="keys", tablefmt="grid", showindex=False))

# Calculate and print VIF.
# X_check holds the predictors without the intercept (the rows we report).
X_check = X_scaled.drop(columns='const', errors='ignore')

# variance_inflation_factor regresses each column on all the others, so the
# design matrix must contain the intercept; without it the auxiliary
# regressions use an uncentered R^2 and the VIFs are misleading. The constant
# is prepended as column 0, so predictor j of X_check sits at column j + 1.
vif_design = sm.add_constant(X_check, has_constant='add')

vif_data = pd.DataFrame({
    "Variable": X_check.columns,
    "VIF": [
        variance_inflation_factor(vif_design.values, col_idx)
        for col_idx in range(1, vif_design.shape[1])
    ]
})
print("\nVariance Inflation Factors (VIF):")
print(tabulate(vif_data, headers="keys", tablefmt="grid"))

Optimization terminated successfully.
         Current function value: 0.359167
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:       female_president   No. Observations:                 1005
Model:                          Logit   Df Residuals:                      989
Method:                           MLE   Df Model:                           15
Date:                Wed, 29 Jan 2025   Pseudo R-squ.:                  0.1063
Time:                        20:58:39   Log-Likelihood:                -360.96
converged:                       True   LL-Null:                       -403.89
Covariance Type:            nonrobust   LLR p-value:                 5.845e-12
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                                -1.9357      0.247     -7