In [119]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from tabulate import tabulate
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
from scipy.stats import chi2
from sklearn.preprocessing import StandardScaler
from stargazer.stargazer import Stargazer

In [120]:
# File Paths
# Every backslash in the Windows path is escaped explicitly. The previous
# literal relied on the invalid escape sequences "\O" and "\V", which emit a
# SyntaxWarning on recent Python versions and left a doubled separator in the
# resulting path.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from a configurable project root so the notebook runs on other machines.
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-directory names (each ends with a path separator so they can be
# concatenated directly onto absolute_path).
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts =  "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
college_matching = "college_matching\\"

# Fully-qualified directory paths.
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}" 
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"


# Valid Years
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

# Created Files (CSV inputs read by the cells below)
university_boards_statistics_path = f"{altered_dataframe_path}sample_board_statistics.csv"
university_admissions_path = f"{college_matching_path}university_admission_rate.csv"
university_demographics_path = f"{college_matching_path}university_student_demographics.csv"
university_faculty_path = f"{college_matching_path}university_faculty.csv"

In [121]:
# Load the board statistics table and the three college-matching tables,
# in the same order as the original reads.
(
    university_board_statistics_df,
    university_admissions_df,
    university_demographics_df,
    university_faculty_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        university_boards_statistics_path,
        university_admissions_path,
        university_demographics_path,
        university_faculty_path,
    )
)

In [122]:
def remove_non_samples(df):
    """Return the rows of ``df`` whose ``PrimarySample`` value equals ``True``.

    The input frame is not modified; row labels of the kept rows are preserved.
    """
    is_primary_sample = df['PrimarySample'].eq(True)
    return df.loc[is_primary_sample]

In [123]:
# university_board_statistics_df = university_board_statistics_df.merge(
#     university_admissions_df.rename(columns={"year": "Year"}), 
#     on=["Year", "AffiliationId"], 
#     how="left"
# )

# # Merge demographics data into board stats
# university_board_statistics_df = university_board_statistics_df.merge(
#     university_demographics_df.rename(columns={"year": "Year"}), 
#     on=["Year", "AffiliationId"], 
#     how="left"
# )

# # Merge faculty data into board stats
# university_faculty_df = university_faculty_df.rename(columns={"year": "Year"})
# university_board_statistics_df = university_board_statistics_df.merge(
#     university_faculty_df[['Year', 'AffiliationId','student.demographics.faculty.women']], 
#     on=["Year", "AffiliationId"], 
#     how="left"
# )


In [124]:
# Normalize raw board counts into proportions for the regression.
#
# The ratios are computed column-wise (vectorized) instead of with a row-wise
# DataFrame.apply. Semantics match the row-wise version: whenever the
# denominator is not strictly positive (zero, negative or missing) the
# proportion is set to 0; a missing numerator with a valid denominator stays NaN.

university_board_statistics_df['female_proportion'] = (
    university_board_statistics_df['female']
    / university_board_statistics_df['total_members']
).where(university_board_statistics_df['total_members'] > 0, 0)

university_board_statistics_df['poc_proportion'] = (
    university_board_statistics_df['poc']
    / university_board_statistics_df['total_ethnicity']
).where(university_board_statistics_df['total_ethnicity'] > 0, 0)

# Calculate the proportion of billionaires on the board.
# Zero-member boards are mapped to NaN before dividing, and any resulting NaN
# (including a missing billionaire count) is reported as 0.
university_board_statistics_df['billionaire_proportion'] = (
    university_board_statistics_df['num_billionaires'] / 
    university_board_statistics_df['total_members'].replace(0, np.nan)
).fillna(0)

# Keep only the rows flagged as part of the primary sample.
university_board_statistics_df = remove_non_samples(university_board_statistics_df)

# NOTE(review): this writes back to the same CSV the frame was loaded from, so
# rows outside the primary sample are permanently dropped from that file.
# Confirm this is intended, or write to a separate output path.
university_board_statistics_df.to_csv(university_boards_statistics_path, index=False)


In [125]:
# Logistic regression of `female_president` on board, student and institution
# characteristics, with year fixed effects. Prints the statsmodels summary,
# an odds-ratio table and variance inflation factors.

dependent_var = "female_president"
year_var = "Year"

# Note: "region" is converted to a numeric variable later (as region_numeric).
independent_vars = [
    "admissions.admission_rate.overall", 
    "student.students_with_pell_grant", 
    "female_proportion",
    "billionaire_proportion",
    "total_members",
    "eigenvector",
    "degree",
    "student.demographics.women",
    "poc_proportion",
    "board_turnover",
    "control",
    "student.demographics.faculty.women",
    "region"   # converted to region_numeric below
]

# Drop incomplete rows using columns that exist in the original DataFrame.
# "region" is already part of independent_vars, so it is not listed twice.
cols_to_check = [dependent_var] + independent_vars + [year_var]
regression_data = university_board_statistics_df.dropna(subset=cols_to_check).copy()

# Ensure the dependent variable is binary (0/1 integers).
regression_data[dependent_var] = regression_data[dependent_var].astype(int)

# Convert the categorical 'region' into a numeric code.
# (Adjust the mapping as needed.)
region_map = {"Northeast": 0, "Midwest": 1, "South": 2, "West": 3}
regression_data["region_numeric"] = regression_data["region"].map(region_map)

# Create dummy variables for Year only (first year is the reference level).
regression_data = pd.get_dummies(
    regression_data,
    columns=[year_var],
    drop_first=True
)

# Identify the dummy columns for Year and store them as 0/1 integers.
year_dummies = [col for col in regression_data.columns if col.startswith(f"{year_var}_")]
regression_data[year_dummies] = regression_data[year_dummies].astype(int)

# Build the full predictor list: the original "region" is replaced by
# "region_numeric", and the year dummies are appended.
predictor_vars = [var for var in independent_vars if var != "region"] + ["region_numeric"] + year_dummies

# Take an explicit copy: X is modified below, and assigning into a slice of
# regression_data raises pandas' SettingWithCopyWarning.
X = regression_data[predictor_vars].copy()
y = regression_data[dependent_var]

# Convert "control" to a binary variable.
X["control"] = X["control"].map({"Public": 1, "Private": 0})

# Ensure all predictor columns are numeric.
X = X.apply(pd.to_numeric)

# Series.map yields NaN for any category missing from its mapping; fail with a
# clear message here instead of passing NaN into the model fit.
unmapped_cols = [col for col in ("control", "region_numeric") if X[col].isna().any()]
if unmapped_cols:
    raise ValueError(
        f"Unmapped category values produced NaN in: {unmapped_cols}"
    )

# Standardize predictors (zero mean, unit variance).
# Note: the binary "control" indicator is standardized along with the
# continuous variables.
continuous_vars = [
    "admissions.admission_rate.overall", 
    "student.students_with_pell_grant", 
    "female_proportion",
    "billionaire_proportion",  
    "total_members",
    "eigenvector",
    "degree",
    "student.demographics.women",
    "poc_proportion",
    "board_turnover",
    "control",
    "student.demographics.faculty.women"
    # "region_numeric" is left unscaled.
    # NOTE(review): it enters the model as a single numeric term (0-3), which
    # imposes an ordering and equal spacing on the regions. If region is meant
    # to be categorical, use one-hot dummies instead.
]

scaler = StandardScaler()
vars_to_scale = [var for var in continuous_vars if var in X.columns]
X_scaled = X.copy()
X_scaled[vars_to_scale] = scaler.fit_transform(X_scaled[vars_to_scale])

# Ensure the scaled data is numeric.
X_scaled = X_scaled.apply(pd.to_numeric)

# Add an intercept.
X_scaled = sm.add_constant(X_scaled)

# Fit the logistic regression model.
logit_model = sm.Logit(y, X_scaled)
result = logit_model.fit()

# Create a DataFrame with odds ratios and p-values.
odds_ratios = pd.DataFrame({
    "Variable": X_scaled.columns,
    "Coefficient": result.params,
    "Odds Ratio": np.exp(result.params),
    "P-Value": result.pvalues
})

print(result.summary())
print("\nOdds Ratios and P-Values:")
print(tabulate(odds_ratios, headers="keys", tablefmt="grid"))

# Calculate Variance Inflation Factors (VIF).
# variance_inflation_factor regresses one column on all the others without
# adding an intercept itself, so the design matrix passed in must contain the
# constant; otherwise the unscaled dummy/region columns get inflated,
# uncentered VIFs. The VIF is reported for every column except the constant.
X_check = X_scaled.drop(columns="const", errors="ignore")

vif_data = pd.DataFrame({
    "Variable": X_check.columns,
    "VIF": [
        variance_inflation_factor(X_scaled.values, X_scaled.columns.get_loc(col))
        for col in X_check.columns
    ]
})
print("\nVariance Inflation Factors (VIF):")
print(tabulate(vif_data, headers="keys", tablefmt="grid"))


Optimization terminated successfully.
         Current function value: 0.339403
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:       female_president   No. Observations:                  837
Model:                          Logit   Df Residuals:                      819
Method:                           MLE   Df Model:                           17
Date:                Sat, 01 Feb 2025   Pseudo R-squ.:                  0.1425
Time:                        19:16:00   Log-Likelihood:                -284.08
converged:                       True   LL-Null:                       -331.28
Covariance Type:            nonrobust   LLR p-value:                 9.599e-13
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                 -1.6459      0.290    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["control"] = X["control"].map({"Public": 1, "Private": 0})


In [126]:
# Nested logistic-regression specifications rendered as one Stargazer table.
dependent_var = "female_president"
year_var = "Year"

# Order matters: the model specifications below are built by slicing this list
# from the end.
independent_vars = [
    "student.students_with_pell_grant", 
    "billionaire_proportion",
    "student.demographics.women",
    "eigenvector",
    # "degree",
    "poc_proportion",
    # "board_turnover",
    "student.demographics.faculty.women",
    "total_members",
    "female_proportion",
    "admissions.admission_rate.overall", 
    "control",
    "region"   # add region here!
]

# Complete cases only: outcome, every predictor and the year must be present.
regression_data = university_board_statistics_df.dropna(
    subset=[dependent_var, *independent_vars, year_var]
).copy()

# Numeric code for each of the four regions.
region_map = {"Northeast": 0, "Midwest": 1, "South": 2, "West": 3}

# Recode in one step: integer outcome, numeric region, binary control.
regression_data = regression_data.assign(**{
    dependent_var: regression_data[dependent_var].astype(int),
    "region": regression_data["region"].map(region_map),
    "control": regression_data["control"].map({"Public": 1, "Private": 0}),
})

# Expand the year into indicator columns, dropping the first level.
regression_data = pd.get_dummies(regression_data, columns=[year_var], drop_first=True)

# Collect the generated year indicators and store them as integers.
year_dummies = list(
    filter(lambda name: name.startswith(f"{year_var}_"), regression_data.columns)
)
regression_data[year_dummies] = regression_data[year_dummies].astype(int)

# Columns to standardize (everything except region and the year indicators).
continuous_vars = [
    "student.students_with_pell_grant", 
    "billionaire_proportion",
    "student.demographics.women",
    "eigenvector",
    # "degree",
    "poc_proportion",
    # "board_turnover",
    "student.demographics.faculty.women",
    "female_proportion",
    "total_members",
    "admissions.admission_rate.overall", 
    "control",
]
scaler = StandardScaler()
vars_to_scale = [var for var in continuous_vars if var in regression_data.columns]
regression_data[vars_to_scale] = scaler.fit_transform(regression_data[vars_to_scale])

# Three nested specifications, from smallest to the full predictor set.
model_specs = [
    independent_vars[:-4],  # all but the final four predictors
    independent_vars[:-2],  # all but the final two predictors
    independent_vars         # every predictor
]

# Fit one logit per specification; each includes the year indicators and an
# intercept.
models = []
y = regression_data[dependent_var]
for predictors in model_specs:
    X = sm.add_constant(regression_data[[*predictors, *year_dummies]])
    model = sm.Logit(y, X).fit()
    models.append(model)

# Assemble the side-by-side regression table.
stargazer = Stargazer(models)
stargazer.title("Logistic Regression Results with Different Specifications")
stargazer.dependent_variable_name(dependent_var)

# Emit the table as LaTeX source.
print(stargazer.render_latex())

Optimization terminated successfully.
         Current function value: 0.382473
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350947
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.339861
         Iterations 7
\begin{table}[!htbp] \centering
  \caption{Logistic Regression Results with Different Specifications}
\begin{tabular}{@{\extracolsep{5pt}}lccc}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{3}{c}{\textit{Dependent variable: female_president}} \
\cr \cline{2-4}
\\[-1.8ex] & (1) & (2) & (3) \\
\hline \\[-1.8ex]
 Year_2007 & -0.013$^{}$ & -0.162$^{}$ & -0.158$^{}$ \\
& (0.314) & (0.328) & (0.333) \\
 Year_2009 & 0.058$^{}$ & -0.251$^{}$ & -0.189$^{}$ \\
& (0.320) & (0.339) & (0.346) \\
 Year_2011 & 0.179$^{}$ & -0.141$^{}$ & -0.033$^{}$ \\
& (0.344) & (0.364) & (0.372) \\
 Year_2013 & 0.087$^{}$ & -0.331$^{}$ & -0.201$^{}$ \\
& (0.358) & (0.382) & (0.390) \\
 admissions.admission_rate