In [54]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.api as sm


In [55]:
# File Paths
# Root directory of the board_analysis project. Every separator is written as
# an escaped backslash ("\\"). The previous literal contained the invalid escape
# sequences "\O" and "\V", which Python only tolerates with a
# DeprecationWarning/SyntaxWarning, and which left a doubled separator in front
# of "OneDrive".
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to edit it.
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-directory names. Each one ends with a separator so that it can be
# concatenated directly with a file name.
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
college_matching = "college_matching\\"

# Full directory paths (root + sub-directory), each still ending in a separator.
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"


# Valid Years
years = ["1999", "2000", "2005", "2008", "2009", "2013"]

# Created Files
diversity_statistics_path = f"{altered_dataframe_path}university_board_statistics.csv"

university_admissions_path = f"{college_matching_path}university_admission_rate.csv"
university_demographics_path = f"{college_matching_path}university_student_demographics.csv"


In [56]:
# Load the three input tables, in order: board diversity statistics,
# admission rates, and student demographics.
diversity_statistics_df, university_admissions_df, university_demographics_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        diversity_statistics_path,
        university_admissions_path,
        university_demographics_path,
    )
)

In [57]:

# Attach the admissions table and then the demographics table to the board
# statistics. Both are left joins on (Year, AffiliationId), so every board row
# is kept even when no match exists. The right-hand tables name their year
# column "year", so it is renamed to "Year" before joining.
for supplementary_df in (university_admissions_df, university_demographics_df):
    diversity_statistics_df = pd.merge(
        diversity_statistics_df,
        supplementary_df.rename(columns={"year": "Year"}),
        how="left",
        on=["Year", "AffiliationId"],
    )

# Show the first rows of the merged frame
print(diversity_statistics_df.head())



   Year               Institution  carnegie_id_x  AffiliationId  \
0  1999        Adelphi University       188429.0     71965598.0   
1  1999       American University       131159.0    181401687.0   
2  1999        Andrews University       168740.0    102298084.0   
3  1999  Arizona State University       104151.0     55732556.0   
4  1999         Auburn University       100858.0     82497590.0   

   female_president  poc_president  total_members  poc  white  female  ...  \
0             False          False             17    2     15       3  ...   
1             False          False             27    1     26      11  ...   
2             False          False             36    6     30       9  ...   
3             False          False             11    0     11       4  ...   
4             False          False             12    1     11       1  ...   

   student.demographics.unemployment  student.demographics.men  \
0                                NaN                       NaN

In [58]:
# Impute missing values for a hand-picked set of columns.
#
# For every (institution, column) pair, each missing entry is replaced by a
# value drawn at random, with replacement, from the values the same institution
# has in its other rows. Institutions with no observed value for a column are
# left untouched, and rows whose AffiliationId is NaN never match any
# institution (NaN != NaN), so they are left untouched as well.

columns_to_fill = [
    "admissions.admission_rate.overall",
    "student.students_with_pell_grant",
    "student.demographics.avg_family_income",
    "student.demographics.poverty_rate",
    "student.demographics.men",
    "student.demographics.women"
]

# One seeded generator shared by every draw: the imputed values (and therefore
# everything computed from them later) are identical on each Restart & Run All,
# while successive draws still differ from one another.
imputation_rng = np.random.RandomState(42)

# Fill missing values for each column
for col in columns_to_fill:
    # Rows where this column is missing (computed once, before any filling)
    missing_mask = diversity_statistics_df[col].isna()
    for affiliation_id in diversity_statistics_df["AffiliationId"].unique():
        institution_mask = (diversity_statistics_df["AffiliationId"] == affiliation_id)
        combined_mask = missing_mask & institution_mask
        n_missing = int(combined_mask.sum())
        if n_missing == 0:
            # Nothing to fill for this institution/column
            continue
        # Observed (non-missing) values of this institution for this column
        existing_values = diversity_statistics_df.loc[~missing_mask & institution_mask, col]
        if existing_values.empty:
            # No donor values available; leave the gaps as NaN
            continue
        diversity_statistics_df.loc[combined_mask, col] = existing_values.sample(
            n=n_missing, replace=True, random_state=imputation_rng
        ).values

# Display the updated dataframe with a sample of rows
diversity_statistics_df = diversity_statistics_df.sort_values(by=["Year", "Institution"])
# Cap the preview at the frame's length so short frames do not raise
preview_size = min(100, len(diversity_statistics_df))
print(tabulate(
    diversity_statistics_df.sample(n=preview_size, random_state=imputation_rng),
    headers='keys',
    tablefmt='grid'
))


+------+--------+------------------------------------------------------------+-----------------+------------------+--------------------+-----------------+-----------------+-------+---------+----------+--------+-----------+--------------+----------------+------------------+---------------+-----------------+-----------------+-------------------------------------+---------------+----------------------------------+-------------------------------------+--------------------------------+----------------------------------+------------------------------------------+------------------------------------+-------------------------------------+-------------------------------------+----------------------------+------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+------------------------------------------------+--------------------------------------------------------------+--------------

In [59]:
# Logistic regression of "female_president" on admissions and student-body
# variables, fitted with statsmodels so that coefficient p-values are available.

dependent_var = "female_president"

independent_vars = [
    "admissions.admission_rate.overall",
    "student.students_with_pell_grant",
    "student.demographics.avg_family_income",
    "student.demographics.poverty_rate",
    "student.demographics.men",
    "student.demographics.women"
]

# Complete-case analysis: discard any row missing the outcome or a predictor
regression_data = diversity_statistics_df.dropna(
    subset=[dependent_var, *independent_vars]
)

# Design matrix with an explicit intercept column ("const")
# NOTE(review): both the "men" and "women" shares are included alongside the
# intercept; if they are complementary shares this is (near-)collinear with
# the constant -- confirm before interpreting the coefficients.
X = sm.add_constant(regression_data[independent_vars])

# Outcome coded as 1 for truthy values and 0 otherwise
y = regression_data[dependent_var].apply(lambda flag: int(bool(flag)))

# Fit by maximum likelihood
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Full statsmodels summary table
print(result.summary())

# Odds ratios (exponentiated coefficients) next to their p-values
odds_ratios = pd.DataFrame(
    {
        "Variable": X.columns,
        "Odds Ratio": result.params.map(np.exp),
        "P-Value": result.pvalues,
    }
)
print(tabulate(odds_ratios, headers="keys", tablefmt="grid"))


Optimization terminated successfully.
         Current function value: 0.357824
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:       female_president   No. Observations:                  897
Model:                          Logit   Df Residuals:                      890
Method:                           MLE   Df Model:                            6
Date:                Thu, 12 Dec 2024   Pseudo R-squ.:                 0.02692
Time:                        22:05:42   Log-Likelihood:                -320.97
converged:                       True   LL-Null:                       -329.85
Covariance Type:            nonrobust   LLR p-value:                  0.006861
                                             coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------
const                                     23.6013   