In [11]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.api as sm


In [12]:
# File Paths
# Every backslash is written as "\\" so the literal contains no invalid escape
# sequences. The original "\\\O" and "\V" relied on Python passing unknown
# escapes through unchanged (a SyntaxWarning today) and "\\\O" also put a
# doubled separator between "tykun" and "OneDrive".
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to edit it.
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-directory names (each ends with a separator so they can be concatenated directly)
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
college_matching = "college_matching\\"

# Full directory paths built from the project root
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"


# Valid Years (kept as strings)
years = ["1999", "2000", "2005", "2008", "2009", "2013"]

# Created Files
diversity_statistics_path = f"{altered_dataframe_path}university_board_statistics.csv"

university_admissions_path = f"{college_matching_path}university_admission_rate.csv"
university_demographics_path = f"{college_matching_path}university_student_demographics.csv"


In [13]:
# Read the three input CSVs (board statistics, admissions, student demographics)
# in one pass; the order of the targets matches the order of the paths.
diversity_statistics_df, university_admissions_df, university_demographics_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        diversity_statistics_path,
        university_admissions_path,
        university_demographics_path,
    )
)

In [14]:

# Attach admissions and student-demographics columns to the board statistics.
# A "year" column in either lookup table is renamed to "Year" so it lines up
# with the join keys below.
merge_keys = ["Year", "AffiliationId"]
year_rename = {"year": "Year"}

admissions_lookup = university_admissions_df.rename(columns=year_rename)
demographics_lookup = university_demographics_df.rename(columns=year_rename)

# Left joins: every board-statistics row is kept, admissions first, then demographics.
# NOTE: the result is assigned back to diversity_statistics_df, so re-running
# this cell without reloading the CSVs merges the lookup tables a second time.
diversity_statistics_df = (
    diversity_statistics_df
    .merge(admissions_lookup, on=merge_keys, how="left")
    .merge(demographics_lookup, on=merge_keys, how="left")
)

# Preview the merged frame
print(diversity_statistics_df.head())



   Year                   Institution  carnegie_id_x  AffiliationId  \
0  1999  Abilene Christian University       222178.0     60205797.0   
1  1999            Adelphi University       188429.0     71965598.0   
2  1999           Agnes Scott College       138600.0     64506506.0   
3  1999                Albion College       168546.0     45644089.0   
4  1999             Alfred University       188641.0     49502546.0   

   female_president  total_members  female  male  unknown  male_change  ...  \
0             False             84      12    65        7          NaN  ...   
1             False             17       3    13        1          NaN  ...   
2              True             29      12    14        3          NaN  ...   
3             False             31       4    26        1          NaN  ...   
4             False             29       7    22        0          NaN  ...   

   student.demographics.unemployment  student.demographics.men  \
0                               

In [15]:
# Logistic regression of female_president on institution characteristics,
# controlling for year through dummy (fixed-effect) variables.

independent_vars = [
    "admissions.admission_rate.overall",
    "student.students_with_pell_grant",
    "student.demographics.avg_family_income",
    "student.demographics.poverty_rate",
    "student.demographics.women",
]

dependent_var = "female_president"

# Column that holds the observation year
year_var = "Year"

# Keep only rows where the outcome, every predictor, and the year are all present
regression_data = diversity_statistics_df.dropna(subset=[dependent_var] + independent_vars + [year_var])

# One-hot encode the year (first level dropped as the reference category).
# dtype=float keeps the dummies numeric: recent pandas versions default to bool,
# which turns the mixed design matrix into object dtype that statsmodels rejects.
columns_before_dummies = set(regression_data.columns)
regression_data = pd.get_dummies(regression_data, columns=[year_var], drop_first=True, dtype=float)

# The year dummies are exactly the columns get_dummies added; diffing the column
# sets avoids picking up unrelated columns whose name happens to start with "Year_".
year_dummies = [col for col in regression_data.columns if col not in columns_before_dummies]
if not year_dummies:
    # With drop_first=True a single remaining year yields no dummy columns,
    # so the model below would not actually control for year.
    print(
        f"Note: fewer than two distinct {year_var} values remain after dropping "
        "incomplete rows; no year dummies were added to the model."
    )

# Build a new list rather than extending in place; the name is kept so later
# cells still see the full predictor list.
independent_vars = independent_vars + year_dummies

# Define X (independent variables) and y (dependent variable)
X = regression_data[independent_vars].astype(float)
X = sm.add_constant(X)  # Add intercept

# Truthy -> 1, falsy -> 0 (missing outcomes were already dropped above)
y = regression_data[dependent_var].astype(bool).astype(int)

# Perform logistic regression for statistical inference
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Display results summary
print(result.summary())

# Odds ratios (exp of each coefficient) alongside p-values for interpretability
odds_ratios = pd.DataFrame({
    "Variable": X.columns,
    "Odds Ratio": np.exp(result.params),
    "P-Value": result.pvalues,
})
print(tabulate(odds_ratios, headers="keys", tablefmt="grid"))


Optimization terminated successfully.
         Current function value: 0.410043
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:       female_president   No. Observations:                  394
Model:                          Logit   Df Residuals:                      388
Method:                           MLE   Df Model:                            5
Date:                Sun, 15 Dec 2024   Pseudo R-squ.:                 0.07583
Time:                        22:22:44   Log-Likelihood:                -161.56
converged:                       True   LL-Null:                       -174.81
Covariance Type:            nonrobust   LLR p-value:                 7.098e-05
                                             coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------
const                                     -5.5909   