In [36]:
import os
import pandas as pd
import numpy as np
from collections import Counter

In [37]:
# Root data folder; the cells below read and write CSVs under its
# 'cleaned_dataframes' sub-folder.
data_path = r'C:\Projects\connecteddatahub\data'

# Years to process, kept as strings because they are spliced into file names
# such as '<year>_cleanedDataframe.csv'.
years = [
    str(year)
    for year in (1999, 2000, 2005, 2007, 2008, 2009, 2010, 2011, 2013, 2018)
]
# Single-year override for quick experiments:
# years = ["2013"]

In [38]:
def count_first_provost(df):
    """Report how the first 'Provost' title of each institution is classified.

    Rows are grouped by the ``Institution`` column (groups visited in sorted
    key order, the ``groupby`` default). Inside each group the ``Position``
    values are stringified, stripped and scanned in row order; only the first
    one containing "provost" (case-insensitive) is considered.

    That title is *rejected* when its lower-cased text contains "vice",
    "assistant", "associate" or the two-character sequence ", " (comma + space),
    and *accepted* otherwise.

    Nothing is returned. Three sections are printed: institutions with no
    provost title at all, then frequency tables of accepted and of rejected
    titles (most common first).
    """
    blockers = ("vice", "assistant", "associate", ", ")

    kept_titles = []
    dropped_titles = []
    missing = []

    for inst, group in df.groupby("Institution"):
        cleaned = (str(value).strip() for value in group["Position"])
        # Only the first provost-like title of the group matters.
        first_hit = next((t for t in cleaned if "provost" in t.lower()), None)

        if first_hit is None:
            missing.append(inst)
            continue

        lowered = first_hit.lower()
        if any(marker in lowered for marker in blockers):
            dropped_titles.append(first_hit)
        else:
            kept_titles.append(first_hit)

    print("\nInstitutions with no reported Provost:")
    for inst in missing:
        print(f" - {inst}")

    sections = (
        ("\nAccepted first 'Provost' titles:", kept_titles),
        ("\nRejected first 'Provost' titles:", dropped_titles),
    )
    for heading, titles in sections:
        print(heading)
        for title, count in Counter(titles).most_common():
            print(f"{title}: {count}")


In [39]:
def classify_additional_provosts(df: pd.DataFrame):
    """Print a per-institution breakdown of 'Provost' titles.

    Rows are grouped by ``Institution`` (groups kept in order of first
    appearance, ``sort=False``) and each group's ``Position`` values are
    stringified, stripped and scanned once in row order:

    * Every row is checked for "vice president, academic affairs"
      (case-insensitive); institutions with at least one such row are counted.
      The whole group is scanned, so a matching row listed *after* the first
      provost row is counted as well.
    * The first title containing "provost" is *rejected* when its lower-cased
      text contains "vice", "assistant", "associate" or ", " (comma + space),
      and *accepted* otherwise.
    * Every later title containing "provost" is bucketed as 'vice',
      'associate', 'assistant' (checked in that order) or 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Institution`` and ``Position`` columns. Not modified.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    rejection_markers = ("vice", "assistant", "associate", ", ")

    accepted = []
    rejected = []
    no_provost_insts = []
    rejected_insts = []
    vpaa_insts = set()

    addl_classifications = {
        'vice': [],
        'associate': [],
        'assistant': [],
        'other': []
    }

    for inst, group in df.groupby("Institution", sort=False):
        found = False

        for raw in group["Position"]:
            pos = str(raw).strip()
            pos_lower = pos.lower()

            # Checked on every row: stopping at the first provost row would
            # miss a VPAA that is listed further down the institution's roster.
            if "vice president, academic affairs" in pos_lower:
                vpaa_insts.add(inst)

            if "provost" not in pos_lower:
                continue

            if not found:
                # First provost-like title of this institution.
                found = True
                if any(bad in pos_lower for bad in rejection_markers):
                    rejected.append(pos)
                    rejected_insts.append((inst, pos))
                else:
                    accepted.append(pos)
                continue

            # Any further provost-like title: bucket by rank keyword.
            if "vice" in pos_lower:
                addl_classifications['vice'].append(pos)
            elif "associate" in pos_lower:
                addl_classifications['associate'].append(pos)
            elif "assistant" in pos_lower:
                addl_classifications['assistant'].append(pos)
            else:
                addl_classifications['other'].append(pos)

        if not found:
            no_provost_insts.append(inst)

    # Print summary
    print("\nInstitutions with NO 'Provost' mention at all:")
    for inst in no_provost_insts:
        print(f" - {inst}")

    print("\nInstitutions whose first 'Provost' title was rejected:")
    for inst, title in rejected_insts:
        print(f" - {inst}: '{title}'")

    print("\nAccepted first 'Provost' titles:")
    for title, cnt in Counter(accepted).most_common():
        print(f" {cnt:3d} × {title}")

    print("\nRejected first 'Provost' titles:")
    for title, cnt in Counter(rejected).most_common():
        print(f" {cnt:3d} × {title}")

    print(f"\nNumber of institutions with a 'Vice President, Academic Affairs': {len(vpaa_insts)}")

    print("\nAdditional Provost-classified titles after first per institution:")
    for role, positions in addl_classifications.items():
        print(f"\n{role.title()} Provost Titles ({len(positions)}):")
        for title, cnt in Counter(positions).most_common():
            print(f" {cnt:3d} × {title}")


In [40]:
# for year in years:
#     print(f"Processing: {year}")
#     df = pd.read_csv(os.path.join(data_path, 'cleaned_dataframes', f'{year}_cleanedDataframe.csv'))
#     count_first_provost(df)
#     classify_additional_provosts(df)

In [None]:
def mark_first_provost_positions(df):
    """Flag the first acceptable 'Provost' row of every institution.

    Rows are grouped by ``Institution`` (groups visited in sorted key order,
    the ``groupby`` default) and scanned in row order. A row is acceptable when
    its stringified, stripped, lower-cased ``Position`` contains "provost" and
    none of "vice", "assistant", "associate" or a comma. Unacceptable provost
    titles are skipped and the scan continues, so the selected row is the first
    *acceptable* one, not necessarily the first provost-like one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Institution`` and ``Position`` columns. Modified in
        place: a ``FixedPosition`` column is added when missing, and the
        selected rows get the value "Provost".

    Returns
    -------
    tuple
        ``(df, provost_rows)``: the same, mutated frame first, then a copy of
        only the selected rows (one per institution that has an acceptable
        title, in institution-sorted order).
    """
    if "FixedPosition" not in df.columns:
        # Object dtype from the start: writing the string "Provost" into an
        # all-NaN float64 column is an incompatible-dtype assignment, which
        # pandas has deprecated.
        df["FixedPosition"] = np.full(len(df), np.nan, dtype=object)
    elif pd.api.types.is_numeric_dtype(df["FixedPosition"]):
        # An entirely empty column read back from CSV is parsed as float64.
        df["FixedPosition"] = df["FixedPosition"].astype(object)

    # Note the bare comma: any comma in the title disqualifies it.
    excluded = ("vice", "assistant", "associate", ",")
    accepted_indices = []

    for _, positions in df.groupby("Institution")["Position"]:
        for idx, raw in positions.items():
            pos_lower = str(raw).strip().lower()
            if "provost" in pos_lower and not any(bad in pos_lower for bad in excluded):
                accepted_indices.append(idx)
                break  # one row per institution

    if accepted_indices:
        df.loc[accepted_indices, "FixedPosition"] = "Provost"
    provost_rows = df.loc[accepted_indices].copy()

    return df, provost_rows


In [42]:
# Tag the first acceptable Provost per institution for every configured year,
# then write both the updated full table and the provost-only extract.
cleaned_dir = os.path.join(data_path, 'cleaned_dataframes')
provost_dir = os.path.join(cleaned_dir, 'provost')
# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before the first write.
os.makedirs(provost_dir, exist_ok=True)

for year in years:
    print(f"Processing: {year}")
    cleaned_csv = os.path.join(cleaned_dir, f'{year}_cleanedDataframe.csv')
    df = pd.read_csv(cleaned_csv)
    full_df, provost_df = mark_first_provost_positions(df)
    # NOTE: the input file is overwritten in place; afterwards it carries the
    # 'FixedPosition' column.
    full_df.to_csv(cleaned_csv, index=False)
    provost_df.to_csv(os.path.join(provost_dir, f'{year}_provost.csv'), index=False)
    

Processing: 1999
Processing: 2000
Processing: 2005
Processing: 2007
Processing: 2008
Processing: 2009
Processing: 2010
Processing: 2011
Processing: 2013
Processing: 2018
