In [None]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import re
from collections import defaultdict
from nameparser import HumanName

In [None]:
# Base directory for the data files. The location is machine-specific, so it can be
# overridden with the CONNECTEDDATAHUB_DATA_PATH environment variable; when that variable
# is not set, the original Windows path is used unchanged.
data_path = os.environ.get(
    "CONNECTEDDATAHUB_DATA_PATH",
    r"C:\Projects\connecteddatahub\data",
)

# Year labels (as strings) selected for this run. Only 2013 is enabled; the full set
# previously listed here was 1999, 2000, 2005, 2007, 2008, 2009, 2010, 2011, 2013, 2018.
years = ["2013"]

In [None]:
# Global word banks: keyword collections shared by the labeling functions.

# Lower-case substrings that identify a president-type title.
president_words = [
    "president",
    "chancellor",
    "superintendent",
    "commissioner",
    "officer-in-charge",
]

# Title-case keywords for board-of-directors style titles.
board_words = [
    "Trustee",
    "Regent",
    "Member",
    "Fellow",
    "Overseer",
    "Governor",
    "Curator",
    "Visitor",
    "Manager",
    "Administrator",
]

# Title-case keywords for known position titles. The original ordering is preserved
# in case consuming code relies on it (not visible in this section).
position_bank = [
    "President",
    "Chancellor",
    "Provost",
    "Director",
    "Dean",
    "Controller",
    "Trustee",
    "Member",
    "Regent",
    "Chairman",
    "Overseer",
    "Assistant",
    "Librarian",
    "Secretary",
    "Chaplain",
    "Minister",
    "Treasurer",
    "Senior Counsel",
    "General Counsel",
    "Legal Counsel",
    "University Counsel",
    "College Counsel",
    "Special Counsel",
    "Corporation Counsel",
    "Officer",
    "Chief",
    "Professor",
    "Commissioner",
    "Fellow",
    "Chairperson",
    "Manager",
    "Clergy",
    "Coordinator",
    "Auditor",
    "Governor",
    "Representative",
    "Stockbroker",
    "Advisor",
    "Commandant",
    "Rector",
    "Attorney",
    "Curator",
    "Clerk",
    "Department Head",
    "Pastor",
    "Head",
    "Comptroller",
    "Deputy",
    "Inspector General",
]

# Lower-case terms treated as exceptions; how they are applied is not visible in this
# section -- confirm against the consuming code.
exception_terms = {
    "secretary",
    "chairman",
    "treasurer",
    "member",
}

In [None]:
'''President Labeling'''

def count_first_positions(df, president_terms=None):
    """
    Validation helper: report the position title in the first row of each institution.

    Prints two sections:
      1. every institution whose first-row title contains none of the president terms,
         or contains a word starting with "vice" (e.g. "Vice President", "Vice-Chancellor");
      2. a frequency table of all first-row titles, most common first.

    Parameters:
      df               -- DataFrame with "Institution" and "Position" columns
      president_terms  -- optional iterable of substrings identifying a president-type
                          title (compared case-insensitively); defaults to the
                          module-level ``president_words``

    Returns:
      None -- the results are only printed.
    """
    if president_terms is None:
        president_terms = president_words
    terms = [str(term).lower() for term in president_terms]

    # "vice" must start a word: a plain substring test would also fire on titles such as
    # "President, Student Services" or "... Advice ..." and flag them incorrectly.
    vice_pattern = re.compile(r"\bvice")

    # First row per institution in order of first appearance. head(1) keeps the original
    # index labels, so the indices printed below refer to rows of ``df``.
    first_rows = df.groupby("Institution", sort=False).head(1)

    positions = []
    flagged = []

    for idx, institution, raw_position in zip(
        first_rows.index, first_rows["Institution"], first_rows["Position"]
    ):
        # str() turns missing values into "nan", which matches no term and is flagged.
        pos = str(raw_position).strip()
        positions.append(pos)

        position_lower = pos.lower()
        has_president_term = any(term in position_lower for term in terms)
        if not has_president_term or vice_pattern.search(position_lower):
            flagged.append((idx, institution, pos))

    print("\nInstitutions with flagged positions (no 'President'/'Chancellor' or contains 'Vice President'):")
    for idx, inst, pos in flagged:
        print(f" - Index {idx}: {inst} → '{pos}'")

    print("\nCounts of first-row position titles:")
    counts = Counter(positions)
    for title, count in counts.most_common():
        print(f"{title}: {count}")


def mark_president_positions(df: pd.DataFrame, president_terms=None):
    """
    Mark one row per institution as 'President' in the 'FixedPosition' column.

    For each institution (in order of first appearance) the first row is checked for a
    true president title: one that contains any president term and no word starting with
    "vice". If the first row does not qualify, up to the next 3 rows are checked and the
    first match is used. Institutions where the match was found late, and institutions
    with no match in their first 4 rows (with the positions seen), are printed.

    ``df`` is modified in place: 'FixedPosition' is created (or cast to object dtype) and
    set to "President" on the selected rows.

    NOTE(review): rows are addressed by index label, so this assumes ``df.index`` is
    unique -- confirm for frames built by concatenation without ``reset_index``.

    Parameters:
      df               -- DataFrame with "Institution" and "Position" columns
      president_terms  -- optional iterable of substrings identifying a president-type
                          title (compared case-insensitively); defaults to the
                          module-level ``president_words``

    Returns:
      df               -- the same DataFrame with FixedPosition (object dtype) filled in
      president_rows   -- copy of the rows marked as President
    """
    if president_terms is None:
        president_terms = president_words
    terms = [str(term).lower() for term in president_terms]

    # "vice" must start a word: a plain substring test would also reject titles such as
    # "President, Student Services", because "services" contains the letters "vice".
    vice_pattern = re.compile(r"\bvice")

    # ensure column exists as object dtype so assigning a string later raises no dtype warning
    if "FixedPosition" not in df.columns:
        df["FixedPosition"] = pd.Series([None] * len(df), index=df.index, dtype="object")
    else:
        df["FixedPosition"] = df["FixedPosition"].astype("object")

    def is_true_president(pos):
        # str() turns missing values into "nan", which matches no term.
        pos = str(pos).lower()
        return any(term in pos for term in terms) and not vice_pattern.search(pos)

    president_indices = []
    found_late = []    # [(institution, idx, pos), ...]
    not_found = []     # [(institution, [pos1, pos2, pos3, pos4]), ...]

    for inst, group in df.groupby("Institution", sort=False):
        idxs = list(group.index)

        # check first row
        first_idx = idxs[0]
        if is_true_president(df.at[first_idx, "Position"]):
            president_indices.append(first_idx)
            continue

        # look in the next up to 3 rows and take the first match, if any
        late_idx = next(
            (nxt for nxt in idxs[1:4] if is_true_president(df.at[nxt, "Position"])),
            None,
        )
        if late_idx is not None:
            president_indices.append(late_idx)
            found_late.append((inst, late_idx, df.at[late_idx, "Position"]))
        else:
            seen_positions = [str(df.at[i, "Position"]).strip() for i in idxs[:4]]
            not_found.append((inst, seen_positions))

    # Mark the identified rows, then snapshot them so the copy includes the new label
    df.loc[president_indices, "FixedPosition"] = "President"
    president_rows = df.loc[president_indices].copy()

    # Print summary
    if found_late:
        print("\nInstitutions where president was found in rows 2–4:")
        for inst, idx, pos in found_late:
            print(f" - {inst}: row {idx} → '{pos}'")

    if not_found:
        print("\nInstitutions with NO president in first 4 rows and their observed positions:")
        for inst, pos_list in not_found:
            joined = ", ".join(f"'{p}'" for p in pos_list)
            print(f" - {inst}: {joined}")

    return df, president_rows

In [None]:
'''Provost Labeling'''

