In [55]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import re
from collections import defaultdict
from nameparser import HumanName

In [None]:
# Root folder of the project data; the per-year CSVs are read from and written
# to "<data_path>/cleaned_dataframes".
data_path = r"C:\Projects\connecteddatahub\data"

# Years to process; each one maps to "<year>_cleanedDataframe.csv".
years = [
    "1999", "2000", "2005", "2007", "2008",
    "2009", "2010", "2011", "2013", "2018",
]
# Quick single-year run while debugging:
# years = ['2013']

In [57]:
# Titles that can name a governing board's rank-and-file members.
board_words = [
    "Trustee", "Regent", "Member", "Fellow", "Overseer",
    "Governor", "Curator", "Visitor", "Manager", 'Administrator',
]

# Every title keyword that is counted when inferring an institution's board
# word. Order matters: ties in the counts are resolved by first appearance.
position_bank = [
    "President", "Chancellor", "Provost", "Director", "Dean", "Controller",
    "Trustee", "Member", "Regent", "Chairman", "Overseer", "Assistant",
    "Librarian", "Secretary", "Chaplain", "Minister", "Treasurer",
    "Senior Counsel", "General Counsel", "Legal Counsel", "University Counsel",
    "College Counsel", "Special Counsel", "Corporation Counsel",
    "Officer", "Chief", "Professor", "Commissioner", "Fellow", "Chairperson",
    "Manager", "Clergy", "Coordinator", "Auditor", "Governor",
    "Representative", "Stockbroker", "Advisor", "Commandant", "Rector",
    "Attorney", "Curator", "Clerk", "Department Head", "Pastor", "Head",
    "Comptroller", "Deputy", "Inspector General",
]

# Lower-cased officer titles (not referenced by the functions in this section).
exception_terms = {"secretary", "chairman", "treasurer", 'member'}

In [58]:
def remove_suffixes(df):
    """
    Add a 'tempName' column holding each 'Name' without quoted text or suffix.

    Missing names are passed through unchanged. For every other name, any
    span enclosed in matching single or double quotes is removed, the rest is
    parsed with HumanName, the parsed suffix is blanked, and the re-rendered
    name is stripped of surrounding whitespace.

    The frame is modified in place and also returned.
    """
    quoted_span = re.compile(r'(["\']).*?\1')

    def strip_name(name):
        if pd.isna(name):
            return name
        parsed = HumanName(quoted_span.sub('', name))
        parsed.suffix = ''
        rendered = str(parsed)
        return rendered.strip()

    cleaned_names = df['Name'].apply(strip_name)
    df['tempName'] = cleaned_names
    return df

In [None]:
def get_board_names(df, windows=(10, 20), min_count=4,
                    position_words=None, board_keywords=None):
    """
    Infer each institution's board keyword from the tail of its position list.

    For every institution (grouped on the 'Institution' column) the LAST
    `window` rows are sampled, once per entry of `windows`, stopping at the
    first window that yields a keyword. In each phase:

      * Every word of `position_words` found (case-insensitively, as a
        substring) in a position is counted. Positions containing
        'director, ' are ignored entirely, and 'governor' is not counted for
        positions containing 'governor appointed'.
      * The most common word is replaced by the runner-up when either
          - the top word is governor/director, the runner-up is
            trustee/regent and has at least 80% of the top count, or
          - the top word is not a board keyword but the runner-up is.
      * The resulting word is accepted only if it was seen at least
        `min_count` times and is one of `board_keywords`.

    Parameters
    ----------
    df : DataFrame with 'Institution' and 'Position' columns.
    windows : iterable of int, tail sizes to try in order (default 10, then 20).
    min_count : int, minimum number of hits required to accept a keyword.
    position_words : list of str, words to count; defaults to the
        module-level `position_bank`.
    board_keywords : collection of str, acceptable results; defaults to the
        module-level `board_words`.

    Returns
    -------
    dict mapping institution -> chosen keyword (as spelled in
    `position_words`) or None when no keyword qualified.
    """
    if position_words is None:
        position_words = position_bank
    if board_keywords is None:
        board_keywords = board_words

    # Lower-case the bank once instead of once per position row.
    lowered_bank = [(word, word.lower()) for word in position_words]
    board_names = {}

    for institution, group in df.groupby('Institution'):
        chosen = None

        # Try progressively larger tail windows (last 10 rows, then last 20).
        for window in windows:
            recent = group.tail(window)['Position'].dropna().astype(str)
            counts = Counter()

            # 1) build counts
            for pos in recent:
                pl = pos.lower()
                # Such rows contribute to no keyword at all.
                if 'director, ' in pl:
                    continue
                governor_appointed = 'governor appointed' in pl
                for word, w in lowered_bank:
                    if w not in pl:
                        continue
                    if w == 'governor' and governor_appointed:
                        continue
                    counts[word] += 1

            if not counts:
                continue  # nothing matched in this window -> try the next one

            # 2) sort and pick top (with runner-up override); ties keep the
            #    order in which the words were first counted
            common = counts.most_common()
            top_word, top_count = common[0]

            if len(common) > 1:
                second_word, second_count = common[1]
                if (
                    (top_word.lower() in {'governor', 'director'}
                     and second_word.lower() in {'trustee', 'regent'}
                     and second_count >= 0.8 * top_count)
                    or (top_word not in board_keywords and second_word in board_keywords)
                ):
                    top_word, top_count = second_word, second_count

            # 3) enforce minimum frequency and membership
            if top_count >= min_count and top_word in board_keywords:
                chosen = top_word
                break  # stop after this phase succeeds

        board_names[institution] = chosen

    return board_names


def detect_director_boards(df, board_names):
    """
    Find institutions whose board members are listed simply as 'director'.

    Only institutions with no (truthy) entry in `board_names` are examined.
    The last 10 positions of each are lower-cased and stripped, blank ones
    are discarded, and the exact strings are tallied. An institution
    qualifies when the most frequent string is exactly 'director', it occurs
    at least 5 times, and the second most frequent string is not 'dean'.

    Returns the set of qualifying institution names.
    """
    director_institutions = set()

    for institution, group in df.groupby('Institution'):
        # A board keyword was already inferred for this institution.
        if board_names.get(institution):
            continue

        tail_positions = group.tail(10)['Position'].fillna('').astype(str)
        normalized = [text.lower().strip() for text in tail_positions if text.strip()]
        ranking = Counter(normalized).most_common(2)
        if not ranking:
            continue

        top_key, top_count = ranking[0]
        runner_up = ranking[1][0] if len(ranking) > 1 else None

        if top_key != 'director' or top_count < 5 or runner_up == 'dean':
            continue
        director_institutions.add(institution)

    return director_institutions

In [60]:
# def get_permissive_blocks(df, board_names):
#     """
#     Identify board member blocks for each institution.

#     For each institution, this function scans from the last occurrence of the institution’s
#     board keyword upward, allowing certain exceptions and heuristics, and returns a
#     mapping from institution to list of row indices representing the detected block.
#     """
#     static_exceptions = ['secretary', 'chairman', 'treasurer', 'chairperson', 'vice chair', 'member', 'appointed', 'elected']
#     blocks = {}
#     all_board_words = set(board_words)

#     for institution, group in df.groupby('Institution'):
#         names = group['tempName'].dropna().astype(str).tolist()
#         last_names = [n.split()[-1].lower() for n in names]
#         positions = group['Position'].fillna('').astype(str).tolist()
#         indices = group.index.to_list()

#         board_word = (board_names.get(institution) or '').lower()

#         # exclude the chosen word itself
#         others = all_board_words - {board_word.title()}

#         #start at last occurrence of this institution’s board word
#         if board_word:
#             matches = [i for i, p in enumerate(positions) if board_word in p.lower()]
#             start_idx = matches[-1] if matches else len(last_names) - 1
#         else:
#             start_idx = len(last_names) - 1

#         #expand board upwards
#         i = start_idx - 1
#         while i >= 0:
#             p_lower = positions[i].lower()

#             #check for alphabetical ordering, has the correct position, or is a commonly occurring board title (chairman, secretary, etc)
#             in_order = last_names[i] <= last_names[i + 1]
#             is_board = board_word in p_lower
#             is_exception = any(exc in p_lower for exc in static_exceptions)
            
#             #break if we see a frequently occurring "other" board word
#             hit_other = False
#             for other in others:
#                 #auburn university has an odd board and hardcoding is more robust than trying to make a rule that caters toward every board:
#                 if institution == 'Auburn University':
#                     if other.lower() in p_lower and other.lower() != 'member' and not(other.lower() == 'governor' and board_word in p_lower):
#                         hit_other = True
#                         i = -1
#                         break
#                 else:
#                     if other.lower() in p_lower and not(other.lower() == 'governor' and board_word in p_lower) and not(p_lower == 'governor appointed') and not(other.lower() == 'member' and in_order):
#                         hit_other = True
#                         i = -1
#                         break
#             if hit_other:
#                 break

    
#             if in_order or is_board or is_exception:
#                 start_idx = i
#                 i -= 1
#                 continue

#             #sometimes this prematurely stops so peek upwards with more restrictions to check for this
#             successes = 0
#             peeks = 0

#             for j in range(i - 1, max(i - 4, -1), -1):
#                 peeks += 1
#                 pj = positions[j].lower()

#                 #always indicates that the end of the board was reached
#                 if 'dean' in pj or ('director' in pj and board_word != 'director') or 'director,' in pj:
#                     continue
                
#                 #check for the original conditions
#                 j_in_order = last_names[j] <= last_names[j + 1]
#                 j_is_board = board_word in pj
#                 j_is_exception = any(exc in pj for exc in static_exceptions)

#                 if j_is_board or (j_in_order and j_is_exception):
#                     successes += 1

#             if peeks > 1 and successes > 1:
#                 start_idx = i
#                 i -= 1
#                 continue
#             else:
#                 break
        
#         blocks[institution] = indices[start_idx:]

#     return blocks


In [None]:
def get_permissive_blocks(df, board_names, board_keywords=None):
    """
    Identify a candidate board-member block for each institution.

    For each institution the scan is seeded at the last row whose position
    contains the institution's board keyword (or at the last row when there
    is no keyword or no match) and then walks upward, extending the block
    while rows look like board rows.

    A row above the current block is kept when any of these holds:
      * it is an "other board word" row (contains a different board keyword,
        other than 'member', and not the institution's own keyword) and fewer
        than three such rows have been seen in a row; on the third
        consecutive one, all three are removed and the scan stops;
      * its last name sorts at or before the last name of the row below;
      * its position contains the board keyword or a static exception title;
      * a peek at up to three rows further up finds at least two rows that
        either contain the board keyword or are both in alphabetical order
        and a static exception (rows mentioning dean/director are never
        counted as such successes).

    Rows whose 'tempName' is missing or blank keep their slot so that names,
    positions and index labels stay aligned; such rows provide no
    alphabetical-order evidence in either direction.

    Parameters
    ----------
    df : DataFrame with 'Institution', 'tempName' and 'Position' columns.
    board_names : dict mapping institution -> board keyword (or None).
    board_keywords : collection of str, the full set of board words used to
        detect "other" boards; defaults to the module-level `board_words`.

    Returns
    -------
    dict mapping institution -> list of index labels from the detected start
    row to the institution's last row.
    """
    static_exceptions = [
        'secretary', 'chairman', 'treasurer',
        'chairperson', 'vice chair', 'member',
        'appointed', 'elected', 'gubernatorial'
    ]
    if board_keywords is None:
        board_keywords = board_words
    blocks = {}
    all_board_words = set(board_keywords)

    def in_alpha_order(last_names, k):
        # True when rows k and k+1 both have a last name and are sorted.
        upper, lower = last_names[k], last_names[k + 1]
        return upper is not None and lower is not None and upper <= lower

    for institution, group in df.groupby('Institution'):
        # One entry per row (None when the name is missing or blank) so that
        # last_names[i], positions[i] and indices[i] always describe the same row.
        last_names = []
        for raw_name in group['tempName'].tolist():
            tokens = [] if pd.isna(raw_name) else str(raw_name).split()
            last_names.append(tokens[-1].lower() if tokens else None)
        positions = group['Position'].fillna('').astype(str).tolist()
        indices = group.index.to_list()

        board_word = (board_names.get(institution) or '').lower()
        others = all_board_words - {board_word.title()}

        # 1) seed at last occurrence of this institution's board_word
        if board_word:
            matches = [
                i for i, p in enumerate(positions)
                if board_word in p.lower()
            ]
            start_idx = matches[-1] if matches else len(last_names) - 1
        else:
            start_idx = len(last_names) - 1

        # 2) expand upward with the three-in-a-row "other" rule; rows that
        #    also contain board_word never count as "other"
        consecutive_other = 0
        i = start_idx - 1
        while i >= 0:
            p_lower = positions[i].lower()

            # check alphabetical order, board keyword, or static exception
            in_order = in_alpha_order(last_names, i)
            is_board = board_word in p_lower
            is_exception = any(exc in p_lower for exc in static_exceptions)

            # detect "other" board words
            hit_other = False
            for other in others:
                o = other.lower()
                skip_governor = (o == 'governor' and board_word in p_lower)
                # if board_word also in this row, treat it as safe
                if o in p_lower and not skip_governor and o != 'member' and board_word not in p_lower:
                    hit_other = True
                    break

            if hit_other:
                consecutive_other += 1
                if consecutive_other >= 3:
                    # three in a row -> remove all three and stop
                    start_idx = i + 3
                    break
                else:
                    # include this row provisionally and keep counting
                    start_idx = i
                    i -= 1
                    continue
            else:
                # reset counter if this row is not a pure "other"
                consecutive_other = 0

            # normal inclusion rules
            if in_order or is_board or is_exception:
                start_idx = i
                i -= 1
                continue

            # fallback peek: look at up to three rows further up
            successes = 0
            peeks = 0
            for j in range(i - 1, max(i - 4, -1), -1):
                peeks += 1
                pj = positions[j].lower()
                # dean/director rows are counted as peeked but never as successes
                if 'dean' in pj or ('director' in pj and board_word != 'director') or 'director,' in pj:
                    continue
                j_in_order = in_alpha_order(last_names, j)
                j_is_board = board_word in pj
                j_is_exception = any(exc in pj for exc in static_exceptions)
                if j_is_board or (j_in_order and j_is_exception):
                    successes += 1

            if peeks > 1 and successes > 1:
                start_idx = i
                i -= 1
                continue
            else:
                break

        blocks[institution] = indices[start_idx:]

    return blocks


In [62]:
def split_into_contiguous_runs(indices):
    """
    Group integer indices into runs of consecutive values.

    The input is sorted first; a value extends the current run only when it
    is exactly one greater than the previous value, otherwise it starts a new
    run. Returns a list of lists (empty when `indices` is empty).
    """
    if not indices:
        return []

    runs = []
    previous = None
    for value in sorted(indices):
        if runs and value == previous + 1:
            runs[-1].append(value)
        else:
            runs.append([value])
        previous = value
    return runs


def mark_board_members(df, board_names):
    """
    Label board members for each institution based on inferred board keywords.

    Steps:
      1. Institutions reported by detect_director_boards get the keyword
         'director' (overriding whatever `board_names` held for them).
      2. get_permissive_blocks supplies a candidate list of index labels per
         institution; institutions without a keyword or block are skipped.
      3. Within the first 10 rows of the block, the first row matching the
         keyword is located (for 'director' the position must be exactly
         'director'). Up to three rows above it may be pulled in, starting at
         the first one whose position contains a static officer title or any
         non-dean position text that occurs in the block.
      4. The selection is split into contiguous index runs; the longest run
         is kept, plus any run longer than 2 rows within 25 rows of it.
      5. The row directly above the block is added when it belongs to the
         same institution and its position is exactly chairman/chairperson/
         chair.
      6. The resulting rows get FixedPosition = 'Board Member'.

    `df` is modified in place (a 'FixedPosition' column is added if missing
    and labels are written), so callers that need the original should pass a
    copy. Index labels are assumed to be integers (label - 1 is used to find
    the row above).

    Returns (df without 'tempName', DataFrame of the labelled board rows
    without 'tempName').
    """
    static_exc = ['secretary', 'chairman', 'treasurer', 'chairperson', 'vice chair', 'member']

    # 1. Director override
    director_insts = detect_director_boards(df, board_names)
    names_map = board_names.copy()
    for inst in director_insts:
        names_map[inst] = 'director'

    if 'FixedPosition' not in df.columns:
        # Object dtype so string labels can be written without an
        # incompatible-dtype upcast from a float NaN column.
        df['FixedPosition'] = pd.Series(np.nan, index=df.index, dtype=object)

    blocks = get_permissive_blocks(df, names_map)
    validated_idx = []

    for inst, idx_list in blocks.items():
        board_word = (names_map.get(inst) or '').lower()
        if not board_word or not idx_list:
            continue

        # dynamic exceptions: include any non-dean position in the block
        dyn_exc = {str(df.at[i, 'Position']).lower().strip() for i in idx_list if 'dean' not in str(df.at[i, 'Position']).lower()}
        exceptions = set(static_exc) | dyn_exc

        # 2. Locate first occurrence of the board keyword (first 10 rows only)
        first_rel = None
        for rel, i in enumerate(idx_list[:10]):
            p = str(df.at[i, 'Position']).lower().strip()
            if (board_word == 'director' and p == 'director') or \
               (board_word != 'director' and board_word in p):
                first_rel = rel
                break

        if first_rel is None:
            continue

        # allow up to 3 rows above first_rel if they match exceptions
        start_rel = max(0, first_rel - 3)
        earliest = first_rel
        for rel in range(start_rel, first_rel):
            p = str(df.at[idx_list[rel], 'Position']).lower()
            if any(exc in p for exc in exceptions):
                earliest = rel
                break

        selected = idx_list[earliest:]

        # 3. Enforce contiguity: keep longest and additional runs if gaps <= 25
        runs = split_into_contiguous_runs(selected)
        longest = max(runs, key=len)
        kept = [longest]
        for run in runs:
            if run is longest or len(run) <= 2:
                continue
            gap = (run[0] - longest[-1] - 1) if run[0] > longest[-1] else (longest[0] - run[-1] - 1)
            if gap <= 25:
                kept.append(run)

        final = [i for r in kept for i in r]

        # 4. Peek above earliest row for chair positions
        if final:
            first_global = min(final)
            peek_idx = first_global - 1
            # The row above only qualifies if it belongs to this institution;
            # otherwise the previous institution's last row could be labelled.
            if peek_idx in df.index and df.at[peek_idx, 'Institution'] == inst:
                peek_p = str(df.at[peek_idx, 'Position']).lower().strip()
                if peek_p in {'chairman', 'chairperson', 'chair'} and peek_idx not in final:
                    final.insert(0, peek_idx)

        validated_idx.extend(final)
        df.loc[final, 'FixedPosition'] = 'Board Member'

    board_df = df.loc[validated_idx].copy()
    df = df.drop(columns='tempName', errors='ignore')
    board_df = board_df.drop(columns='tempName', errors='ignore')
    return df, board_df



In [63]:
def detect_primary_and_secondary_boards(df: pd.DataFrame):
    """
    Detect up to two boards per institution and label them.

    Pass 1 infers board keywords on the full frame and labels the detected
    rows 'Board Member'. Pass 2 removes those rows and repeats the detection
    on what is left. If pass 2 finds nothing, or infers exactly the same
    keyword mapping as pass 1, only the pass-1 result is returned.

    Otherwise the pass-2 rows are labelled 'Second Board Member'. For every
    institution that has rows from both passes, the labels are swapped when
    the pass-1 block is larger, so the smaller block carries 'Board Member'.

    Returns (labelled full frame, board rows from both passes concatenated);
    in the early-return case the second element is the pass-1 rows only.
    """
    # Pass 1: primary board on the full frame
    first_names = get_board_names(df)
    df_labeled, primary = mark_board_members(df.copy(), first_names)

    # Pass 2: rerun on everything pass 1 did not claim
    leftover = df.drop(index=primary.index)
    second_names = get_board_names(leftover)
    _, secondary = mark_board_members(leftover.copy(), second_names)

    no_second_board = secondary.empty or first_names == second_names
    if no_second_board:
        return df_labeled, primary

    # Initial labels (pass-1 rows in df_labeled are already 'Board Member')
    df_labeled.loc[secondary.index, 'FixedPosition'] = 'Second Board Member'
    primary = primary.assign(FixedPosition='Board Member')
    secondary = secondary.assign(FixedPosition='Second Board Member')

    # Per-institution swap so the smaller block is the 'Board Member' one
    for inst in secondary['Institution'].unique():
        rows_first = primary[primary['Institution'] == inst].index
        rows_second = secondary[secondary['Institution'] == inst].index

        if rows_first.empty or rows_second.empty:
            continue
        if len(rows_first) <= len(rows_second):
            continue

        df_labeled.loc[rows_first, 'FixedPosition'] = 'Second Board Member'
        df_labeled.loc[rows_second, 'FixedPosition'] = 'Board Member'
        primary.loc[rows_first, 'FixedPosition'] = 'Second Board Member'
        secondary.loc[rows_second, 'FixedPosition'] = 'Board Member'

    return df_labeled, pd.concat([primary, secondary], axis=0)


In [64]:
def clean_and_report_boards(df_labeled: pd.DataFrame,
                            board_df: pd.DataFrame):
    """
    Trim and sanity-check every labelled board block.

    For each institution and each label ('Board Member', then
    'Second Board Member'):
      1) Leading rows (in index order) whose position contains 'dean' or
         'director,' are dropped from both returned frames, and the dropped
         index labels are printed.
      2) On the remaining rows, the number of positions containing each
         board word is counted; when at least two words occur and the second
         most frequent reaches 0.333x the most frequent, a line is printed.

    NOTE(review): step 1 deletes the rows from the full frame as well, not
    just their board label - confirm that is intended.

    Returns (cleaned full frame, cleaned board frame); inputs are not mutated.
    """
    df_clean = df_labeled.copy()
    bd_clean = board_df.copy()
    keywords = [word.lower() for word in board_words]

    def lowered_positions(frame, inst, label):
        # Lower-cased positions of one institution/label block, in index order.
        belongs = (frame['Institution'] == inst) & (frame['FixedPosition'] == label)
        return frame[belongs].sort_index()['Position'].astype(str).str.lower()

    for inst in sorted(bd_clean['Institution'].unique()):
        for label in ('Board Member', 'Second Board Member'):
            positions = lowered_positions(bd_clean, inst, label)
            if positions.empty:
                continue

            # 1) collect the leading dean / "director," rows
            leading = []
            for row_label, text in positions.items():
                if 'dean' not in text and 'director,' not in text:
                    break
                leading.append(row_label)

            if leading:
                print(f"{inst} ({label}): dropping top rows {leading} for dean/director,")
                df_clean = df_clean.drop(index=leading)
                bd_clean = bd_clean.drop(index=leading)
                # re-read the block now that rows are gone
                positions = lowered_positions(bd_clean, inst, label)

            # 2) relative-frequency report on what remains
            tallies = {kw: positions.str.contains(kw).sum() for kw in keywords}
            ranked = sorted(
                ((kw, hits) for kw, hits in tallies.items() if hits > 0),
                key=lambda pair: pair[1],
                reverse=True,
            )
            if len(ranked) < 2:
                continue

            (top_w, top_c), (second_w, second_c) = ranked[0], ranked[1]
            if second_c >= 0.333 * top_c:
                print(
                    f"{inst} ({label}) → "
                    f"'{second_w}' ({second_c}) ≥ 0.333× '{top_w}' ({top_c})"
                )

    return df_clean, bd_clean


In [65]:
# Make sure the output folder exists before any input file is overwritten;
# otherwise a missing "boards" folder fails only after the cleaned CSV has
# already been replaced.
cleaned_dir = os.path.join(data_path, "cleaned_dataframes")
boards_dir = os.path.join(cleaned_dir, "boards")
os.makedirs(boards_dir, exist_ok=True)

for year in years:
    print(f"Processing {year}")
    cleaned_csv = os.path.join(cleaned_dir, f"{year}_cleanedDataframe.csv")
    boards_csv = os.path.join(boards_dir, f"{year}_boards.csv")

    raw = pd.read_csv(cleaned_csv)

    # adds the suffix-free 'tempName' column used by the board detection
    raw = remove_suffixes(raw)

    # primary + secondary boards, then trim/report each block
    clean_df, board_df = detect_primary_and_secondary_boards(raw)
    clean_df, board_df = clean_and_report_boards(clean_df, board_df)

    # NOTE(review): this overwrites the file that was just read, so a second
    # run starts from already-labelled (and already-trimmed) data.
    clean_df.to_csv(cleaned_csv, index=False)
    board_df.to_csv(boards_csv, index=False)


Processing 1999
Albion College (Board Member): dropping top rows [214] for dean/director,
Boston University (Board Member) → 'member' (18) ≥ 0.333× 'trustee' (47)
Chestnut Hill College (Board Member): dropping top rows [3877, 3878, 3879] for dean/director,
Cornell University (Second Board Member) → 'fellow' (22) ≥ 0.333× 'trustee' (60)
Hendrix College (Board Member): dropping top rows [8401, 8402] for dean/director,
Indiana University Southeast (Board Member) → 'governor' (6) ≥ 0.333× 'trustee' (6)
Lake Erie College (Board Member): dropping top rows [9767] for dean/director,
Lenoir Rhyne College (Board Member): dropping top rows [9944] for dean/director,
Long Island University (Second Board Member) → 'member' (15) ≥ 0.333× 'trustee' (36)
Ohio State University (Board Member) → 'regent' (4) ≥ 0.333× 'trustee' (7)
Regis University (Board Member) → 'member' (9) ≥ 0.333× 'trustee' (27)
Rhodes College (Board Member) → 'member' (10) ≥ 0.333× 'trustee' (21)
State University Of New York At Gene