In [309]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import re
from collections import defaultdict
from nameparser import HumanName

In [310]:
data_path = r"C:\Projects\connecteddatahub\data"
years = ["1999","2000","2005","2007","2008","2009","2010","2011","2013","2018",]
years = ['2013']

In [None]:
board_words = ["Trustee", "Regent", "Member", "Fellow", "Overseer", "Governor", "Curator", "Visitor", "Manager"]
position_bank = ["President", "Chancellor", "Provost", "Director", "Dean", "Controller", "Trustee", "Member", "Regent", "Chairman", "Overseer", "Assistant", "Librarian", "Secretary", "Chaplain", "Minister", "Treasurer", "Senior Counsel", "General Counsel", "Legal Counsel", "University Counsel", "College Counsel", "Special Counsel", "Corporation Counsel", "Officer", "Chief", "Professor", "Commissioner", "Fellow", "Chairperson", "Manager", "Clergy", "Coordinator", "Auditor", "Governor", "Representative", "Stockbroker", "Advisor", "Commandant", "Rector", "Attorney", "Curator", "Clerk", "Department Head", "Pastor", "Head", "Comptroller", "Deputy", "Inspector General"]
exception_terms = {"secretary", "chairman", "treasurer"}

In [312]:
def remove_suffixes(df):
    def strip_name(name):
        if pd.isna(name):
            return name
        clean = re.sub(r'(["\']).*?\1', '', name)
        hn = HumanName(clean)
        hn.suffix = ''
        return str(hn).strip()
    df['tempName'] = df['Name'].apply(strip_name)
    return df

In [None]:
def get_board_names(df):
    """
    Infer each institution’s board keyword by sampling its last 10 positions.
    Applies a governor/director vs trustee/regent runner‑up override.
    """
    board_names = {}
    for institution, group in df.groupby('Institution'):
        recent = group.tail(10)['Position'].dropna().astype(str)
        counts = Counter()
        for pos in recent:
            pl = pos.lower()
            for word in position_bank:
                if word.lower() in pl and 'director, ' not in pl:
                    counts[word] += 1
        chosen = None
        if counts:
            common = counts.most_common()
            top_word, top_count = common[0]
            if len(common) > 1:
                second_word, second_count = common[1]
                if (top_word.lower() in {'governor','director'}
                    and second_word.lower() in {'trustee','regent'}
                    and second_count >= 0.8 * top_count):
                    top_word = second_word
            if top_word in board_words:
                chosen = top_word
        board_names[institution] = chosen
    return board_names

def detect_director_boards(df, board_names):
    """
    Institutions without board_names whose last 10 positions
    are most commonly exactly 'director'. Prints detections.
    """
    director_institutions = set()
    for institution, group in df.groupby('Institution'):
        if board_names.get(institution):
            continue
        recent = group.tail(10)['Position'].fillna('').astype(str)
        counts = Counter(p.lower().strip() for p in recent if p.strip())
        if counts and counts.most_common(1)[0][0] == 'director':
            print(f'Director board detected for {institution}')
            director_institutions.add(institution)
    return director_institutions

In [None]:
def get_permissive_blocks(df, board_names):
    static_exceptions = ['secretary','chairman','treasurer','chairperson','vice chair']
    blocks = {}

    all_board_words = set(board_words)

    for institution, group in df.groupby('Institution'):
        names       = group['tempName'].dropna().astype(str).tolist()
        last_names  = [n.split()[-1].lower() for n in names]
        positions   = group['Position'].fillna('').astype(str).tolist()
        indices     = group.index.to_list()

        board_word  = (board_names.get(institution) or '').lower()
        # exclude the chosen word itself
        others      = all_board_words - {board_word.title()}

        # precompute how often each "other" appears in this group
        other_freq = {
            other: sum(1 for p in positions if other.lower() in p.lower())
            for other in others
        }

        # 1) seed at last occurrence of this inst’s board_word
        if board_word:
            matches = [i for i, p in enumerate(positions) if board_word in p.lower()]
            if matches:
                start_idx = matches[-1]
            else:
                start_idx = len(last_names) - 1
        else:
            start_idx = len(last_names) - 1

        # initialize scan pointer
        i = start_idx - 1

        # 2) expand upward
        while i >= 0:
            p_lower = positions[i].lower()

            # A) break if we see an "other" twice in a row frequency rule
            hit_other = False
            for other in others:
                if other.lower() in p_lower and other_freq.get(other, 0) > 6:
                    # immediate stop
                    hit_other = True
                    i = -1
                    break
            if hit_other or i < 0:
                break

            # B) standard flags
            in_order     = last_names[i] <= last_names[i+1]
            is_board     = board_word in p_lower
            is_exception = any(exc in p_lower for exc in static_exceptions)

            if in_order or is_board or is_exception:
                start_idx = i
                i -= 1
                continue

            # C) A/B last‑name bail
            if last_names[i].startswith(('a','b')):
                break


            # D) peek logic (2 of next 3 must pass)
            successes = 0
            peeks     = 0
            for j in range(i-1, max(i-4, -1), -1):
                peeks += 1
                pj = positions[j].lower()
            
                # hard fails
                if 'dean' in pj or ('director' in pj and board_word != 'director'):
                    continue
                j_in_order     = last_names[j] <= last_names[j+1]
                j_is_board     = board_word in pj
                j_is_exception = any(exc in pj for exc in static_exceptions)

                # if j_in_order or j_is_board or j_is_exception:
                if j_is_board or (j_in_order and j_is_exception):
                    successes += 1

            if peeks >= 2 and successes >= 2:
                start_idx = i
                i -= 1
                continue
            else:
                break
        
        blocks[institution] = indices[start_idx:]

    return blocks


def split_into_contiguous_runs(indices):
    """
    Split a sorted list of global indices into maximal contiguous runs.
    """
    if not indices:
        return []
    sorted_idx = sorted(indices)
    runs = [[sorted_idx[0]]]
    for x in sorted_idx[1:]:
        if x == runs[-1][-1] + 1:
            runs[-1].append(x)
        else:
            runs.append([x])
    return runs


def mark_board_members(df, board_names):
    static_exc = ['secretary','chairman','treasurer','chairperson']
    # ── 1. director overrides ────────────────────────────────────────────
    director_insts = detect_director_boards(df, board_names)
    names_map = board_names.copy()
    for inst in director_insts:
        names_map[inst] = 'director'
    if 'FixedPosition' not in df.columns:
        df['FixedPosition'] = np.nan

    blocks = get_permissive_blocks(df, names_map)
    validated_idx = []

    for inst, idx_list in blocks.items():
        board_word = (names_map.get(inst) or '').lower()
        if not board_word or not idx_list:
            continue

        if inst == "Brown University":
            print(idx_list)
        # dynamic exceptions (no 'dean')
        dyn_exc = {
            str(df.at[i,'Position']).lower().strip()
            for i in idx_list
            if 'dean' not in str(df.at[i,'Position']).lower()
        }
        exceptions = set(static_exc) | dyn_exc

        # ── 2. locate first board word ──────────────────────────────────
        first_rel = None
        for rel, i in enumerate(idx_list[:7]):
            p = str(df.at[i,'Position']).lower().strip()
            if (board_word == 'director' and p == 'director') or \
               (board_word != 'director' and board_word in p):
                first_rel = rel
                break
        if first_rel is None:
            continue

        # expand up to 3 rows above for exceptions
        window = 3
        start_rel = max(0, first_rel - window)
        earliest = first_rel
        for rel in range(start_rel, first_rel):
            if any(exc in str(df.at[idx_list[rel],'Position']).lower() for exc in exceptions):
                earliest = rel
                break

        selected = idx_list[earliest:]

        # ── 3. enforce contiguity rules ─────────────────────────────────
        runs = split_into_contiguous_runs(selected)
        longest = max(runs, key=len)
        kept = [longest]
        for run in runs:
            if run is longest:
                continue
            if len(run) <= 2:
                continue
            gap = (run[0] - longest[-1] - 1
                   if run[0] > longest[-1]
                   else longest[0] - run[-1] - 1)
            if gap > 100:
                continue
            kept.append(run)

        # flatten into final list
        final = [i for r in kept for i in r]

        # ── NEW: peek one row *above* the earliest selected row ──────────
        # if that row exists in idx_list and its Position is exactly
        # 'chairman', 'chairperson', or 'chair', include it too
        if final:
            # find the smallest global index in final
            first_global = min(final)
            peek_idx     = first_global - 1
            # only if that row exists in the dataframe
            if peek_idx in df.index:
                peek_p = str(df.at[peek_idx, 'Position']).lower().strip()
                if peek_p in {'chairman', 'chairperson', 'chair'}:
                    # prepend it, but only if it's not already in final
                    if peek_idx not in final:
                        final.insert(0, peek_idx)

        # label and collect
        validated_idx.extend(final)
        df.loc[final, 'FixedPosition'] = 'Board Member'

    board_df = df.loc[validated_idx].copy()
    df = df.drop(columns='tempName', errors='ignore')
    board_df = board_df.drop(columns='tempName', errors='ignore')
    return df, board_df


In [314]:
for year in years:
    print(f"Processing: {year}")
    df = pd.read_csv(os.path.join(data_path, 'cleaned_dataframes', f'{year}_cleanedDataframe.csv'))
    df = remove_suffixes(df)
    board_names = get_board_names(df)
    clean_df, board_df = mark_board_members(df, board_names)
    clean_df.to_csv(os.path.join(data_path, 'cleaned_dataframes', f'{year}_cleanedDataframe.csv'), index = False)
    board_df.to_csv(os.path.join(data_path, 'cleaned_dataframes', 'boards', f'{year}_boards.csv'), index = False)

Processing: 2013
Director board detected for Creighton University
Director board detected for Duquesne University
Director board detected for Georgetown University
Director board detected for John Carroll University
Director board detected for Oregon Health & Science University
Director board detected for University Of St Thomas Texas
Director board detected for Valparaiso University
[1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916]
