In [336]:
# Root folder of the project data; later cells join sub-folder names onto it.
data_path = r"C:\Projects\connecteddatahub\data"

In [337]:
import pandas as pd
from pathlib import Path
from typing import List, Optional
import os

Word bank of positions that appear in the data:

In [338]:
# Known job-title keywords. The helpers below look for these as case-sensitive
# substrings, and the splitting code title-cases each position string before
# matching, which is why every entry here is written in Title Case.
POSITION_BANK = [
    "President",
    "Chancellor",
    "Provost",
    "Director",
    "Dean",
    "Controller",
    "Trustee",
    "Member",
    "Regent",
    "Chairman",
    "Overseer",
    "Assistant",
    "Librarian",
    "Secretary",
    "Chaplain",
    "Minister",
    "Treasurer",
    "Senior Counsel",
    "General Counsel",
    "Legal Counsel",
    "University Counsel",
    "College Counsel",
    "Special Counsel",
    "Corporation Counsel",
    "Corporate Counsel",
    "Officer",
    "Chief",
    "Professor",
    "Commissioner",
    "Fellow",
    "Chairperson",
    "Manager",
    "Clergy",
    "Coordinator",
    "Auditor",
    "Governor",
    "Representative",
    "Stockbroker",
    "Advisor",
    "Commandant",
    "Rector",
    "Attorney",
    "Curator",
    "Clerk",
    "Department Head",
    "Pastor",
    "Head",
    "Comptroller",
    "Deputy",
    "Inspector General",
    "Instructor",
    "Registrar",
    "Ombuds",
    "Administrator",
    "Liaison",
    "Administrative Associate",
    "Webmaster",
    "Specialist",
    "University Planner",
    "Architect",
]

In [339]:
# Root data folder and the snapshot years that the cells below iterate over.
data_path = r"C:\Projects\connecteddatahub\data"
years = [
    str(snapshot_year)
    for snapshot_year in (1999, 2000, 2005, 2007, 2008, 2009, 2010, 2011, 2013, 2018)
]
# For a quick single-year run, temporarily narrow the tuple above (e.g. to 1999 only).

**Position Splitting Logic:**

1. For each delimiter, walk through its occurrences in the position string from right to left.
2. Check **And** first, because when several positions are listed together it is always the last delimiter used (e.g. “A, B And C”).
3. Then check semicolons and commas (the code tries `;` before `,`) – the semicolon is a rare delimiter, but it does occur.
4. Do not split when the text after the delimiter is only a qualifier, as in “Dean, Office of the Provost”, because this is actually one position.
5. Run the whole pass several times, because each pass splits a row at most once and some positions need to be split more than once.


In [340]:
def contains_position(text: str) -> bool:
    """Tell whether *text* contains at least one POSITION_BANK entry.

    Matching is a plain, case-sensitive substring test, so an entry also
    matches when it appears inside a longer word.
    """
    for known_title in POSITION_BANK:
        if known_title in text:
            return True
    return False

    
def count_appearances(text: str) -> int:
    """Count the POSITION_BANK entries that occur in *text* as substrings.

    Each bank entry contributes at most 1, however often it repeats in
    *text*; entries that overlap (one being a substring of another) are
    each counted.
    """
    matched_titles = [known_title for known_title in POSITION_BANK if known_title in text]
    return len(matched_titles)


def all_split_indices(text: str, sep: str) -> list[int]:
    """Return the start index of every non-overlapping occurrence of `sep` in `text`.

    The search runs left to right and resumes just past each match, so
    overlapping occurrences are not reported (``"aaaa"`` / ``"aa"`` gives
    ``[0, 2]``). The comparison is case-sensitive.

    Args:
        text: String to search.
        sep: Non-empty delimiter to look for.

    Returns:
        Ascending list of match positions; empty when `sep` does not occur.

    Raises:
        ValueError: If `sep` is empty. An empty delimiter matches at every
            position without advancing the search, which previously made
            this function loop forever.
    """
    if not sep:
        raise ValueError("empty separator")

    idxs: list[int] = []
    start = 0
    while True:
        i = text.find(sep, start)
        if i < 0:
            return idxs
        idxs.append(i)
        # Resume after the whole match so occurrences never overlap.
        start = i + len(sep)

def split_position_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Run one splitting pass: rows holding two positions become two rows.

    For every row the "Position" value is title-cased (missing values are
    treated as an empty string) and searched for the delimiters "And", ";"
    and ",", in that order. Occurrences of a delimiter are tried from right
    to left. The first candidate that satisfies all the rules below is
    used, and the row is replaced by two copies whose "Position" holds the
    left and right fragment respectively:

    * the left fragment must contain a POSITION_BANK title;
    * the right fragment must contain a POSITION_BANK title;
    * a right fragment that looks like a qualifier ("office of the",
      "to the", "'s office", "for the") is only accepted when it contains
      at least two bank titles.

    A row is split at most once per call, so callers repeat the pass (see
    `apply_splitting_pipeline`). Rows that are not split are passed through
    unchanged, i.e. their "Position" keeps its original casing.

    NOTE(review): delimiters are matched as plain substrings, so "And" also
    matches inside words such as "Anderson"; the title checks on both
    fragments are the only guard against that.

    Args:
        df: Frame with a "Position" column.

    Returns:
        A new frame with a fresh 0..n-1 index. An empty input yields an
        empty frame that keeps the input's columns.
    """
    new_rows = []
    for _, row in df.iterrows():
        raw_position = row["Position"]
        pos = "" if pd.isna(raw_position) else str(raw_position).title()
        split_done = False

        for sep in ("And", ";", ","):
            # `pos` is never modified here, so one scan per delimiter is
            # enough; multi-way splits are handled by repeating the pass.
            for idx in reversed(all_split_indices(pos, sep)):
                left = pos[:idx].strip()
                right = pos[idx + len(sep):].strip()

                # Never split if the left-hand side has no known title.
                if not contains_position(left):
                    continue

                # Skip qualifier fragments: 'Director, Office Of The Provost'
                # is one position, not two.
                low = right.lower()
                if ("office of the" in low or "to the" in low or "'s office" in low or "for the" in low) and count_appearances(right) < 2:
                    continue

                # Both sides contain a title: accept the split.
                if contains_position(right):
                    top = row.copy()
                    top["Position"] = left
                    bottom = row.copy()
                    bottom["Position"] = right
                    new_rows.extend((top, bottom))
                    split_done = True
                    break

            if split_done:
                break

        if not split_done:
            new_rows.append(row)

    if not new_rows:
        # pd.DataFrame([]) would drop the column labels; keep the schema.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.DataFrame(new_rows).reset_index(drop=True)

def apply_splitting_pipeline(df: pd.DataFrame, passes: int = 3) -> pd.DataFrame:
    """Apply `split_position_rows` repeatedly so multi-title positions are fully split.

    Each pass splits a row into at most two rows, so `passes` passes can
    turn one original row into at most ``2 ** passes`` rows (8 with the
    default).

    Args:
        df: Frame with a "Position" column.
        passes: Maximum number of passes to run. Defaults to 3, the number
            of passes this function has always performed. With 0 the input
            is returned unchanged.

    Returns:
        The frame produced by the last pass that was run.
    """
    for _ in range(passes):
        rows_before = len(df)
        df = split_position_rows(df)
        # A split always adds exactly one row, so an unchanged row count
        # means nothing was split and further passes cannot change anything.
        if len(df) == rows_before:
            break
    return df


In [None]:
# Split positions for every configured year and write one CSV per year.
input_dir = os.path.join(data_path, 'gpt_dataframes')
output_dir = os.path.join(data_path, 'cleaned_dataframes', 'split_positions')
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for year in years:
    print(f"Processing: {year}")
    gpt_df = pd.read_csv(os.path.join(input_dir, f'{year}_gptDataframe.csv'))
    split_df = apply_splitting_pipeline(gpt_df)
    split_df.to_csv(os.path.join(output_dir, f'{year}_split_positions.csv'), index=False)


Processing: 1999
Processing: 2000
Processing: 2005
Processing: 2007
Processing: 2008
Processing: 2009
Processing: 2010
Processing: 2011
Processing: 2013
Processing: 2018


In [342]:
'''Same pipeline as before but used for validation so keeps track of each split that is made and prints out the resultant splits:'''


def split_position_rows_validation(
    df: pd.DataFrame
) -> tuple[pd.DataFrame, list[dict[str, object]]]:
    """Run one splitting pass and also report every split that was made.

    The splitting rules are the same as in `split_position_rows`: the
    title-cased "Position" is searched for "And", ";" and "," (in that
    order, occurrences tried right to left), and the first candidate whose
    left and right fragments both contain a POSITION_BANK title is used.
    A right fragment that looks like a qualifier ("office of the",
    "to the", "'s office", "for the") is only accepted when it contains at
    least two bank titles. A row is split at most once per call.

    Args:
        df: Frame with a "Position" column.

    Returns:
        ``(frame, splits)``. ``frame`` is the new frame with a fresh
        0..n-1 index (an empty input keeps its columns). ``splits`` has one
        dict per split with the keys "row_index" (index label of the row in
        `df`), "original" (title-cased position before the split), "sep",
        "left" and "right".
    """
    # Builtin generics are used in the signature because typing.Tuple,
    # typing.Dict and typing.Any are not imported by this notebook.
    splits: list[dict[str, object]] = []
    new_rows = []

    for idx, row in df.iterrows():
        raw_position = row["Position"]
        pos = "" if pd.isna(raw_position) else str(raw_position).title()
        original_pos = pos
        split_done = False

        for sep in ("And", ";", ","):
            # `pos` is never modified here, so one scan per delimiter is
            # enough; multi-way splits are handled by repeating the pass.
            for split_idx in reversed(all_split_indices(pos, sep)):
                left = pos[:split_idx].strip()
                right = pos[split_idx + len(sep):].strip()

                # Never split if the left-hand side has no known title.
                if not contains_position(left):
                    continue

                # Skip qualifier fragments such as "Office Of The ...".
                low = right.lower()
                if ("office of the" in low or "to the" in low or "'s office" in low or "for the" in low) and count_appearances(right) < 2:
                    continue

                # The right-hand side also contains a title: do the split.
                if contains_position(right):
                    top = row.copy()
                    top["Position"] = left
                    bot = row.copy()
                    bot["Position"] = right
                    new_rows.extend([top, bot])

                    # Record the split for later inspection.
                    splits.append({
                        "row_index": idx,
                        "original": original_pos,
                        "sep": sep,
                        "left": left,
                        "right": right
                    })

                    split_done = True
                    break

            if split_done:
                break

        if not split_done:
            new_rows.append(row)

    if not new_rows:
        # pd.DataFrame([]) would drop the column labels; keep the schema.
        return df.iloc[0:0].reset_index(drop=True), splits
    return pd.DataFrame(new_rows).reset_index(drop=True), splits

def apply_splitting_pipeline_validation(
    df: pd.DataFrame, passes: int = 3
) -> tuple[pd.DataFrame, list[dict[str, object]]]:
    """Repeat `split_position_rows_validation` and collect all recorded splits.

    Args:
        df: Frame with a "Position" column.
        passes: Maximum number of passes to run. Defaults to 3, the number
            of passes this function has always performed. With 0 the input
            is returned unchanged together with an empty list.

    Returns:
        ``(frame, all_splits)``: the frame produced by the last pass that
        was run, and the split records of every pass in the order they were
        made. The frame index is reset after each pass, so a record's
        "row_index" refers to the frame that went into the pass which
        produced it, not necessarily to the original input.
    """
    # Builtin generics are used in the signature because typing.Tuple,
    # typing.Dict and typing.Any are not imported by this notebook.
    all_splits: list[dict[str, object]] = []
    for _ in range(passes):
        df, pass_splits = split_position_rows_validation(df)
        if not pass_splits:
            # Nothing was split, so further passes cannot change anything.
            break
        all_splits.extend(pass_splits)
    return df, all_splits

In [343]:
# Validate the splitting rules on the first configured year and list every
# split that was made, so the results can be checked by eye.
year = years[0]
gpt_df = pd.read_csv(
    os.path.join(data_path, 'gpt_dataframes', f'{year}_gptDataframe.csv')
)
split_df, split_events = apply_splitting_pipeline_validation(gpt_df)

print("Splits performed:")
for ev in split_events:
    # Each event dict supplies exactly the fields named in the template.
    print(
        " • Row {row_index}: “{original}” split on “{sep}” to “{left}” | “{right}”".format(**ev)
    )

Splits performed:
 • Row 3: “Vice Chancellor And President, Acu Foundation” split on “And” to “Vice Chancellor” | “President, Acu Foundation”
 • Row 7: “Vice President And General Counsel” split on “And” to “Vice President” | “General Counsel”
 • Row 27: “Vice Chairman And Secretary” split on “And” to “Vice Chairman” | “Secretary”
 • Row 105: “Assistant Secretary To The Board Of Trustees And Special Assistant To The President” split on “And” to “Assistant Secretary To The Board Of Trustees” | “Special Assistant To The President”
 • Row 111: “Vice President, Finance And Treasurer” split on “And” to “Vice President, Finance” | “Treasurer”
 • Row 121: “Controller And Associate Treasurer” split on “And” to “Controller” | “Associate Treasurer”
 • Row 147: “Vice President, Student Life And Community Relations And Dean Of Students” split on “And” to “Vice President, Student Life And Community Relations” | “Dean Of Students”
 • Row 154: “Director, Graduate Studies And Associate Dean Of The Col