In [31]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

In [32]:
# Root of the project data tree. Defaults to the original workstation path;
# set the CONNECTEDDATAHUB_DATA environment variable to run the notebook elsewhere.
data_path = os.environ.get("CONNECTEDDATAHUB_DATA", r'C:\Projects\connecteddatahub\data')

# Years whose '<year>_split_positions.csv' files are processed by the loops below.
# For a quick single-year run, temporarily replace the list, e.g. years = ["2013"].
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2010", "2011", "2013", "2018"]

# Lower-case substrings searched for in lower-cased position titles to decide
# whether a title is treated as a president-type position.
president_words = ["president", "chancellor", "superintendent", "commissioner", "officer-in-charge"]

In [33]:
def count_first_positions(df):
    """
    Report which position title is listed first for each institution.

    The first row of every "Institution" group (in order of first appearance,
    original index labels preserved) is inspected. An institution is flagged
    when its first "Position" either contains none of the module-level
    `president_words` or contains a word starting with "vice".

    Prints two sections to stdout:
      1. the flagged institutions with their original row index and position;
      2. a frequency count of every first-row position title, most common first.

    Parameters:
      df -- DataFrame with at least the columns "Institution" and "Position".

    Returns:
      None; the function only prints.
    """
    # head(1) already returns each group's first row in file order together
    # with its original index label, so no second lookup through df.loc is needed.
    first_rows = df.groupby("Institution", sort=False).head(1)

    # Match "vice" only at the start of a word: a plain substring test would
    # also hit titles such as "President, Student Services" ("ser-vice-s").
    vice_pattern = re.compile(r"\bvice")

    positions = []
    flagged = []

    for idx, institution, raw_position in zip(
        first_rows.index, first_rows["Institution"], first_rows["Position"]
    ):
        # str() turns missing values into the text "nan", which is then flagged below.
        pos = str(raw_position).strip()
        positions.append(pos)

        position_lower = pos.lower()
        has_president_word = any(word in position_lower for word in president_words)
        if not has_president_word or vice_pattern.search(position_lower):
            flagged.append((idx, institution, pos))

    print("\nInstitutions with flagged positions (no 'President'/'Chancellor' or contains 'Vice President'):")
    for idx, inst, pos in flagged:
        print(f" - Index {idx}: {inst} → '{pos}'")

    print("\nCounts of first-row position titles:")
    for title, count in Counter(positions).most_common():
        print(f"{title}: {count}")


In [34]:
# Print the first-position report for every configured year.
split_positions_dir = os.path.join(data_path, 'cleaned_dataframes', 'split_positions')
for year in years:
    print(f"Processing: {year}")
    split_positions_csv = os.path.join(split_positions_dir, f'{year}_split_positions.csv')
    gpt_df = pd.read_csv(split_positions_csv)
    count_first_positions(gpt_df)

Processing: 1999

Institutions with flagged positions (no 'President'/'Chancellor' or contains 'Vice President'):
 - Index 4259: Queens College → 'Dean'
 - Index 6636: Tennessee Board Of Regents → 'Regent'
 - Index 9010: Institute For Advanced Study → 'Director'
 - Index 17319: State University Of New York System → 'Trustee'
 - Index 18277: Texas A&M University Baylor College Of Dentistry → 'Vice President'
 - Index 18414: Texas A&M University Galveston → 'Vice President'

Counts of first-row position titles:
President: 547
Chancellor: 116
President (Acting): 17
Chancellor (Acting): 2
Vice President: 2
Dean: 1
President (Until January 1, 1999): 1
Regent: 1
Director: 1
Commissioner of Higher Education: 1
Acting President: 1
Trustee: 1
Acting Chancellor: 1
Superintendent: 1
Hudson E. Bridge Chancellor: 1
Processing: 2000

Institutions with flagged positions (no 'President'/'Chancellor' or contains 'Vice President'):
 - Index 9462: Institute For Advanced Study → 'Director'
 - Index 19911:

In [35]:
def mark_president_positions(df: pd.DataFrame, max_rows: int = 4):
    """
    Mark one row per institution as 'President' in the 'FixedPosition' column.

    For each "Institution" group (in order of first appearance) the first
    `max_rows` rows are scanned top to bottom. The first row whose "Position"
    contains one of the module-level `president_words` and no word starting
    with "vice" is marked. Institutions whose match was not on the first row,
    and institutions with no match at all within `max_rows` rows, are printed
    to stdout together with the positions seen.

    Parameters:
      df        -- DataFrame with at least "Institution" and "Position" columns.
                   It is modified in place: 'FixedPosition' is added (or cast
                   to object dtype if it already exists).
      max_rows  -- how many leading rows of each institution to inspect
                   (default 4: the first row plus up to 3 more).

    Returns:
      df               -- the same DataFrame object with FixedPosition (object dtype) filled in
      president_rows   -- copy of the rows marked as President

    Raises:
      ValueError -- if max_rows is smaller than 1.
    """
    if max_rows < 1:
        raise ValueError("max_rows must be at least 1")

    # ensure column exists as object dtype
    if "FixedPosition" not in df.columns:
        df["FixedPosition"] = pd.Series([None] * len(df), index=df.index, dtype="object")
    else:
        df["FixedPosition"] = df["FixedPosition"].astype("object")

    # Match "vice" only at the start of a word: a plain substring test would
    # also reject titles such as "President, Health Services" ("ser-vice-s").
    vice_pattern = re.compile(r"\bvice")

    def is_true_president(pos):
        pos = str(pos).lower()
        return any(p in pos for p in president_words) and not vice_pattern.search(pos)

    president_indices = []
    found_late = []    # [(institution, idx, pos), ...]
    not_found = []     # [(institution, [pos1, pos2, ...]), ...]

    for inst, group in df.groupby("Institution", sort=False):
        candidates = group["Position"].iloc[:max_rows]
        for offset, (idx, pos) in enumerate(candidates.items()):
            if is_true_president(pos):
                president_indices.append(idx)
                if offset > 0:
                    # match was not on the institution's first row
                    found_late.append((inst, idx, pos))
                break
        else:
            # loop finished without a break: no president in the inspected rows
            not_found.append((inst, [str(p).strip() for p in candidates]))

    # Mark the identified rows (no dtype warning now)
    df.loc[president_indices, "FixedPosition"] = "President"
    president_rows = df.loc[president_indices].copy()

    # Print summary
    if found_late:
        print(f"\nInstitutions where president was found in rows 2–{max_rows}:")
        for inst, idx, pos in found_late:
            print(f" - {inst}: row {idx} → '{pos}'")

    if not_found:
        print(f"\nInstitutions with NO president in first {max_rows} rows and their observed positions:")
        for inst, pos_list in not_found:
            joined = ", ".join(f"'{p}'" for p in pos_list)
            print(f" - {inst}: {joined}")

    return df, president_rows


In [36]:
# For every configured year: mark the president row of each institution, then
# write the full annotated frame and the president-only subset to CSV.
cleaned_dir = os.path.join(data_path, 'cleaned_dataframes')
presidents_dir = os.path.join(cleaned_dir, 'presidents')
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(presidents_dir, exist_ok=True)

for year in years:
    print(f"\n\nProcessing: {year}")
    df = pd.read_csv(os.path.join(cleaned_dir, 'split_positions', f'{year}_split_positions.csv'))
    full_df, president_df = mark_president_positions(df)
    full_df.to_csv(os.path.join(cleaned_dir, f'{year}_cleanedDataframe.csv'), index=False)
    president_df.to_csv(os.path.join(presidents_dir, f'{year}_presidents.csv'), index=False)
    



Processing: 1999

Institutions where president was found in rows 2–4:
 - Queens College: row 14322 → 'President'
 - Tennessee Board Of Regents: row 6638 → 'Chancellor'

Institutions with NO president in first 4 rows and their observed positions:
 - Institute For Advanced Study: 'Director', 'Administrative Officer', 'Administrative Officer', 'Administrative Officer'
 - State University Of New York System: 'Trustee', 'Trustee', 'Trustee', 'Chairman'
 - Texas A&M University Baylor College Of Dentistry: 'Vice President', 'Dean (Acting)', 'Vice President, Business Services (Acting)', 'Vice President. Institutional Research and Information Technology Systems'
 - Texas A&M University Galveston: 'Vice President', 'Chief Executive Officer', 'Executive Associate Vice President', 'Associate Vice President, Student Affairs'


Processing: 2000

Institutions with NO president in first 4 rows and their observed positions:
 - Institute For Advanced Study: 'Director', 'Administrative Officer', 'Admin