In [5]:
import pandas as pd
import os

In [6]:
# Years to check. The processing loop reads '<year>_split_positions.csv' from
# data_path for every entry listed here.
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2010", "2011", "2013", "2018"]

# Directory holding the per-year CSV files. Falls back to the original
# hard-coded Windows location; set the SPLIT_POSITIONS_DIR environment variable
# to run the notebook against a different checkout without editing this cell.
data_path = os.environ.get(
    "SPLIT_POSITIONS_DIR",
    r'C:\Projects\connecteddatahub\data\cleaned_dataframes\split_positions',
)

In [7]:
def mark_non_contiguous(
    df: pd.DataFrame,
    column: str = "Institution",
    verbose: bool = True,
) -> list:
    """
    Identify groups whose rows in the DataFrame are not in a single contiguous
    block of indices. For each such group, report the first gap detected.

    The check works on index *labels*: a group's labels are sorted and a gap is
    any pair of successive labels that does not differ by exactly 1. It is
    therefore meant for frames with an integer index such as the default
    RangeIndex produced by ``pd.read_csv``. Rows whose ``column`` value is
    missing are not checked, because ``groupby`` drops NaN keys by default.

    Args:
      df: frame to inspect; must contain ``column``.
      column: name of the column that defines the groups (default
        ``"Institution"``).
      verbose: when True (default) print a summary of the findings; when False
        return the result silently.

    Prints (only when ``verbose`` is True):
      - Group name
      - The pair of indices where the jump occurs (prev_index → next_index)

    Returns:
      List[Tuple[str, int, int]]: a list of (group, prev_idx, next_idx) for each
      flagged group, in order of the group's first appearance in ``df``.

    Raises:
      KeyError: if ``column`` is not present in ``df`` (raised by ``groupby``).
    """
    flagged = []

    # sort=False keeps groups in order of first appearance instead of sorting keys
    for inst, group in df.groupby(column, sort=False):
        idxs = sorted(group.index)
        # first pair of successive labels that is not exactly one apart, if any;
        # next() stops at the first hit so only one gap per group is reported
        first_gap = next(
            (
                (prev_idx, next_idx)
                for prev_idx, next_idx in zip(idxs, idxs[1:])
                if next_idx != prev_idx + 1
            ),
            None,
        )
        if first_gap is not None:
            flagged.append((inst, first_gap[0], first_gap[1]))

    if verbose:
        if flagged:
            print("\nNon-contiguous groups detected:")
            for inst, prev_idx, next_idx in flagged:
                print(f" - {inst}: gap between index {prev_idx} → {next_idx}")
        else:
            # keep the original wording for the default column
            label = "institution" if column == "Institution" else column
            print(f"\nAll {label} groups have contiguous indices.")

    return flagged


In [8]:
# Run the contiguity check on every year's split-positions CSV.
for year in years:
    print(f"Processing: {year}")
    csv_name = f'{year}_split_positions.csv'
    gpt_df = pd.read_csv(os.path.join(data_path, csv_name))
    # The function prints its own report; the returned list is not kept here.
    mark_non_contiguous(gpt_df)

Processing: 1999

Non-contiguous groups detected:
 - California State University System: gap between index 2524 → 3059
 - City University Of New York: gap between index 4033 → 4301
 - Queens College: gap between index 4260 → 14322
 - College Of Charleston: gap between index 4844 → 4866
 - College Of Mount St Joseph: gap between index 4865 → 4867
 - Depaul University: gap between index 6018 → 6034
 - Tennessee Board Of Regents: gap between index 6645 → 11501
 - Harvard University: gap between index 8272 → 8291
 - Louisiana State University System: gap between index 10304 → 10419
 - Loyola University: gap between index 10638 → 10645
 - Loyola University Chicago: gap between index 10644 → 10698
 - University Of Montana Missoula: gap between index 11735 → 11747
 - North Dakota University System: gap between index 12354 → 12361
 - University Of Oregon: gap between index 13242 → 13247
 - Pennsylvania State System Of Higher Education: gap between index 13405 → 13706
 - Shippensburg University