In [61]:
import pandas as pd
from collections import defaultdict
import networkx as nx
import os
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz

In [62]:
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"       # (Not used for saving in this version)
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
yearly_interlocks = "yearly_interlocks\\"

years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

In [63]:
def remove_non_samples(df: pd.DataFrame) -> pd.DataFrame:
    """Filter the DataFrame to include only rows where 'PrimarySample' is True."""
    return df[df['PrimarySample'] == True]

In [64]:

def remove_non_samples(df):
    # Your cleaning logic here…
    return df

def normalize(x):
    """Convert the input to a string and strip whitespace."""
    return str(x).strip()

# ======================
# --- UNION–FIND SETUP ---
# ======================
# We'll use union–find to group identifiers (Institution names and AffiliationIds)
parent = {}
appearances = {}   # Map: normalized node -> set of years it appears in.
node_sources = {}  # Map: normalized node -> set of source tags ("Institution", "AffiliationId")

def find(x):
    x = normalize(x)
    if parent[x] != x:
        parent[x] = find(parent[x])
    return parent[x]

def union(x, y):
    x = normalize(x)
    y = normalize(y)
    rootX = find(x)
    rootY = find(y)
    if rootX != rootY:
        parent[rootY] = rootX

def add_node(node, source, year):
    node = normalize(node)
    if node == "":
        return
    if node not in parent:
        parent[node] = node
    appearances.setdefault(node, set()).add(year)
    node_sources.setdefault(node, set()).add(source)

# ----------------------------
# Record associations between AffiliationId and Institution.
# For rows where both values are present, we record that this affiliation id
# is paired with the institution name.
aff_to_inst = defaultdict(set)

# ======================
# --- MERGE THE DATA ---
# ======================
# We'll store all the boards in one DataFrame.
merged_boards_list = []  # to hold DataFrames for each year

for year in years:
    file_path = f"{absolute_path}{board_dataframes}{year}_boards.csv"
    boards_df = pd.read_csv(file_path)
    boards_df = remove_non_samples(boards_df)
    boards_df['Year'] = year  # add a Year column
    merged_boards_list.append(boards_df)
    
    # Process each row for union–find
    for _, row in boards_df.iterrows():
        inst = row.get("Institution")
        aff = row.get("AffiliationId")
        
        valid_inst = pd.notna(inst) and normalize(inst) != ""
        valid_aff  = pd.notna(aff)  and normalize(aff)  != ""
        
        # If Institution is present, record it.
        if valid_inst:
            add_node(inst, "Institution", year)
        
        # For AffiliationId:
        # - If only AffiliationId is present, record it.
        # - If both are present, record both and union them.
        if valid_aff:
            if not valid_inst:
                add_node(aff, "AffiliationId", year)
            else:
                add_node(aff, "AffiliationId", year)
                aff_to_inst[normalize(aff)].add(normalize(inst))
                union(inst, aff)

# Concatenate all boards into a single DataFrame.
merged_df = pd.concat(merged_boards_list, ignore_index=True)

# ======================
# --- GROUP INTO COMPONENTS ---
# ======================
components = {}
for node in parent:
    root = find(node)
    if root not in components:
        components[root] = {"nodes": set(), "years": set(), "sources": set()}
    components[root]["nodes"].add(node)
    if node in appearances:
        components[root]["years"].update(appearances[node])
    if node in node_sources:
        components[root]["sources"].update(node_sources[node])

# ======================
# --- IDENTIFY DISCREPANCY GROUPS ---
# ======================
# A group is flagged if:
#   1) It has more than one distinct Institution name (nodes from "Institution"), or
#   2) The group appears in some year(s) with only an AffiliationId (i.e. no direct Institution).
discrepancy_groups = set()
for comp_id, data in components.items():
    # Get all nodes that came from the "Institution" column.
    inst_nodes = {node for node in data['nodes'] if "Institution" in node_sources.get(node, set())}
    # Skip groups that never had any direct Institution data.
    if not inst_nodes:
        continue
    # Get the years where a direct Institution was reported.
    direct_inst_years = set()
    for node in inst_nodes:
        direct_inst_years.update(appearances.get(node, set()))
    # If more than one Institution name appears or there are years with only an AffiliationId, flag it.
    if (len(inst_nodes) > 1) or (data['years'] != direct_inst_years):
        discrepancy_groups.add(comp_id)

# ======================
# --- ASSIGN GROUP IDs TO THE MERGED DATAFRAME ---
# ======================
def get_group_id(row):
    # Use the nonempty field to compute the group.
    if pd.notna(row.get("Institution")) and normalize(row.get("Institution")) != "":
        return find(normalize(row.get("Institution")))
    elif pd.notna(row.get("AffiliationId")) and normalize(row.get("AffiliationId")) != "":
        return find(normalize(row.get("AffiliationId")))
    else:
        return None

merged_df["GroupID"] = merged_df.apply(get_group_id, axis=1)

# ======================
# --- DETERMINE THE CANONICAL INSTITUTION NAME FOR EACH DISCREPANT GROUP ---
# ======================
# For each discrepancy group, choose the institution name that appears most frequently
# (i.e. the mode among nonempty Institution entries in that group).
group_to_canonical = {}
for group_id in discrepancy_groups:
    group_rows = merged_df[merged_df["GroupID"] == group_id]
    # Only consider rows with a nonempty Institution value.
    valid_insts = group_rows["Institution"].dropna().apply(normalize)
    valid_insts = valid_insts[valid_insts != ""]
    if not valid_insts.empty:
        canonical_name = valid_insts.value_counts().idxmax()
        group_to_canonical[group_id] = canonical_name
        print(f"Group '{group_id}': replacing with canonical Institution '{canonical_name}'")
    else:
        # In the unlikely case no row has an Institution value, leave it as is.
        group_to_canonical[group_id] = None

# ======================
# --- UPDATE THE MERGED DATAFRAME ---
# ======================
# For every row that belongs to a discrepant group, replace its Institution column
# with the canonical name computed.
def update_institution(row):
    group_id = row["GroupID"]
    if pd.notna(group_id) and group_id in group_to_canonical and group_to_canonical[group_id]:
        return group_to_canonical[group_id]
    else:
        return row.get("Institution")

merged_df["Institution"] = merged_df.apply(update_institution, axis=1)

# Optionally drop the GroupID column if you no longer need it.
merged_df = merged_df.drop(columns=["GroupID"])
merged_df = merged_df.drop(columns=["FullName"])

# ======================
# --- WRITE THE MERGED DATAFRAME TO CSV ---
# ======================
# Ensure the output folder exists.
output_folder = "merged_boards"
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "merged_boards.csv")
merged_df.to_csv(output_path, index=False)
print(f"\nMerged boards saved to: {output_path}")


Group 'Polytechnic Institute Of New York University': replacing with canonical Institution 'New York University'
Group 'Austin Pay State University': replacing with canonical Institution 'Austin Peay State University'
Group 'Loyola University Maryland': replacing with canonical Institution 'Loyola University Maryland'
Group 'St Marys University Texas': replacing with canonical Institution 'St Marys University'
Group 'State University Of New York College At Plattsburgh': replacing with canonical Institution 'State University Of New York At Plattsburgh'
Group 'University Of Illinois System': replacing with canonical Institution 'University Of Illinois System'
Group 'Arcadia University': replacing with canonical Institution 'Arcadia University'
Group 'Southeast Missouri University': replacing with canonical Institution 'Southeast Missouri State University'
Group 'Loyola University New Orleans': replacing with canonical Institution 'Loyola University New Orleans'
Group 'Montana State Unive

In [None]:
'''
Get list of interlocked names, and fuzzy match them with other board members to find potential missed interlocking. 
'''

substrings_to_remove = [
    "Rev.", "SJ", "Sister", "Brother", "Father", "OP", "The Very",
    "Sr.", "O.P.", "Very Rev.", "Br.", "Dr.", "Md.", "S.J.", "Very Rev",
    "M.D.", "O.P", "S.J", "J.R", "Jr.", "Jr ", "III", "His ", "Eminence", "Cardinal "
]

def clean_name(raw_name: str) -> str:
    """
    Clean and canonicalize a board member's name by:
      - Removing specified title substrings (case-insensitive)
      - Removing punctuation and extra whitespace
      - Converting to title case.
    """
    for title in substrings_to_remove:
        title_clean = title.strip()
        raw_name = re.sub(r'\b' + re.escape(title_clean) + r'\b', '', raw_name, flags=re.IGNORECASE)
    raw_name = re.sub(r'[^\w\s]', '', raw_name)  # remove punctuation
    cleaned_name = " ".join(raw_name.split())      # remove extra whitespace
    return cleaned_name.title()

# ---------------------------
# Main Processing: Counting Interlocks and Recording Board Members
# ---------------------------

# We'll store information on interlocked board members from every year.
# Each record will contain the Year, a canonical name, the fuzzy–matched names, 
# the union of institutions (i.e. the interlock set), and a count.
all_interlock_members = []

for year in years:
    print(f"Processing year: {year}")
    
    # Reinitialize the board membership dictionary for this year.
    # Key: cleaned board member name; Value: set of institutions where that person appears.
    board_member_dict = defaultdict(set)
    
    # ---------------------------
    # Load Board Data (assumes two files per year)
    # ---------------------------
    boards_path = f"{absolute_path}{board_dataframes}{year}_boards.csv"
    double_boards_path = f"{absolute_path}{board_dataframes}{year}_double_board.csv"
    
    boards_df = pd.read_csv(boards_path)
    double_boards_df = pd.read_csv(double_boards_path)
    
    boards_df = remove_non_samples(boards_df)
    double_boards_df = remove_non_samples(double_boards_df)
    
    # ---------------------------
    # Process rows to build board_member_dict.
    # (We ignore any row whose 'Name' contains "vacant".)
    # ---------------------------
    def process_row(row):
        if "vacant" in row['Name'].lower():
            return
        name = clean_name(row['Name'])
        institution = row['Institution']
        board_member_dict[name].add(institution)
    
    for _, row in boards_df.iterrows():
        process_row(row)
    for _, row in double_boards_df.iterrows():
        process_row(row)
    
    # ---------------------------
    # Identify board members who serve on more than one institution.
    # These are our interlock candidates.
    # ---------------------------
    interlock_candidates = {name: insts for name, insts in board_member_dict.items() if len(insts) > 1}
    print(f"  Found {len(interlock_candidates)} board members with interlocks (before fuzzy matching).")
    
    # ---------------------------
    # Fuzzy Matching to Merge Names That Might Represent the Same Person
    # ---------------------------
    # For example, if "John A. Smith" and "John Smith" both appear, we may want to treat them as the same person.
    fuzzy_threshold = 90  # Adjust as needed (0-100)
    names_list = list(interlock_candidates.keys())
    merged_members = {}  # Mapping: canonical name -> dict with keys: 'Fuzzy_Matches' and 'Institutions'
    visited = set()
    
    for i, name in enumerate(names_list):
        if name in visited:
            continue
        canonical = name
        fuzzy_matches = {name}  # start with itself
        union_insts = set(interlock_candidates[name])
        # Compare with later names.
        for other in names_list[i+1:]:
            if other in visited:
                continue
            score = fuzz.token_set_ratio(name, other)
            if score >= fuzzy_threshold:
                fuzzy_matches.add(other)
                union_insts.update(interlock_candidates[other])
                visited.add(other)
        merged_members[canonical] = {
            'Fuzzy_Matches': list(fuzzy_matches),
            'Institutions': list(union_insts),
            'Interlock_Count': len(union_insts)
        }
        visited.add(name)
    
    # ---------------------------
    # Record each merged board member as an interlock record.
    # ---------------------------
    for canonical, info in merged_members.items():
        member_record = {
            'Year': year,
            'Canonical_Name': canonical,
            'Fuzzy_Matches': info['Fuzzy_Matches'],
            'Institutions': info['Institutions'],
            'Interlock_Count': info['Interlock_Count']
        }
        all_interlock_members.append(member_record)

# ---------------------------
# Create a DataFrame of All Interlock Board Members
# ---------------------------
interlock_members_df = pd.DataFrame(all_interlock_members)
print("\nInterlock Members DataFrame (first few rows):")
print(interlock_members_df.head())

# ---------------------------
# Write the DataFrame to CSV
# ---------------------------
output_folder = "interlock_results"
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "interlock_members.csv")
interlock_members_df.to_csv(output_path, index=False)
print(f"\nInterlock members saved to: {output_path}")


Processing year: 1999


KeyboardInterrupt: 

In [66]:
# '''
# determine what universities are reported for each year
# '''
# institutions_by_year = {}
# for year in years:
#     boards_df = pd.read_csv(f"{absolute_path}{board_dataframes}{year}_boards.csv")
#     boards_df = remove_non_samples(boards_df)

#     institutions_by_year[year] = set(boards_df['Institution'].unique())
# common_institutions = set.intersection(*institutions_by_year.values())

# # Build a mapping of each institution to the list of years it appears in.
# institution_years = {}
# for year, inst_set in institutions_by_year.items():
#     for institution in inst_set:
#         institution_years.setdefault(institution, []).append(year)

# # Identify institutions that do not appear in every year.
# # (i.e. their years list is not equal to the full list of years)
# non_common_institutions = {
#     institution: yrs 
#     for institution, yrs in institution_years.items() 
#     if set(yrs) != set(years)
# }

# # Print the results.
# print("Institutions that appear in every year:")
# print(len(common_institutions),", ", common_institutions)

# print("\nInstitutions that do not appear in every year (with the years they appear):")
# for institution, yrs in non_common_institutions.items():
#     print(f"{institution}: {yrs}")
