In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from matplotlib.lines import Line2D
from nameparser import HumanName
import gender_guesser.detector as gender
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import ethnicolr
from ethnicolr import pred_census_ln
from itertools import combinations
from IPython.display import display




In [2]:
# File Paths
# Project root on the author's Windows machine. Every backslash is written as
# "\\" so the literal contains no invalid escape sequences and the resulting
# path uses single separators throughout.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to change it here.
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-directory names; each keeps its trailing separator because the paths
# below (and later file names) are built by plain string concatenation.
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"

# Full directory prefixes (project root + sub-directory)
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"

# Valid Years
# Kept as strings: they are interpolated into file names and stored as-is.
years = ["1999", "2000", "2005", "2008", "2009", "2013"]

# Created Files
diversity_statistics_path = f"{altered_dataframe_path}diversity_statistics.csv"

In [3]:
def calculate_similarity(set1, set2):
    """
    Calculates the Jaccard index of two sets, expressed as a percentage.

    Args:
        set1 (set): The first set of names.
        set2 (set): The second set of names.

    Returns:
        float: 100 * |intersection| / |union|, between 0.0 and 100.0.
            Returns 0.0 when both sets are empty (the union is empty).
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Avoid dividing by zero; return a float so the result type is the
        # same on every path.
        return 0.0
    shared = len(set1.intersection(set2))
    return (shared / union_size) * 100

def find_similar_boards(years, similarity_threshold=50):
    """
    Identifies pairs of institutions with board similarity >= the given threshold.

    For each year this reads ``{boards_path}{year}_boards.csv``, collects the
    set of ``Name`` values per ``Institution``, and scores every unordered pair
    of institutions with ``calculate_similarity``.

    Note:
        ``boards_path`` is not a parameter; the function reads the
        module-level directory prefix of that name.

    Args:
        years (list): Years to process; each value is interpolated into the
            CSV file name and copied into the 'Year' column.
        similarity_threshold (float): The minimum similarity percentage to consider.

    Returns:
        pd.DataFrame: One row per qualifying pair with the columns 'Year',
            'Institution 1', 'Institution 2' and 'Similarity (%)', sorted by
            year ascending and then similarity descending. If no pair
            qualifies (or ``years`` is empty) the frame is empty but still
            carries these columns.
    """
    result_columns = ['Year', 'Institution 1', 'Institution 2', 'Similarity (%)']
    similar_institutions = []

    for year in years:
        # Load the data for the current year
        boards_df = pd.read_csv(f"{boards_path}{year}_boards.csv")

        # One set of unique names per institution, indexed by institution
        names_by_institution = boards_df.groupby('Institution')['Name'].apply(set)

        # Series.items() yields (institution, name_set); combinations() then
        # visits each unordered pair of institutions exactly once.
        institution_pairs = combinations(names_by_institution.items(), 2)
        for (institution1, names_set1), (institution2, names_set2) in institution_pairs:
            similarity = calculate_similarity(names_set1, names_set2)

            # Keep only the pairs that meet or exceed the threshold
            if similarity >= similarity_threshold:
                similar_institutions.append({
                    'Year': year,
                    'Institution 1': institution1,
                    'Institution 2': institution2,
                    'Similarity (%)': similarity
                })

    # Passing the columns explicitly keeps the sort below valid when no pair
    # qualified; a frame built from an empty list alone has no 'Year' column.
    similar_df = pd.DataFrame(similar_institutions, columns=result_columns)
    similar_df = similar_df.sort_values(by=['Year', 'Similarity (%)'], ascending=[True, False])

    return similar_df

In [7]:
# Load the university board statistics table from the altered-dataframes directory.
university_board_statistics = pd.read_csv(altered_dataframe_path + "university_board_statistics.csv")

# Score every institution pair in each configured year, using the default threshold.
similar_boards = find_similar_boards(years)

In [9]:
# Render the institution pairs returned by find_similar_boards as a rich table.
display(similar_boards)

Unnamed: 0,Year,Institution 1,Institution 2,Similarity (%)
8,1999,Iowa State University,University Of Iowa,100.000000
9,1999,Iowa State University,University Of Northern Iowa,100.000000
12,1999,Kent State University,Ohio University,100.000000
13,1999,Kent State University,University Of Akron,100.000000
18,1999,Ohio University,University Of Akron,100.000000
...,...,...,...,...
138,2009,East Tennessee State University,Middle Tennessee State University,66.666667
137,2009,Birmingham Southern College,University Of Idaho,63.636364
144,2009,Idaho State University,University Of Idaho,63.636364
158,2009,Tennessee Technological University,University Of Memphis,57.894737
