In [1]:
import pandas as pd
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter, defaultdict
from matplotlib.lines import Line2D
from collections import Counter
import networkx as nx


In [2]:
path_connected_data = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\dataframes\\"
path_temp_data = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\temporaryData\\"
path_pos_disambig = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\temporaryData\\position_disambiguation\\"

valid_years = ["1999", "2000", "2005", "2008", "2009", "2013"]
year = 2009
path_read = f"{path_pos_disambig}{year}_split_positions.csv"
boards_path = f"{path_temp_data}{year}_boards_region.csv"
interlocked_nodes_disjoint_path = f"{path_temp_data}interlocked_nodes_disjoint.csv"
interlocked_edges_disjoint_path = f"{path_temp_data}interlocked_edges_disjoint.csv"

interlocked_nodes_continuous_path = f"{path_temp_data}interlocked_nodes_continuous.csv"
interlocked_edges_continuous_path = f"{path_temp_data}interlocked_edges_continuous.csv"

interlocked_nodes_sample_path = f"{path_temp_data}interlocked_nodes_sample.csv"
interlocked_edges_sample_path = f"{path_temp_data}interlocked_edges_sample.csv"

In [3]:
board_member_dict = defaultdict(set)
edges_list = []
nodes_dict = defaultdict(lambda: {'Interlock_Count': 0, 'Region': None})

# Iterate through each year
for year in valid_years:
    boards_path = f"{path_pos_disambig}{year}_boards_region.csv"
    boards_df = pd.read_csv(boards_path)

    # Iterate over each board member
    for index, row in boards_df.iterrows():
        name = row['Name']
        institution = row['Institution']
        region = row['Region']  # Assuming 'Region' column exists in boards_df

        # If this board member has been seen before in a different institution, record an interlock
        for previous_institution in board_member_dict[name]:
            if previous_institution != institution:
                # Record the interlock as an edge
                edges_list.append({
                    'Source': previous_institution,
                    'Target': institution,
                    'Type': 'Undirected',
                    'Weight': 1  # Each interlock counts as 1 by default
                })
                # Increment the interlock count for the involved institutions
                nodes_dict[previous_institution]['Interlock_Count'] += 1
                nodes_dict[institution]['Interlock_Count'] += 1

        # Add the current institution to the set of institutions this member is associated with
        board_member_dict[name].add(institution)

        # Ensure the Region is recorded for each institution
        nodes_dict[institution]['Region'] = region

# Create a DataFrame for nodes (universities) with their interlock counts and regions
nodes_df = pd.DataFrame([(key, value['Interlock_Count'], value['Region']) for key, value in nodes_dict.items()], 
                        columns=['Id', 'Interlock_Count', 'Region'])
nodes_df['Label'] = nodes_df['Id']  # Use the institution name as the label

# Ensure correct column order and uniqueness
nodes_df = nodes_df[['Id', 'Label', 'Interlock_Count', 'Region']]

# Create a DataFrame for edges (interlocks between institutions)
edges_df = pd.DataFrame(edges_list)

# Ensure correct column order for edges
edges_df = edges_df[['Source', 'Target', 'Type', 'Weight']]

# Save the DataFrames to CSV files
nodes_df.to_csv(interlocked_nodes_disjoint_path, index=False)
edges_df.to_csv(interlocked_edges_disjoint_path, index=False)

In [7]:
G = nx.Graph()

for _, row in edges_df.iterrows():
    source_region = nodes_dict[row['Source']]['Region']
    target_region = nodes_dict[row['Target']]['Region']
    
    if pd.notna(source_region) and pd.notna(target_region) and source_region != "" and target_region != "":
        G.add_edge(row['Source'], row['Target'], weight=row['Weight'])

region_mapping = {k: v['Region'] for k, v in nodes_dict.items() if pd.notna(v['Region']) and v['Region'] != ""}
nx.set_node_attributes(G, region_mapping, 'Region')

def region_similarity(n1, n2):
    return G.nodes[n1]['Region'] == G.nodes[n2]['Region']

assortativity_coefficient = nx.attribute_assortativity_coefficient(G, 'Region')

print("Assortativity Coefficient based on Region (SAMPLE ONLY):", assortativity_coefficient)

Assortativity Coefficient based on Region: 0.2832630650063732


In [5]:
edges_list = []
nodes_dict = defaultdict(lambda: {'Interlock_Count': 0, 'Region': None})

# Iterate through each year
for year in valid_years:
    boards_path = f"{path_pos_disambig}{year}_boards_region.csv"
    boards_df = pd.read_csv(boards_path)

    # Group by 'Name' to find individuals who served on multiple boards in the same year
    grouped = boards_df.groupby('Name')
    
    for name, group in grouped:
        # Get a list of all unique institutions this individual is part of within the year
        institutions = group['Institution'].unique().tolist()
        regions = group['Region'].unique().tolist()  # Assuming 'Region' exists in boards_df
        
        # Check if the individual serves on more than one institution within the same year
        if len(institutions) > 1:
            # Record the interlock between institutions
            for i in range(len(institutions)):
                for j in range(i + 1, len(institutions)):
                    edges_list.append({
                        'Source': institutions[i],
                        'Target': institutions[j],
                        'Type': 'Undirected',
                        'Weight': 1  # Each interlock counts as 1
                    })
                    # Increment the interlock count for the involved institutions
                    nodes_dict[institutions[i]]['Interlock_Count'] += 1
                    nodes_dict[institutions[j]]['Interlock_Count'] += 1

                    # Add the region for each institution if it's available
                    if len(regions) > i:
                        nodes_dict[institutions[i]]['Region'] = regions[i]
                    if len(regions) > j:
                        nodes_dict[institutions[j]]['Region'] = regions[j]

# Create a DataFrame for nodes (universities) with their interlock counts and regions
nodes_df = pd.DataFrame([(key, value['Interlock_Count'], value['Region']) for key, value in nodes_dict.items()], 
                        columns=['Id', 'Interlock_Count', 'Region'])
nodes_df['Label'] = nodes_df['Id']  # Use the institution name as the label

# Create a DataFrame for edges (interlocks between institutions)
edges_df = pd.DataFrame(edges_list)

# Save the DataFrames to CSV files
nodes_df.to_csv(interlocked_nodes_continuous_path, index=False)
edges_df.to_csv(interlocked_edges_continuous_path, index=False)

In [6]:
#Same thing as above cell, but remove non sample universities

edges_list = []
nodes_dict = defaultdict(lambda: {'Interlock_Count': 0, 'Region': None})

# Iterate through each year
for year in valid_years:
    boards_path = f"{path_pos_disambig}{year}_boards_region.csv"
    boards_df = pd.read_csv(boards_path)

    # Filter out rows where 'Region' is NaN or an empty string
    boards_df = boards_df[boards_df['Region'].notna() & (boards_df['Region'] != "")]

    # Group by 'Name' to find individuals who served on multiple boards in the same year
    grouped = boards_df.groupby('Name')
    
    for name, group in grouped:
        # Get a list of all unique institutions this individual is part of within the year
        institutions = group['Institution'].unique().tolist()
        regions = group['Region'].unique().tolist()  # Assuming 'Region' exists in boards_df
        
        # Check if the individual serves on more than one institution within the same year
        if len(institutions) > 1:
            # Record the interlock between institutions
            for i in range(len(institutions)):
                for j in range(i + 1, len(institutions)):
                    edges_list.append({
                        'Source': institutions[i],
                        'Target': institutions[j],
                        'Type': 'Undirected',
                        'Weight': 1  # Each interlock counts as 1
                    })
                    # Increment the interlock count for the involved institutions
                    nodes_dict[institutions[i]]['Interlock_Count'] += 1
                    nodes_dict[institutions[j]]['Interlock_Count'] += 1

                    # Add the region for each institution if it's available
                    if len(regions) > i:
                        nodes_dict[institutions[i]]['Region'] = regions[i]
                    if len(regions) > j:
                        nodes_dict[institutions[j]]['Region'] = regions[j]

# Create a DataFrame for nodes (universities) with their interlock counts and regions
nodes_df = pd.DataFrame([(key, value['Interlock_Count'], value['Region']) for key, value in nodes_dict.items()], 
                        columns=['Id', 'Interlock_Count', 'Region'])

# Filter out rows in nodes_df where 'Region' is None, NaN, or an empty string
nodes_df = nodes_df[nodes_df['Region'].notna() & (nodes_df['Region'] != "")]

nodes_df['Label'] = nodes_df['Id']  # Use the institution name as the label

# Create a DataFrame for edges (interlocks between institutions)
edges_df = pd.DataFrame(edges_list)


# Save the DataFrames to CSV files
nodes_df.to_csv(interlocked_nodes_sample_path, index=False)
edges_df.to_csv(interlocked_edges_sample_path, index=False)