In [19]:
# Lesson 1 - Conditioning a Beleif Network

# (I'm not gonna make markdown cells this time because they fuck up cursor!!!!)

########################################################

# In this lesson, we will learn how to condition a network on a variable.

# In plain English, this means we will generate the belief network of all people who hold a certain belief (or lack thereof).

# The archetypical example from our discussions is to generate the belief network of self-proclaimed liberals and compare it to the belief network of self-proclaimed conservatives. 

# This amounts to conditioning the network on the variables 'POLVIEWS' with liberals being those <0 and conservatives being those >0.

# First, we import and clean the data.

In [1]:
from datasets.import_gss import import_dataset
from datasets.clean_raw_data import clean_datasets
from source_code.generators.corr_make_network import calculate_correlation_matrix, CorrelationMethod, EdgeSuppressionMethod
from source_code.generators.corr_make_conditioned_network import calculate_conditioned_correlation_matrix
from source_code.visualizers.network_visualizer import generate_html_visualization
from source_code.analyzers.graph_similarity import graph_similarity

In [5]:
df, _ = import_dataset()
cleaned_df = clean_datasets()

Loading dataset from from cache...
Done! ✨
Loading dataset from from cache...
Done! ✨


In [6]:
## Next.. selecting a variable to condition on 🐈

#Now we can choose a variable to condition on. 

# To do this, we simply refer to the corr_make_conditioned_network.py file and make use of the function calculate_conditioned_correlation_matrix().

# This function accecpts the usual paramets (as in calculate_correlation_matrix()) plus some extra ones: 

# 1. variable_to_condition  (string telling us which variable to condition on)
# 2. condition  (string telling us what condition to apply to the variable - either 'equal_to', 'less_than_zero', 'greater_than_zero')
# 3. value (the value of the variable to condition on - only used if condition is 'equal_to')
# 4. return_df (boolean) - whether to return (filtered dataframe, conditioned correlation matrix) or just the conditioned correlation matrix

#To understand what less_than_zero and greater_than_zero actually correspond to, simply control-F the variable name in clean_raw_data.py and search up the variable in the [GSS data explorer](https://gssdataexplorer.norc.org/variables/vfilter).

var_to_condition = 'POLVIEWS'
years_of_interest = [2018, 2020, 2022]

conditioned_corr_matrix = calculate_conditioned_correlation_matrix(
    cleaned_df, 
    years_of_interest=years_of_interest,
    method=CorrelationMethod.PEARSON,
    partial=True,
    edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
    suppression_params={'regularization': 0.18},

    variable_to_condition=var_to_condition, 
    condition='less_than_zero',

    verbose=True
)




CONDITIONING INFORMATION
Conditioning variable: POLVIEWS
Condition: less_than_zero
Filtered samples count: 17604 of 72390 (24.3%)
--------------------------------------------------

CORRELATION NETWORK STATISTICS
Variables filtered out (metadata): 3
Variables included in analysis: 133
Total number of samples: 1734
--------------------------------------------------
Variables removed due to NaN correlations: 16
Remaining variables after NaN filtering: 117
Removed variables (first 10): NATDRUGY, MARHOMO, NATFAREY, NATHEALY, NATARMSY, NATCITYY, NATAIDY, NATSPACY, NATCRIMY, NATEDUCY...
--------------------------------------------------
Sample size statistics for correlations:
  Mean: 7106.1
  Min: 823
  Max: 17604


In [4]:

# We can now plot this network and see what it looks like. 

# You are looking at the network of liberal-leaning individuals: the ideas that they associate with each other.

generate_html_visualization(
    conditioned_corr_matrix,
    output_path='delete_this_file.html'
)

                 FAIR  NATMASS  POLHITOK   COLHOMO  POLVIEWS  ABHLTH  \
FAIR              0.0     -0.0 -0.000000 -0.000000       0.0    -0.0   
NATMASS          -0.0      0.0 -0.000000  0.000000      -0.0     0.0   
POLHITOK         -0.0     -0.0  0.000000  0.018933       0.0     0.0   
COLHOMO          -0.0      0.0  0.018933  0.000000      -0.0     0.0   
POLVIEWS          0.0     -0.0  0.000000 -0.000000       0.0     0.0   
...               ...      ...       ...       ...       ...     ...   
RELIG_Other      -0.0      0.0  0.000000  0.000000      -0.0     0.0   
RELIG_Buddhism   -0.0      0.0 -0.000000  0.000000      -0.0     0.0   
RELIG_Hinduism   -0.0      0.0 -0.000000 -0.000000       0.0     0.0   
RELIG_Muslim      0.0      0.0 -0.000000  0.000000       0.0     0.0   
RELIG_Christian   0.0     -0.0  0.000000  0.000000       0.0     0.0   

                 SUICIDE2  NATDRUGY  DIVLAW  CONFED  ...  DIDVOTELAST  \
FAIR                 -0.0 -0.000000    -0.0     0.0  ...    -0

In [1]:
## Further analysis (graph edit distance) 😹

# We can do some basic analysis now.

# A natural question is: how different are the networks of liberals and conservatives?

# To answer this question, we can calculate some sort of graph distance metric the network of liberals and the network of conservatives.

# In this case, we will use the so-called Graph Edit Distance -- the number of edits (deletions, insertions) needed to transform one network into the other.

########################################################

#First, we'll calculate the two networks.

var_to_condition = 'POLVIEWS'
years_of_interest = [1998, 2000, 2002, 2004]


liberal_corr_matrix = calculate_conditioned_correlation_matrix(
    cleaned_df, 
    years_of_interest=years_of_interest,
    method=CorrelationMethod.PEARSON,   
    partial=True,
    edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
    suppression_params={'regularization': 0.18},

    variable_to_condition=var_to_condition, 
    condition='less_than_zero',

    return_sample_sizes=True
)




conservative_corr_matrix = calculate_conditioned_correlation_matrix(
    cleaned_df, 
    years_of_interest=years_of_interest,
    method=CorrelationMethod.PEARSON,
    partial=True,
    edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
    suppression_params={'regularization': 0.18},

    variable_to_condition=var_to_condition, 
    condition='greater_than_zero',

    return_sample_sizes=True
)

# Now we can call the graph_similarity function to calculate the graph edit distance between the two networks.

# The threshold parameter simply determines the correlation strength that is sufficient to be considered an edge (0 means all edges are considered and 1 means no edges are considered).

# If the similarity score is 0, the networks are identical.

# If the similarity score is, say, 250, then 250 edge insertions/deletions are needed to transform one network into the other.

if liberal_corr_matrix is not None and conservative_corr_matrix is not None:
    graph_edit_distance = graph_similarity(liberal_corr_matrix, conservative_corr_matrix, 
                                        similarity_method="graph_edit_distance",
                                        edge_threshold=0.01
                                        )
    print(graph_edit_distance)

NameError: name 'calculate_conditioned_correlation_matrix' is not defined

In [None]:
import numpy as np

# In this section, we're comparing the correlation networks between liberal and conservative 
# respondents to identify key differences in their belief structures. This analysis helps us 
# understand how political ideology influences the relationships between different attitudes
# and opinions.
#
# We'll perform the following steps:
# 1. Align the correlation matrices to ensure direct comparability
# 2. Calculate the absolute differences between corresponding correlations
# 3. Identify the variable pairs with the largest differences between networks
# 4. Visualize and interpret these differences to understand ideological divides

# Align the matrices to ensure they have identical structure for comparison
# First, align the columns of both matrices
liberal_corr_matrix = liberal_corr_matrix.reindex(conservative_corr_matrix.columns)
conservative_corr_matrix = conservative_corr_matrix.reindex(liberal_corr_matrix.columns)

# Then, align the rows of both matrices
liberal_corr_matrix = liberal_corr_matrix.reindex(conservative_corr_matrix.index)
conservative_corr_matrix = conservative_corr_matrix.reindex(liberal_corr_matrix.index)

# Calculate the absolute difference between the two correlation matrices
# This shows how much the correlation strengths differ between liberal and conservative networks
difference = abs(liberal_corr_matrix - conservative_corr_matrix)

# Replace NaN values with 0 in the difference matrix
# This handles cases where a correlation exists in one network but not the other
difference = difference.where(~difference.isna(), 0)

# Display a preview of the difference matrix (top-left 5×5 section)
# This gives us a quick look at some of the differences between networks
print("Difference:")
print("\n", difference.iloc[:5, :5])

# Find the variable pairs with the largest differences between networks
print("\nTop 5 largest absolute values (upper triangle only):")

# Create a mask for the upper triangle of the matrix (k=1 excludes the diagonal)
# Since correlation matrices are symmetric, we only need to examine half
mask = np.triu(np.ones(difference.shape), k=1).astype(bool)

# Apply the mask to get only the upper triangle values
difference_upper = difference.where(mask)

# Convert to a Series, remove NaN values, and find the top 5 largest differences
# These represent the relationships that differ most between liberal and conservative networks
difference_flat = difference_upper.abs().stack().dropna()
print("\n", difference_flat.sort_values(ascending=False).head(5))

Difference:

              ABANY  ABDEFECT  ABHLTH  ABNOMORE  ABPOOR
POLABUSE  0.000000       0.0     0.0       0.0     0.0
POLMURDR  0.000000       0.0     0.0       0.0     0.0
CONCLERG  0.035218       0.0     0.0       0.0     0.0
SPKMIL    0.000000       0.0     0.0       0.0     0.0
HELPPOOR  0.000000       0.0     0.0       0.0     0.0

Top 5 largest absolute values (upper triangle only):

 PRESLAST_DEMREP      WOULDVOTELAST_NONCONFORM    0.188711
RELIG_Catholic       RELIG_Protestant            0.188631
PRESLAST_NONCONFORM  WOULDVOTELAST_NONCONFORM    0.157105
AFFRMACT             WRKWAYUP                    0.148075
HOMOSEX              PRAYER                      0.138932
dtype: float64


In [63]:
# The third and fourth biggest differences are:
# AFFRMACT, WRKWAYUP 0.148075
# PRAYER, HOMOSEX 0.138957
# Print the strengths of these two edges in the networks
print("\nEdge strengths for AFFRMACT-WRKWAYUP:")
print(f"Liberal network: {liberal_corr_matrix.loc['AFFRMACT', 'WRKWAYUP']:.6f}")
print(f"Conservative network: {conservative_corr_matrix.loc['AFFRMACT', 'WRKWAYUP']:.6f}")

print("\nEdge strengths for PRAYER-HOMOSEX:")
print(f"Liberal network: {liberal_corr_matrix.loc['PRAYER', 'HOMOSEX']:.6f}")
print(f"Conservative network: {conservative_corr_matrix.loc['PRAYER', 'HOMOSEX']:.6f}")


Edge strengths for AFFRMACT-WRKWAYUP:
Liberal network: -0.173209
Conservative network: -0.025133

Edge strengths for PRAYER-HOMOSEX:
Liberal network: -0.156688
Conservative network: -0.017731


In [18]:
# We can also identify edges that are present in one network but not the other.
# First ensure the two matrices have the same column and row order
liberal_corr_matrix = liberal_corr_matrix.reindex(conservative_corr_matrix.columns)
conservative_corr_matrix = conservative_corr_matrix.reindex(liberal_corr_matrix.columns)
liberal_corr_matrix = liberal_corr_matrix.reindex(conservative_corr_matrix.index)
conservative_corr_matrix = conservative_corr_matrix.reindex(liberal_corr_matrix.index)

threshold = 0.18

variables_list = liberal_corr_matrix.columns.tolist()

for variable in variables_list:
    for other_variable in variables_list:
        if variable != other_variable:
            if liberal_corr_matrix.loc[variable, other_variable] > threshold and (conservative_corr_matrix.loc[variable, other_variable] == 0 or np.isnan(conservative_corr_matrix.loc[variable, other_variable])):
                print(f"{variable} --- {other_variable}, is {liberal_corr_matrix.loc[variable, other_variable]:.6f} in the liberal network and {conservative_corr_matrix.loc[variable, other_variable]:.6f} in the conservative network.")

for variable in variables_list:
    for other_variable in variables_list:
        if variable != other_variable:
            if conservative_corr_matrix.loc[variable, other_variable] > threshold and (liberal_corr_matrix.loc[variable, other_variable] == 0 or np.isnan(liberal_corr_matrix.loc[variable, other_variable])):
                print(f"{variable} --- {other_variable}, is {conservative_corr_matrix.loc[variable, other_variable]:.6f} in the conservative network and {liberal_corr_matrix.loc[variable, other_variable]:.6f} in the liberal network.")


PRESLAST_DEMREP --- WOULDVOTELAST_NONCONFORM, is 0.188711 in the liberal network and 0.000000 in the conservative network.
WOULDVOTELAST_NONCONFORM --- PRESLAST_DEMREP, is 0.188711 in the liberal network and 0.000000 in the conservative network.
PARTYID --- WOULDVOTELAST_DEMREP, is 0.213058 in the conservative network and nan in the liberal network.
PRESLAST_DEMREP --- WOULDVOTELAST_DEMREP, is 0.289686 in the conservative network and nan in the liberal network.
WOULDVOTELAST_DEMREP --- PARTYID, is 0.213058 in the conservative network and nan in the liberal network.
WOULDVOTELAST_DEMREP --- PRESLAST_DEMREP, is 0.289686 in the conservative network and nan in the liberal network.
NATDRUG --- NATCRIME, is 0.215521 in the conservative network and nan in the liberal network.
NATCRIME --- NATDRUG, is 0.215521 in the conservative network and nan in the liberal network.


In [47]:
# let's now calculate the node-level centralities of both networks

import networkx as nx
import pandas as pd
import numpy as np

# Convert correlation matrices to networkx graphs
# Assuming liberal_corr and conservative_corr are your correlation matrices
def create_graph(corr_matrix):
    G = nx.from_numpy_array(np.abs(corr_matrix.values))
    G = nx.relabel_nodes(G, {i: name for i, name in enumerate(corr_matrix.columns)})
    return G

# Create graphs
liberal_graph = create_graph(liberal_corr_matrix)
conservative_graph = create_graph(conservative_corr_matrix)

# Calculate centrality measures for both networks
def calculate_centralities(G, network_name):
    # Calculate all centrality measures
    betweenness_cent = nx.betweenness_centrality(G)
    eigenvector_cent = nx.eigenvector_centrality(G, max_iter=1000)
    
    # Calculate absolute summed edge strength (weighted degree)
    strength = {node: sum(abs(data.get('weight', 0)) if not np.isnan(data.get('weight', 0)) else 0 
                for _, _, data in G.edges(node, data=True))
                for node in G.nodes()}
    
    # Combine all measures into a DataFrame
    centralities = pd.DataFrame({
        'Strength': strength,
        'Betweenness': betweenness_cent,
        'Eigenvector': eigenvector_cent
    })
    
    return centralities

# Get centrality measures for both networks
liberal_centralities = calculate_centralities(liberal_graph, "Liberal")
conservative_centralities = calculate_centralities(conservative_graph, "Conservative")

# Display top 5 nodes by eigenvector centrality for each network
print("Top 5 central nodes in Liberal network:")
print(liberal_centralities.sort_values('Betweenness', ascending=False).head(5))

print("\nTop 5 central nodes in Conservative network:")
print(conservative_centralities.sort_values('Betweenness', ascending=False).head(5))

# Calculate differences in each centrality type between networks

# Calculate the differences in betweenness centrality between networks
betweenness_diff = pd.DataFrame({
    'Liberal': liberal_centralities['Betweenness'],
    'Conservative': conservative_centralities['Betweenness']
})
betweenness_diff['Absolute_Difference'] = abs(betweenness_diff['Liberal'] - betweenness_diff['Conservative'])

# Display the top 5 nodes with largest differences in betweenness centrality
print("\nTop 5 largest differences in Betweenness Centrality:")
top_diff = betweenness_diff.sort_values('Absolute_Difference', ascending=False).head(5)
print(top_diff)

# Calculate the differences in eigenvector centrality between networks
eigenvector_diff = pd.DataFrame({
    'Liberal': liberal_centralities['Eigenvector'],
    'Conservative': conservative_centralities['Eigenvector']
})
eigenvector_diff['Absolute_Difference'] = abs(eigenvector_diff['Liberal'] - eigenvector_diff['Conservative'])

# Display the top 5 nodes with largest differences in eigenvector centrality
print("\nTop 5 largest differences in Eigenvector Centrality:")
top_eigenvector_diff = eigenvector_diff.sort_values('Absolute_Difference', ascending=False).head(5)
print(top_eigenvector_diff)

# Calculate the differences in strength (weighted degree) between networks
strength_diff = pd.DataFrame({
    'Liberal': liberal_centralities['Strength'],
    'Conservative': conservative_centralities['Strength']
})
strength_diff['Absolute_Difference'] = abs(strength_diff['Liberal'] - strength_diff['Conservative'])

# Display the top 5 nodes with largest differences in strength
print("\nTop 5 largest differences in Node Strength:")
top_strength_diff = strength_diff.sort_values('Absolute_Difference', ascending=False).head(5)
print(top_strength_diff)



Top 5 central nodes in Liberal network:
         Strength  Betweenness  Eigenvector
NATHEAL       0.0     0.057298     0.201287
NATCITY       0.0     0.057298     0.201287
NATSCI        0.0     0.057298     0.201287
NATFARE       0.0     0.057298     0.201287
NATDRUG       0.0     0.057298     0.201287

Top 5 central nodes in Conservative network:
          Strength  Betweenness  Eigenvector
LETDIE1   0.738909     0.018882     0.143223
FECHLD    0.436149     0.018882     0.143223
LIBATH    0.699512     0.018882     0.143223
POSTLIFE  0.000000     0.018882     0.143223
HELPBLK   0.466559     0.018882     0.143223

Top 5 largest differences in Betweenness Centrality:
          Liberal  Conservative  Absolute_Difference
NATAID   0.057298      0.000000             0.057298
NATSCI   0.057298      0.000000             0.057298
NATCITY  0.057298      0.000008             0.057289
NATDRUG  0.057298      0.000017             0.057281
NATHEAL  0.057298      0.000033             0.057265

Top 5 l

In [None]:
generate_html_visualization(
    liberal_corr_matrix,
    highlight_nodes=['POLVIEWS', 'HOMOSEX'],
    output_path='delete_this_file_Liberal.html'
)
generate_html_visualization(
    conservative_corr_matrix,
    highlight_nodes=['POLVIEWS', 'HOMOSEX'],
    output_path='delete_this_file_Conservative.html'
)

Network visualization has been saved to c:\Users\timbo\Github\BeliefNetworkEvo\CLEAN\notebooks\tutorials\delete_this_file_Liberal.html
Network visualization has been saved to c:\Users\timbo\Github\BeliefNetworkEvo\CLEAN\notebooks\tutorials\delete_this_file_Conservative.html


In [18]:
# For the values I used (threshold=0.1), the similarity score is 70, meaning 70 edge insertions/deletions are needed to transform one network into the other.

# A simple thing we can plot to understand how the threshold affects the similarity score is to plot the similarity score as a function of the threshold.

import numpy as np
import plotly.express as px

# Define thresholds and calculate graph edit distances
thresholds = np.linspace(0, 1, 666)
graph_edit_distances = [graph_similarity(liberal_corr_matrix, conservative_corr_matrix, 
                                         similarity_method="graph_edit_distance",
                                         edge_threshold=threshold).score 
                        for threshold in thresholds]
normalised_graph_edit_distances = [graph_similarity(liberal_corr_matrix, conservative_corr_matrix, 
                                         similarity_method="graph_edit_distance",
                                         edge_threshold=threshold).normalized_score 
                        for threshold in thresholds]


# Plotting with plotly
fig = px.line(x=thresholds, y=normalised_graph_edit_distances, title='Graph Edit Distance as a Function of Threshold Strength',
              labels={'x': 'Threshold Strength', 'y': 'Graph Edit Distance'})
fig.update_layout(xaxis_range=[0, 1], yaxis_range=[min(normalised_graph_edit_distances), max(normalised_graph_edit_distances)])
fig.add_hline(y=0, line_dash="dash", line_color="red")
fig.show()



In [None]:
########################################################

## Further analysis - the graph edit distance of ALL variable conditionings
## WITH ERROR HANDLING for convergence errors

# It would also be good to put this into context of the whole network. 
# Let's see the graph edge distances when we condition on all the other variables...
# This version includes error handling for convergence errors

# Define the years we want to analyze
years_of_interest = [2010, 2012, 2014, 2016, 2018]

# Define the list of variables we want to condition on
# This is a comprehensive list of variables from the GSS dataset
variable_list = ['PARTYID', 'NATEDUC', 'HOMOSEX', 'NATROAD', 'PORNLAW', 'POLABUSE', 'ABHLTH', 'FEPRESCH', 'HELPOTH', 'NATAID', 
                 'XMARSEX', 'SPANKING', 'POPULAR', 'SPKCOM', 'FEFAM', 'NATCITY', 'CONEDUC', 'LIBHOMO', 'DIVLAW', 'COLMIL', 
                 'WORKHARD', 'COLATH', 'NATCHLD', 'SUICIDE2', 'NATFARE', 'FEPOL', 'NATARMS', 'GETAHEAD', 'CONSCI', 'POLMURDR', 
                 'ABANY', 'CONPRESS', 'NATSCI', 'RACDIF3', 'CONMEDIC', 'TEENSEX', 'NATHEAL', 'HELPBLK', 'LETDIE1', 'COLHOMO', 
                 'NATSPAC', 'NATENVIR', 'RACDIF2', 'CONARMY', 'CONCLERG', 'NATDRUG', 'CONLABOR', 'POLESCAP', 'PRAYER', 'OBEY', 
                 'SPKHOMO', 'POLATTAK', 'HELPNOT', 'HELPPOOR', 'POSTLIFE', 'POLVIEWS', 'SEXEDUC', 'ABRAPE', 'CONBUS', 'ABDEFECT', 
                 'POLHITOK', 'CONFINAN', 'CAPPUN', 'LIBATH', 'CONLEGIS', 'ABNOMORE', 'FECHLD', 'SPKMIL', 'CONJUDGE', 'TRUST', 
                 'RACDIF4', 'LIBRAC', 'FAIR', 'AFFRMACT', 'PREMARSX', 'ABPOOR', 'SUICIDE1', 'NATMASS', 'COURTS', 'CONFED', 
                 'CONTV', 'NATPARK', 'LIBMIL', 'NATCRIME', 'COLCOM', 'GUNLAW', 'NATSOC', 'THNKSELF', 'SPKRAC', 'RACDIF1', 
                 'GRASS', 'LIBCOM', 'HELPFUL', 'EQWLTH', 'SPKATH', 'ABSINGLE', 'WRKWAYUP', 'COLRAC', 'PRESLAST_DEMREP', 
                 'PRESLAST_NONCONFORM', 'WOULDVOTELAST_NONCONFORM', 'DIDVOTELAST', 'RELIG_Protestant', 'RELIG_Catholic', 
                 'RELIG_Jewish', 'RELIG_None', 'RELIG_Other', 'RELIG_Buddhism', 'RELIG_Hinduism', 'RELIG_Other_eastern_religions', 
                 'RELIG_Muslim', 'RELIG_Orthodox_christian', 'RELIG_Christian', 'RELIG_Native_american', 'RELIG_Inter_nondenominational']

# Dictionary to store the graph edit distance results for each variable
ged_results = {}

# List to track variables that had to be skipped due to convergence errors
skipped_variables = []

# Set threshold GED calculation -- determines the cuttoff for considering an edge in the count
threshold = 0.001

# Import tqdm to create a progress bar in the notebook
from tqdm.notebook import tqdm

# Iterate through each variable in the list with a progress bar
for var in tqdm(variable_list, desc="Processing variables"): 
    # Calculate correlation matrix conditioned on negative values of the variable
    # This creates a network of how variables correlate when the conditioned variable is below zero
    conditioned_corr_matrix_neg = calculate_conditioned_correlation_matrix(
        cleaned_df, 
        years_of_interest=years_of_interest,
        method=CorrelationMethod.PEARSON,  # Using Pearson correlation
        partial=True,  # Using partial correlations to control for other variables
        edge_suppression=EdgeSuppressionMethod.REGULARIZATION,  # Using regularization to reduce noise
        suppression_params={'regularization': 0.18},  # Regularization strength
        variable_to_condition=var,  # The variable we're conditioning on
        condition='less_than_zero'  # Condition on negative values
    )
    
    # Calculate correlation matrix conditioned on positive values of the variable
    # This creates a network of how variables correlate when the conditioned variable is above zero
    conditioned_corr_matrix_pos = calculate_conditioned_correlation_matrix(
        cleaned_df, 
        years_of_interest=years_of_interest,
        method=CorrelationMethod.PEARSON,
        partial=True,
        edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
        suppression_params={'regularization': 0.18},
        variable_to_condition=var,
        condition='greater_than_zero'  # Condition on positive values
    )
    
    # Check if the negative condition matrix was successfully calculated
    # If not, skip this variable and continue to the next one
    if conditioned_corr_matrix_neg is None:
        # Add to the list of skipped variables
        skipped_variables.append(var)
        continue
        
    # Check if the positive condition matrix was successfully calculated
    if conditioned_corr_matrix_pos is None:
        # Add to the list of skipped variables
        skipped_variables.append(var)
        continue
    
    # If both matrices are valid, calculate the graph edit distance between them
    # This measures how different the networks are when conditioned on positive vs negative values
    ged_result = graph_similarity(
        conditioned_corr_matrix_neg,
        conditioned_corr_matrix_pos,
        similarity_method="graph_edit_distance",
        edge_threshold=threshold  # Only consider correlations above this threshold
    )
    
    # Store the normalized score if the calculation was successful
    if ged_result is not None:
        ged_results[var] = ged_result.normalized_score
    else:
        # If calculation failed, add to skipped variables
        skipped_variables.append(var)



In [13]:
# Print summary statistics about the analysis
print(f"\nSkipped {len(skipped_variables)} out of {len(variable_list)} variables")
print(f"\nSuccessfully calculated GED for {len(ged_results)} out of {len(variable_list)} variables")
print(f"\nSkipped variables: {skipped_variables}")



Skipped 17 out of 115 variables

Successfully calculated GED for 98 out of 115 variables

Skipped variables: ['NATEDUC', 'PORNLAW', 'POPULAR', 'NATDRUG', 'NATPARK', 'NATCRIME', 'PRESLAST_NONCONFORM', 'RELIG_Jewish', 'RELIG_Other', 'RELIG_Buddhism', 'RELIG_Hinduism', 'RELIG_Other_eastern_religions', 'RELIG_Muslim', 'RELIG_Orthodox_christian', 'RELIG_Christian', 'RELIG_Native_american', 'RELIG_Inter_nondenominational']


In [14]:
# Now we can plot the results.
import plotly.express as px
import pandas as pd

print(ged_results)

# Create a DataFrame for plotting
ged_df = pd.DataFrame({
    'Variable': list(ged_results.keys()),
    'Graph Edit Distance': list(ged_results.values())
})

# Sort by GED in descending order
ged_df = ged_df.sort_values(by='Graph Edit Distance', ascending=False)

# Create a bar plot with increased height
fig = px.bar(ged_df, 
             x='Normalised Graph Edit Distance', 
             y='Variable', 
             orientation='h',
             title='Graph Edit Distance by Variable',
             height=1000)  # Increase height to accommodate all labels

# Update the y-axis text angle
fig.update_yaxes(tickangle=-45)  # Angle the labels 45 degrees

# Show the plot
fig.show()

{'PARTYID': np.float64(0.050137741046831955), 'HOMOSEX': np.float64(0.03966498103666245), 'NATROAD': np.float64(0.0530812324929972), 'POLABUSE': np.float64(0.05711068211068211), 'ABHLTH': np.float64(0.053267326732673266), 'FEPRESCH': np.float64(0.041328236980410896), 'HELPOTH': np.float64(0.045428072218986607), 'NATAID': np.float64(0.07689075630252101), 'XMARSEX': np.float64(0.06131479140328698), 'SPANKING': np.float64(0.049689440993788817), 'SPKCOM': np.float64(0.03881188118811881), 'FEFAM': np.float64(0.051600573339703776), 'NATCITY': np.float64(0.06582633053221289), 'CONEDUC': np.float64(0.05425742574257426), 'LIBHOMO': np.float64(0.043275418275418275), 'DIVLAW': np.float64(0.04053724053724054), 'COLMIL': np.float64(0.042149292149292146), 'WORKHARD': np.float64(0.0489225393127548), 'COLATH': np.float64(0.034653465346534656), 'NATCHLD': np.float64(0.059072225484439224), 'SUICIDE2': np.float64(0.06568986568986569), 'NATFARE': np.float64(0.06523287281014101), 'FEPOL': np.float64(0.0533

In [None]:
# Make a GED time series for all the variables

start_year = 1972
end_year = 2018
window_size = 4
step_size = 2
threshold = 0.001 # The GED threshold
# Create an empty DataFrame for plotting. It should have columns containing the following:
# List of years in the window
# The variable that is being conditioned on
# The GED between the two conditioned networks

import pandas as pd
from tqdm.notebook import tqdm
ged_df = pd.DataFrame({
    'Years': [],
    'Variable': [],
    'GED': []
})


variable_list = ['PARTYID', 'NATEDUC', 'HOMOSEX', 'NATROAD', 'PORNLAW', 'POLABUSE', 'ABHLTH', 'FEPRESCH', 'HELPOTH', 'NATAID', 
                 'XMARSEX', 'SPANKING', 'POPULAR', 'SPKCOM', 'FEFAM', 'NATCITY', 'CONEDUC', 'LIBHOMO', 'DIVLAW', 'COLMIL', 
                 'WORKHARD', 'COLATH', 'NATCHLD', 'SUICIDE2', 'NATFARE', 'FEPOL', 'NATARMS', 'GETAHEAD', 'CONSCI', 'POLMURDR', 
                 'ABANY', 'CONPRESS', 'NATSCI', 'RACDIF3', 'CONMEDIC', 'TEENSEX', 'NATHEAL', 'HELPBLK', 'LETDIE1', 'COLHOMO', 
                 'NATSPAC', 'NATENVIR', 'RACDIF2', 'CONARMY', 'CONCLERG', 'NATDRUG', 'CONLABOR', 'POLESCAP', 'PRAYER', 'OBEY', 
                 'SPKHOMO', 'POLATTAK', 'HELPNOT', 'HELPPOOR', 'POSTLIFE', 'POLVIEWS', 'SEXEDUC', 'ABRAPE', 'CONBUS', 'ABDEFECT', 
                 'POLHITOK', 'CONFINAN', 'CAPPUN', 'LIBATH', 'CONLEGIS', 'ABNOMORE', 'FECHLD', 'SPKMIL', 'CONJUDGE', 'TRUST', 
                 'RACDIF4', 'LIBRAC', 'FAIR', 'AFFRMACT', 'PREMARSX', 'ABPOOR', 'SUICIDE1', 'NATMASS', 'COURTS', 'CONFED', 
                 'CONTV', 'NATPARK', 'LIBMIL', 'NATCRIME', 'COLCOM', 'GUNLAW', 'NATSOC', 'THNKSELF', 'SPKRAC', 'RACDIF1', 
                 'GRASS', 'LIBCOM', 'HELPFUL', 'EQWLTH', 'SPKATH', 'ABSINGLE', 'WRKWAYUP', 'COLRAC', 'PRESLAST_DEMREP', 
                 'PRESLAST_NONCONFORM', 'WOULDVOTELAST_NONCONFORM', 'DIDVOTELAST', 'RELIG_Protestant', 'RELIG_Catholic', 
                 'RELIG_Jewish', 'RELIG_None', 'RELIG_Other', 'RELIG_Buddhism', 'RELIG_Hinduism', 'RELIG_Other_eastern_religions', 
                 'RELIG_Muslim', 'RELIG_Orthodox_christian', 'RELIG_Christian', 'RELIG_Native_american', 'RELIG_Inter_nondenominational']

for year in tqdm(range(start_year, end_year + 1, step_size), desc="Processing years"):
    
    years_list = list(range(year, year + window_size))
    
    skipped_variables = []

    for var in tqdm(variable_list, desc=f"Processing variables for {year}", leave=False):
        
        # Optional print statements
        #print(f"Calculating GED for {var} in years {years_list}")
        
        # Calculate negative condition
        conditioned_corr_matrix_neg = calculate_conditioned_correlation_matrix(
            cleaned_df, 
            years_of_interest=years_list,
            method=CorrelationMethod.PEARSON,
            partial=True,
            edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
            suppression_params={'regularization': 0.18},
            variable_to_condition=var,
            condition='less_than_zero'
        )
        
        # Calculate positive condition
        conditioned_corr_matrix_pos = calculate_conditioned_correlation_matrix(
            cleaned_df, 
            years_of_interest=years_list,
            method=CorrelationMethod.PEARSON,
            partial=True,
            edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
            suppression_params={'regularization': 0.18},
            variable_to_condition=var,
            condition='greater_than_zero'
        )
        
        skip_ged_calc = False
        # Check if both matrices were successfully calculated
        if conditioned_corr_matrix_neg is None:
            #tqdm.write(f"  Negative condition for '{var}' could not be calculated (not enough datapoints; precision error): Correlation matrix is None")
            skipped_variables.append(var)
            ged_result = None
            skip_ged_calc = True
            
        if conditioned_corr_matrix_pos is None:
            #tqdm.write(f"  Positive condition for '{var}' could not be calculated (not enough datapoints; precision error): Correlation matrix is None")
            skipped_variables.append(var)
            ged_result = None
            skip_ged_calc = True
        
        # Calculate GED if both matrices are valid
        if not skip_ged_calc:
            ged_result = graph_similarity(
                conditioned_corr_matrix_neg,
                conditioned_corr_matrix_pos,
                similarity_method="graph_edit_distance",
                edge_threshold=threshold
            )   
            ged =  ged_result.normalized_score

        # Add years_list, var, and ged to ged_df
        ged_df = pd.concat([ged_df, pd.DataFrame({
            'Years': [years_list],
            'Variable': [var], 
            'GED': [ged]
        })], ignore_index=True)



In [18]:
# Print the top 5 highest GED variables
print(ged_df.sort_values(by='GED', ascending=False).head(5))

# plot the GED score for a list of variables
import plotly.express as px

all_variables = ged_df['Variable'].unique()
variables_to_plot = all_variables
print(all_variables)

#variables_to_plot = ['PARTYID', 'HOMOSEX', 'PRESLAST_DEMREP', 'RELIG_None', 'FAIR']

# Filter data for plotting
plot_df = ged_df[ged_df['Variable'].isin(variables_to_plot)].copy()

# Extract first year from Years list and add as a new column
plot_df['Year'] = plot_df['Years'].apply(lambda x: x[0])

# Create line plot
fig = px.line(plot_df,
              x='Year',
              y='GED',
              color='Variable',
              title='GED Scores Over Time')

# Update x-axis to show as continuous range
fig.update_xaxes(type='linear', title='Year')

# Save the figure as HTML
fig.write_html("ged_scores_over_time.html")

# Display the figure
fig.show()

                         Years  Variable       GED
368   [1978, 1979, 1980, 1981]  SUICIDE2  0.157218
138   [1974, 1975, 1976, 1977]  SUICIDE2  0.132867
9     [1972, 1973, 1974, 1975]    NATAID  0.131187
1771  [2002, 2003, 2004, 2005]  CONLABOR  0.129838
1772  [2002, 2003, 2004, 2005]  POLESCAP  0.129838
['PARTYID' 'NATEDUC' 'HOMOSEX' 'NATROAD' 'PORNLAW' 'POLABUSE' 'ABHLTH'
 'FEPRESCH' 'HELPOTH' 'NATAID' 'XMARSEX' 'SPANKING' 'POPULAR' 'SPKCOM'
 'FEFAM' 'NATCITY' 'CONEDUC' 'LIBHOMO' 'DIVLAW' 'COLMIL' 'WORKHARD'
 'COLATH' 'NATCHLD' 'SUICIDE2' 'NATFARE' 'FEPOL' 'NATARMS' 'GETAHEAD'
 'CONSCI' 'POLMURDR' 'ABANY' 'CONPRESS' 'NATSCI' 'RACDIF3' 'CONMEDIC'
 'TEENSEX' 'NATHEAL' 'HELPBLK' 'LETDIE1' 'COLHOMO' 'NATSPAC' 'NATENVIR'
 'RACDIF2' 'CONARMY' 'CONCLERG' 'NATDRUG' 'CONLABOR' 'POLESCAP' 'PRAYER'
 'OBEY' 'SPKHOMO' 'POLATTAK' 'HELPNOT' 'HELPPOOR' 'POSTLIFE' 'POLVIEWS'
 'SEXEDUC' 'ABRAPE' 'CONBUS' 'ABDEFECT' 'POLHITOK' 'CONFINAN' 'CAPPUN'
 'LIBATH' 'CONLEGIS' 'ABNOMORE' 'FECHLD' 'SPKMIL' 'CONJU

In [21]:
# Analyzing Temporal Trends in GED Scores

# This section analyzes how GED (Graph Edit Distance) scores change over time for different variables.
# We calculate and visualize the monotonicity of these trends to identify which variables show 
# consistent increases or decreases in network influence over the years.

# Key metrics:
# - Monotonicity: Measured using Spearman's rank correlation between years and GED scores.
#   Values range from -1 to 1:
#   * Values close to 1 indicate a strong increasing trend over time
#   * Values close to -1 indicate a strong decreasing trend over time
#   * Values close to 0 indicate no consistent directional trend
# - Absolute monotonicity: The magnitude of the monotonicity score, indicating the strength
#   of the trend regardless of direction.

# Calculate monotonicity for each variable's GED scores over time
monotonicity_df = pd.DataFrame(columns=['Variable', 'Monotonicity'])

for var in all_variables:
    # Get GED scores for this variable
    var_data = ged_df[ged_df['Variable'] == var]
    
    # Extract first year from each Years list and GED scores
    years = var_data['Years'].apply(lambda x: x[0])
    ged_scores = var_data['GED']
    
    # Calculate Spearman correlation between years and GED scores
    # This measures monotonicity - values closer to 1 or -1 indicate stronger trends
    monotonicity = ged_scores.corr(years, method='spearman')
    
    # Add to dataframe
    monotonicity_df = pd.concat([monotonicity_df, pd.DataFrame({
        'Variable': [var],
        'Monotonicity': [monotonicity]
    })], ignore_index=True)

# Filter for negative monotonicity scores and sort by absolute value
negative_trends = monotonicity_df[monotonicity_df['Monotonicity'] < 0].sort_values('Monotonicity')

# Create histogram of monotonicity scores using plotly express
fig = px.histogram(monotonicity_df, 
                  x='Monotonicity',
                  nbins=50,
                  title='Distribution of Monotonicity Scores',
                  labels={'Monotonicity': 'Monotonicity Score'},
                  marginal='box')  # Add box plot on top

# Update layout
fig.update_layout(
    xaxis_title='Monotonicity Score',
    yaxis_title='Count',
    showlegend=False,
    bargap=0.1
)

# Add vertical line at 0 for reference
fig.add_vline(x=0, line_dash="dash", line_color="red")

# Save the figure as HTML
fig.write_html("monotonicity_distribution.html")

fig.show()

# Sort and display variables with strongest monotonic trends (both positive and negative)
top_trends = monotonicity_df.sort_values('Monotonicity', key=abs, ascending=False).head(10)
print("\nTop 10 Variables with Strongest Monotonic Trends:")
print(top_trends)



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.




Top 10 Variables with Strongest Monotonic Trends:
    Variable  Monotonicity
96  WRKWAYUP      0.757391
74  PREMARSX      0.751304
17   LIBHOMO      0.716522
39   COLHOMO      0.689565
76  SUICIDE1      0.680000
50   SPKHOMO      0.660000
73  AFFRMACT      0.655652
56   SEXEDUC      0.649565
14     FEFAM      0.605916
4    PORNLAW     -0.596522
