## qué bolá, mi gente 😼

In this lesson, we will learn how to condition a network on a variable.

In plain English, this means we will build the belief network using only the people who hold a certain belief (or only those who do not).

First, we import and clean the data.

In [1]:
import os
import sys
project_root = os.path.dirname(os.path.dirname(os.path.abspath("..")))
if project_root not in sys.path:
    sys.path.append(project_root)
from CLEAN.datasets.import_gss import import_dataset
from CLEAN.datasets.clean_raw_data import clean_datasets
from CLEAN.source_code.generators.corr_make_network import calculate_correlation_matrix, CorrelationMethod, EdgeSuppressionMethod
from CLEAN.source_code.visualizers.network_visualizer import generate_html_visualization

In [2]:
# Import the dataset; only the first element of the returned pair is kept.
df, _ = import_dataset()

# Cleaned frame that every later cell works from.
cleaned_df = clean_datasets()

# Show every column name so a conditioning variable can be chosen below.
print(cleaned_df.columns.tolist())



Dataset loaded! 🐱✨           🐾🐱�
Cached raw data to: c:\Users\timbo\Github\BeliefNetworkEvo\CLEAN\datasets\cached_data\gss_cache.pkl
Loading from cache...
Done! ✨
['ABDEFECT', 'HELPNOT', 'NATFAREY', 'RACDIF3', 'YEAR', 'HELPFUL', 'POLESCAP', 'AFFRMACT', 'HELPOTH', 'NATAID', 'TEENSEX', 'CONARMY', 'NATEDUCY', 'NATAIDY', 'SPKHOMO', 'POPULAR', 'POLMURDR', 'TRUST', 'SUICIDE2', 'CONCLERG', 'NATDRUGY', 'LIBMSLM', 'GETAHEAD', 'LIBHOMO', 'POSTLIFE', 'NATFARE', 'ABHLTH', 'NATHEAL', 'NATSPACY', 'MARHOMO', 'POLABUSE', 'CONEDUC', 'NATCRIME', 'ABRAPE', 'NATEDUC', 'FEPOL', 'HELPBLK', 'WORKHARD', 'COLCOM', 'NATCITY', 'SPKMSLM', 'COLMIL', 'COLMSLM', 'SPKMIL', 'COLRAC', 'CONBUS', 'EQWLTH', 'SUICIDE1', 'POLVIEWS', 'BALLOT', 'CONFED', 'SPKRAC', 'GRASS', 'FEPRESCH', 'FEFAM', 'CONTV', 'POLHITOK', 'NATENVIY', 'OBEY', 'HOMOSEX', 'SEXEDUC', 'NATSCI', 'WRKWAYUP', 'NATDRUG', 'NATMASS', 'HELPPOOR', 'NATPARK', 'ABANY', 'SPANKING', 'NATRACEY', 'THNKSELF', 'SPKCOM', 'XMARSEX', 'NATHEALY', 'FECHLD', 'ABNOMORE', 'CONLE

To look up what each variable means, see the GSS Data Explorer (https://gssdataexplorer.norc.org/variables/vfilter) and `clean_raw_data.py`.

In [3]:
# Survey years to restrict to when building networks.
years_of_interest = [2018, 2020, 2022]
# Variable whose distribution we inspect before conditioning on it.
vars_to_condition = 'POLVIEWS'

# Number of records at each value of the chosen variable.
cleaned_df.loc[:, vars_to_condition].value_counts()


POLVIEWS
 0.000000    23992
 0.333333     9596
 0.666667     9361
-0.333333     7900
-0.666667     7623
 1.000000     2165
-1.000000     2081
Name: count, dtype: int64

In [41]:
import numpy as np
import pandas as pd

# --- Your existing helper functions ---

def binarize_dataframe(df, threshold=0.0):
    """
    Turn a weighted matrix into a 0/1 adjacency matrix.

    - If threshold == 0: every non-zero value becomes 1; exact zeros become 0.
    - Otherwise: values with absolute magnitude >= threshold become 1; others become 0.

    Missing values (NaN) never count as an edge, whichever branch is taken.

    Args:
        df (pd.DataFrame): Numeric matrix, e.g. a correlation matrix.
        threshold (float): Minimum absolute value treated as an edge. Default is 0.0.

    Returns:
        pd.DataFrame: Frame of 0s and 1s with the same labels as `df`.
    """
    magnitude = np.abs(df)
    # Compare magnitudes with `>` instead of testing `df != 0`: `NaN != 0` is
    # True and would mark missing entries as edges, whereas `NaN > 0` is False,
    # which matches how the thresholded branch already treats NaN.
    is_edge = magnitude > 0 if threshold == 0 else magnitude >= threshold
    return is_edge.astype(int)

def align_dataframes(df1, df2):
    """
    Expand both frames onto the union of their row and column labels.

    Labels that a frame did not originally have are filled with zero, so the
    two returned frames share one index and one set of columns.
    """
    shared_rows = df1.index.union(df2.index)
    shared_cols = df1.columns.union(df2.columns)

    expanded = [
        frame.reindex(index=shared_rows, columns=shared_cols, fill_value=0)
        for frame in (df1, df2)
    ]
    return expanded[0], expanded[1]

def graph_edit_distance(df1, df2, threshold=0.0, undirected=True):
    """
    Compute the Graph Edit Distance (GED) between two weighted matrices.

    Both matrices are aligned onto a common set of labels, binarized into
    adjacency matrices, and the number of differing edges is counted.
    Cells whose row label equals their column label (a variable paired with
    itself) are ignored: they are not edges, and because each occurs only
    once they would break the "every difference is counted twice" assumption
    behind the undirected halving.

    Args:
        df1 (pd.DataFrame): First correlation matrix.
        df2 (pd.DataFrame): Second correlation matrix.
        threshold (float): Threshold for binarization. Default is 0.0.
        undirected (bool): Whether the graphs are undirected. Default is True.

    Returns:
        int: Number of edges present in exactly one of the two graphs.
    """
    # Put both matrices on the same labels, then reduce them to 0/1 adjacency.
    df1, df2 = align_dataframes(df1, df2)
    bin_df1 = binarize_dataframe(df1, threshold)
    bin_df2 = binarize_dataframe(df2, threshold)

    diff_matrix = np.abs(bin_df1 - bin_df2)

    # Mask self-pairs by label (not by position) so this also holds when the
    # aligned frame is not perfectly square.
    row_labels = diff_matrix.index.to_numpy()[:, None]
    col_labels = diff_matrix.columns.to_numpy()[None, :]
    edge_diffs = np.where(row_labels == col_labels, 0, diff_matrix.to_numpy())

    ged = int(edge_diffs.sum())
    if undirected:
        ged //= 2  # A symmetric matrix lists each undirected edge twice
    return ged

# --- The new function that runs the whole pipeline for each variable ---

def _conditioned_correlation_matrix(subset_df, years_of_interest):
    """Build the regularized partial Pearson correlation matrix for one subset of records."""
    return calculate_correlation_matrix(
        subset_df,
        years_of_interest=years_of_interest,
        method=CorrelationMethod.PEARSON,
        partial=True,
        edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
        suppression_params={'regularization': 0.2}
    )


def compute_and_save_ged(cleaned_df, variables, years_of_interest,
                         threshold=0.05,
                         output_file="ged_results.csv",
                         min_percent=0.05):
    """
    For each variable in `variables`, condition the cleaned DataFrame on >0 and <0,
    calculate the correlation matrix of each subset, compute the GED between the
    two matrices, print the result, and save all GEDs to a CSV file.

    A variable is skipped when it is not a column of `cleaned_df`, when either
    subset is empty, or when either subset holds less than `min_percent` of
    the total records.

    Parameters:
        cleaned_df (pd.DataFrame): The pre-cleaned dataframe. It is not modified.
        variables (list or str): Variable names on which to condition the dataframe.
            A single name given as a string is treated as a one-element list.
        years_of_interest (list): Years to pass to the correlation matrix function.
        threshold (float): Threshold for binarization in GED computation.
        output_file (str): The file name to which the GED results will be saved.
        min_percent (float): Minimum share (as a decimal) of total records required
            in both conditions.

    Returns:
        dict: A dictionary mapping each processed variable to its computed GED.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(variables, str):
        variables = [variables]

    # Boolean indexing below always produces new frames, so the input is never
    # altered and no defensive copy of the whole dataframe is needed.
    ged_results = {}
    total_records = len(cleaned_df)

    for var in variables:
        if var not in cleaned_df.columns:
            print(f"Variable '{var}' not found in dataframe. Skipping.")
            continue

        # --- Condition on the variable ---
        conditioned_df_positive = cleaned_df[cleaned_df[var] > 0]
        conditioned_df_negative = cleaned_df[cleaned_df[var] < 0]

        pos_count = len(conditioned_df_positive)
        neg_count = len(conditioned_df_negative)
        # Guard the division so an empty dataframe is skipped instead of raising.
        pos_percent = pos_count / total_records if total_records else 0.0
        neg_percent = neg_count / total_records if total_records else 0.0

        # NOTE(review): shares are taken over all records, while the networks are
        # presumably built only from `years_of_interest` - confirm this is intended.
        print(f"Variable '{var}': {pos_count} records > 0 ({pos_percent:.1%}), "
              f"{neg_count} records < 0 ({neg_percent:.1%}).")

        # An empty side cannot yield a network, even when min_percent is 0.
        too_small = (
            pos_count == 0
            or neg_count == 0
            or pos_percent < min_percent
            or neg_percent < min_percent
        )
        if too_small:
            print(f"Skipping variable '{var}' due to low sample size in one of the conditions "
                  f"(min required: {min_percent:.1%}).")
            continue

        # --- Calculate correlation matrices ---
        corr_matrix_A = _conditioned_correlation_matrix(conditioned_df_positive, years_of_interest)
        corr_matrix_B = _conditioned_correlation_matrix(conditioned_df_negative, years_of_interest)

        # --- Compute GED ---
        ged = graph_edit_distance(corr_matrix_A, corr_matrix_B, threshold)
        print(f"Variable: {var} - Graph Edit Distance (threshold={threshold}): {ged}")
        ged_results[var] = ged

    # --- Save the GED results to a CSV file ---
    ged_df = pd.DataFrame(list(ged_results.items()), columns=["Variable", "GED"])
    ged_df.to_csv(output_file, index=False)
    print(f"GED results saved to {output_file}")

    return ged_results

# --- Run the GED pipeline over every candidate variable ---
# Requires `cleaned_df` (the pre-cleaned dataframe) from an earlier cell.
# The list omits YEAR and BALLOT, which appear among the dataframe's columns.

variables = ['ABDEFECT', 'HELPNOT', 'NATFAREY', 'RACDIF3', 'HELPFUL', 'POLESCAP', 
             'AFFRMACT', 'HELPOTH', 'NATAID', 'TEENSEX', 'CONARMY', 'NATEDUCY', 'NATAIDY', 
             'SPKHOMO', 'POPULAR', 'POLMURDR', 'TRUST', 'SUICIDE2', 'CONCLERG', 'NATDRUGY', 
             'LIBMSLM', 'GETAHEAD', 'LIBHOMO', 'POSTLIFE', 'NATFARE', 'ABHLTH', 'NATHEAL', 
             'NATSPACY', 'MARHOMO', 'POLABUSE', 'CONEDUC', 'NATCRIME', 'ABRAPE', 'NATEDUC', 
             'FEPOL', 'HELPBLK', 'WORKHARD', 'COLCOM', 'NATCITY', 'SPKMSLM', 'COLMIL', 
             'COLMSLM', 'SPKMIL', 'COLRAC', 'CONBUS', 'EQWLTH', 'SUICIDE1', 'POLVIEWS', 
             'CONFED', 'SPKRAC', 'GRASS', 'FEPRESCH', 'FEFAM', 'CONTV', 'POLHITOK', 
             'NATENVIY', 'OBEY', 'HOMOSEX', 'SEXEDUC', 'NATSCI', 'WRKWAYUP', 'NATDRUG', 
             'NATMASS', 'HELPPOOR', 'NATPARK', 'ABANY', 'SPANKING', 'NATRACEY', 'THNKSELF', 
             'SPKCOM', 'XMARSEX', 'NATHEALY', 'FECHLD', 'ABNOMORE', 'CONLEGIS', 'CONJUDGE', 
             'CONFINAN', 'NATCHLD', 'CONPRESS', 'NATARMS', 'COLATH', 'NATRACE', 'PARTYID', 
             'ABPOOR', 'GUNLAW', 'NATENRGY', 'CONMEDIC', 'LIBMIL', 'SPKATH', 'ABSINGLE', 
             'PORNLAW', 'NATENVIR', 'NATCRIMY', 'LIBCOM', 'RACDIF4', 'PREMARSX',
             'CONSCI', 'NATCITYY', 'PRAYER', 'COLHOMO', 'NATARMSY', 'LIBATH', 'RACDIF2', 
             'FAIR', 'CONLABOR', 'NATSOC', 'NATSPAC', 'NATROAD', 'CAPPUN', 'RACDIF1', 
             'LIBRAC', 'DIVLAW', 'LETDIE1', 'COURTS', 'POLATTAK', 'PRESLAST_DEMREP', 
             'PRESLAST_NONCONFORM', 'WOULDVOTELAST_DEMREP', 'WOULDVOTELAST_NONCONFORM', 
             'DIDVOTELAST', 'RELIG_Protestant', 'RELIG_Catholic', 'RELIG_Jewish', 'RELIG_None', 
             'RELIG_Other', 'RELIG_Buddhism', 'RELIG_Hinduism', 'RELIG_Other_eastern_religions', 
             'RELIG_Muslim', 'RELIG_Orthodox_christian', 'RELIG_Christian', 
             'RELIG_Native_american', 'RELIG_Inter_nondenominational']


# NOTE: this rebinds `years_of_interest`, replacing the list assigned in an earlier cell.
years_of_interest = range(2010, 2021)
threshold = 0.075  # minimum absolute correlation for two variables to count as connected

min_percent = 0.15  # each side (> 0 and < 0) must hold at least 15% of all records, else the variable is skipped

ged_dict = compute_and_save_ged(cleaned_df, variables, years_of_interest,
                                threshold=threshold, min_percent=min_percent)


Variable 'ABDEFECT': 37199 records > 0 (51.4%), 9509 records < 0 (13.1%).
Skipping variable 'ABDEFECT' due to low sample size in one of the conditions (min required: 15.0%).
Variable 'HELPNOT': 10673 records > 0 (14.7%), 11323 records < 0 (15.6%).
Skipping variable 'HELPNOT' due to low sample size in one of the conditions (min required: 15.0%).
Variable 'NATFAREY': 18029 records > 0 (24.9%), 2635 records < 0 (3.6%).
Skipping variable 'NATFAREY' due to low sample size in one of the conditions (min required: 15.0%).
Variable 'RACDIF3': 16584 records > 0 (22.9%), 16678 records < 0 (23.0%).
Variable: RACDIF3 - Graph Edit Distance (threshold=0.075): 39
Variable 'HELPFUL': 20466 records > 0 (28.3%), 18809 records < 0 (26.0%).
Variable: HELPFUL - Graph Edit Distance (threshold=0.075): 47
Variable 'POLESCAP': 30769 records > 0 (42.5%), 11223 records < 0 (15.5%).
Variable: POLESCAP - Graph Edit Distance (threshold=0.075): 42
Variable 'AFFRMACT': 4854 records > 0 (6.7%), 18947 records < 0 (26.2%

In [42]:
# `ged_dict` maps each variable name to its GED, e.g.
# {'VAR1': 10, 'VAR2': 5, 'VAR3': 15, ...}

# How many of the highest-GED variables to report.
top_n = 10

# Rank variables from largest to smallest GED and keep the first `top_n`.
top_ged = sorted(ged_dict.items(), key=lambda item: item[1], reverse=True)[:top_n]

# Legacy name kept because a later cell indexes into `top5`;
# despite the name it holds `top_n` entries, not five.
top5 = top_ged

print("Top GED values:")
for var, ged in top_ged:
    print(f"{var}: {ged}")

Top GED values:'
POLVIEWS: 76
COLHOMO: 71
LIBHOMO: 65
PREMARSX: 60
LIBCOM: 58
SPKCOM: 56
PARTYID: 56
DIDVOTELAST: 55
LIBMIL: 54
SPKATH: 54


In [44]:
# Take the variable ranked second by GED (index 1 of the descending list).
var = top5[1][0]
print(var)


def _partial_corr_network(respondents):
    """Regularized partial Pearson correlation matrix for one subset of records."""
    return calculate_correlation_matrix(
        respondents,
        years_of_interest=years_of_interest,
        method=CorrelationMethod.PEARSON,
        partial=True,
        edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
        suppression_params={'regularization': 0.2},
    )


# --- Split the records by the sign of the chosen variable ---
answers = cleaned_df[var]
conditioned_df_positive = cleaned_df[answers > 0]
conditioned_df_negative = cleaned_df[answers < 0]

# --- One network per side ---
corr_matrix_A = _partial_corr_network(conditioned_df_positive)
corr_matrix_B = _partial_corr_network(conditioned_df_negative)

# --- Write one HTML visualization per network, with PARTYID highlighted ---
generate_html_visualization(corr_matrix_A, highlight_nodes=['PARTYID'], output_path='delete_this_file_A.html')

generate_html_visualization(corr_matrix_B, highlight_nodes=['PARTYID'], output_path='delete_this_file_B.html')

COLHOMO
Network visualization has been saved to c:\Users\timbo\Github\BeliefNetworkEvo\CLEAN\notebooks\tutorials\delete_this_file_A.html
Network visualization has been saved to c:\Users\timbo\Github\BeliefNetworkEvo\CLEAN\notebooks\tutorials\delete_this_file_B.html
