### Check the non-circle marks in the PAG produced by FastDAG2PAG

In [1]:
import networkx as nx
from fastdag2pag.Graph_utils import Mark
def check_no_circle_marks(mypag_df, adj, selection_bias_nodes):
    """Validate the arrowhead and tail marks of a PAG against its source DAG.

    ``mypag_df.at[a, b]`` is read as the mark at the ``b`` end of the edge
    between ``a`` and ``b``. Each such mark is checked exactly once:

    * an arrowhead requires that ``b`` is neither an ancestor of ``a`` nor an
      ancestor of any selection-bias node;
    * a tail requires that ``b`` is an ancestor of ``a`` or of some
      selection-bias node.

    Only entries equal to ``Mark.ARROW.value`` or ``Mark.TAIL.value`` are
    counted. Circle marks and every other entry (for example the value used
    for "no edge" or the diagonal) are skipped, so they do not dilute the
    reported rate.

    Args:
        mypag_df: Square DataFrame of PAG marks, indexed and labelled by node.
        adj: Adjacency DataFrame of the generating DAG; its columns must
            contain every node of ``mypag_df`` and every selection node.
        selection_bias_nodes: Iterable of selection-bias node labels in ``adj``.

    Returns:
        The fraction of arrowhead/tail marks that contradict the DAG's
        ancestral relations, or ``0`` when there are no such marks.

    Side effects:
        Prints one line per violating mark.
    """
    arrow = Mark.ARROW.value
    tail = Mark.TAIL.value
    circle = Mark.CIRCLE.value
    # Display symbols, built once instead of once per matrix cell.
    near_symbol = {tail: "-", arrow: "<", circle: "o"}
    far_symbol = {tail: "-", arrow: ">", circle: "o"}

    dag = nx.DiGraph(adj)
    ancestors_of = {var: set(nx.ancestors(dag, var)) for var in adj.columns.to_list()}
    ancestors_of_selection = set()
    for sel_node in selection_bias_nodes:
        ancestors_of_selection |= ancestors_of[sel_node]

    checked_marks = 0
    error_marks = 0
    for row_label in mypag_df.index:
        for col_label in mypag_df.columns:
            mark = mypag_df.at[row_label, col_label]
            is_tail = bool(mark == tail)
            is_arrow = bool(mark == arrow)
            if not (is_tail or is_arrow):
                # Circle mark, missing edge or diagonal entry: nothing to verify.
                continue
            checked_marks += 1

            col_is_ancestor = (
                col_label in ancestors_of[row_label]
                or col_label in ancestors_of_selection
            )
            # A tail claims ancestry; an arrowhead claims the absence of it.
            if col_is_ancestor == is_tail:
                continue

            error_marks += 1
            # The opposite endpoint is only needed to render the edge in the report.
            opposite = mypag_df.at[col_label, row_label]
            edge = f"{row_label}{near_symbol.get(opposite, '?')}-{far_symbol[mark]}{col_label}"
            relation = "is not an ancestor" if is_tail else "is an ancestor"
            print(f"Error in {edge} right mark: {col_label} {relation} of {row_label} or S")

    return error_marks / checked_marks if checked_marks > 0 else 0

In [3]:
# Benchmarking: measure the FastDAG2PAG mark error rate on random DAGs of
# increasing size. Any DAG that yields a non-zero error rate is saved to
# Test_Data/ so it can be reloaded and inspected afterwards.
import os
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

from fastdag2pag.dag2pag import dag2pag
from fastdag2pag.Random_Graph import ErdosRenyi


node_sizes = [20,40,60,80,100]
num_trials = 100
results = []

for n_nodes in node_sizes:
    error_count = []  # per-trial error rates for this node size
    ER_graph_gen = ErdosRenyi(n_nodes, expected_degree=4, def_dataframe=True, seed=321)

    for trial in tqdm(range(num_trials)):
        # Generate random DAG adjacency matrix
        graph = ER_graph_gen.get_random_graph()
        num_latent = int(n_nodes * 0.1) if n_nodes >= 10 else 1
        num_sel = num_latent  # For simplicity, set number of selection bias nodes equal to number of latent nodes
        adj = ER_graph_gen.set_latent_nodes(graph, num_latent=num_latent, selection_bias=True, num_sel=num_sel)
        # Latent and selection nodes are picked out by their 'L' / 'S' name prefix.
        latent_nodes = [node for node in adj.columns if node.startswith('L')]
        selection_bias_nodes = [node for node in adj.columns if node.startswith('S')]

        # FastDAG2PAG
        pag2 = dag2pag(adj, latent_nodes=latent_nodes, selection_bias=selection_bias_nodes)['PAG.DataFrame']

        error_rate = check_no_circle_marks(pag2, adj, selection_bias_nodes)
        if error_rate > 0:
            path = f"Test_Data/error_dag_nodes{n_nodes}_{trial}.csv"
            # Create the output directory on demand: to_csv fails on a missing
            # directory, which would otherwise abort a long run exactly when a
            # failing case has been found.
            os.makedirs(os.path.dirname(path), exist_ok=True)
            adj.to_csv(path)
            # break
        error_count.append(error_rate)

    results.append({
        'nodes': n_nodes,
        'avg_error_rate': np.mean(error_count)
    })

results_df = pd.DataFrame(results)


100%|██████████| 100/100 [00:04<00:00, 21.62it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [06:49<00:00,  4.10s/it]
100%|██████████| 100/100 [29:43<00:00, 17.83s/it]  
100%|██████████| 100/100 [2:26:08<00:00, 87.68s/it]   


In [4]:
# Report the mean error rate obtained for every benchmarked node size.
print('Average error rate for each node size:')
summary_lines = [
    f"Node Size: {entry['nodes']}, Average Error Rate: {entry['avg_error_rate']}"
    for _, entry in results_df.iterrows()
]
for summary_line in summary_lines:
    print(summary_line)

Average error rate for each node size:
Node Size: 20.0, Average Error Rate: 0.0
Node Size: 40.0, Average Error Rate: 0.0
Node Size: 60.0, Average Error Rate: 0.0
Node Size: 80.0, Average Error Rate: 0.0
Node Size: 100.0, Average Error Rate: 0.0


In [None]:
# Reload a failing DAG saved by the benchmark cell and recompute its PAG.
import logging
import pandas as pd
from fastdag2pag.dag2pag import dag2pag
# logging.getLogger("Learner_Base").setLevel(logging.INFO)
# logging.basicConfig(
#     level=logging.INFO,  # Set the logging level, e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL
#     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Set the logging format
#     datefmt="%Y-%m-%d %H:%M:%S"  # Set the time format
# )
logger = logging.getLogger("Learner_Base")
logger.disabled = True   # Disable logging for this logger

# Fill in the node size and trial index of the saved failing case, i.e. the
# two numbers in the file name Test_Data/error_dag_nodes<n_nodes>_<trial>.csv.
n_nodes = None
trial = None
if n_nodes is None or trial is None:
    raise ValueError(
        "Set n_nodes and trial to the values of a saved failing case before running this cell."
    )

path = f"Test_Data/error_dag_nodes{n_nodes}_{trial}.csv"
adj = pd.read_csv(path, index_col=0)
# Latent and selection nodes are picked out by their 'L' / 'S' name prefix.
latent_nodes = [node for node in adj.columns if node.startswith('L')]
selection_bias_nodes = [node for node in adj.columns if node.startswith('S')]
pag2 = dag2pag(adj, latent_nodes=latent_nodes, selection_bias=selection_bias_nodes)['PAG.DataFrame']


In [None]:
# Re-run the mark check on the reloaded case; the bare name on the last line
# makes the notebook display the resulting rate.
error_rate = check_no_circle_marks(
    mypag_df=pag2,
    adj=adj,
    selection_bias_nodes=selection_bias_nodes,
)
error_rate

In [None]:
# Draw the reloaded DAG and the PAG that FastDAG2PAG derived from it.
from fastdag2pag.mixgraph import MixGraph
from IPython.display import Image, display


def _render_graph(adjacency, graph_type):
    """Build a MixGraph from an adjacency DataFrame and render it to PNG bytes."""
    mix_graph = MixGraph()
    mix_graph.from_pandas_adjacency(adjacency, graph_type=graph_type)
    pydot_graph = mix_graph.to_pydot()
    return mix_graph, pydot_graph, pydot_graph.create_png()


# Original DAG
dag, dag_pydot, png_bytes = _render_graph(adj, 'DAG')
display(Image(png_bytes))

# PAG from FastDAG2PAG
Pag2, Pag2_pydot, png_bytes_pag2 = _render_graph(pag2, 'MG')
display(Image(png_bytes_pag2))