In [33]:
from gmpy2.gmpy2 import random_state

# Quick Start: Causal Analysis
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [34]:
import sys, os

# 1. Compute project root: one level up from the notebook folder
proj_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# 2. Prepend it to sys.path
if proj_root not in sys.path:
    sys.path.insert(0, proj_root)

In [35]:
import pandas as pd
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import identifier as ide
from causal import visualization as vis
from causal import utils
from causal import refuter as ref
from causallearn.search.FCMBased.lingam.utils import make_dot
from causallearn.search.FCMBased import lingam
from dowhy import CausalModel
from causallearn.utils.GraphUtils import GraphUtils

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [36]:
if __name__ == '__main__':

    '''
    1. Data Loading and Preprocessing
    '''

    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path)
    print(f'total:{data_origin.shape}')
    # print(data_origin)

    # filter Ddos and normal data
    data_origin = data_origin[data_origin['class'].isin([0, 11, 12, 13, 16, 17])]

    # filter fake data attack and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 1, 2, 3, 4, 5, 6, 7, 8])]

    # filter sybil attack and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 14, 15, 16, 17])]
    # print(data_origin.head(5))

    # filter sybil disruptive/data reply and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 10])]
    # print(data_origin.head(5))

    data_origin = utils.collapse_classes(data_origin, 1)
    print(data_origin[data_origin['class'] == 1].shape)

    data_origin = utils.min_sample_retention(data_origin, test_size = 80000, random_state=42)
    # print(data_origin)

    print(data_origin[data_origin['class'] == 1].shape)
    print('*-' * 50)




total:(1048575, 20)
(143130, 20)
(14934, 20)
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


In [37]:
# Data Cleaning:
drop_column = ['type','Attack','Attack_type']
data_processed = preprocess.clean(data_origin, drop_column=drop_column, drop_na=True, data_numerical=True)

# Standardize features, target keep same as original data_processed:
data_processed = preprocess.standardize(data_processed, ['class','sendTime','sender','senderPseudo','messageID'])

# Combine axis related data such as pos, spd etc. by using M = \sqrt{X^2 + Y^2 + Z^2}
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['posx', 'posy', 'posz'], 'pos')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['spdx', 'spdy', 'spdz'], 'spd')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['aclx', 'acly', 'aclz'], 'acl')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['hedx', 'hedy', 'hedz'], 'hed')
data_processed.drop(
    columns=['posx', 'posy', 'posz', 'spdx', 'spdy', 'spdz', 'aclx', 'acly', 'aclz', 'hedx', 'hedy', 'hedz','sender'],
    inplace=True
)

# ID mapping to 0-N
data_processed['senderPseudo'] = data_processed['senderPseudo'].astype('category').cat.codes
data_processed['messageID'] = data_processed['messageID'].astype('category').cat.codes

# …run two separate CausalModel objects with *_z columns as treatment

with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

            sendTime  senderPseudo  messageID  class       pos       spd  \
656639   63353.20816          4435      59100      0  1.354733  0.061347   
1026470  32982.84854          1677      22170      0  0.933590  1.149308   
524084   53998.32004          3139      41541      0  1.448848  0.061388   
19426    72339.34587          5769      70050      0  1.131498  0.862196   
19598    72511.34587          5769      70977      0  1.834559  1.713286   
...              ...           ...        ...    ...       ...       ...   
44377    72778.87973          5911      72217      0  0.709379  1.028168   
607815   62642.26475          4141      55049      0  2.292424  0.623311   
270668   27132.25916           770       9425      0  1.216551  1.573593   
1020577  32769.49372          1648      20835      0  2.196679  1.732858   
589186   62321.83816          4031      53523      0  1.094986  1.521279   

              acl       hed  
656639   0.006894  1.320137  
1026470  1.702268  1.442326

In [38]:
# X = data_processed.iloc[:, 1:].copy()     # 8 features
# y = data_processed.iloc[:, 0].copy()
#
y = data_processed['class'].copy()
X = data_processed.drop(columns='class')

# print(X)
# print('*-' * 50)
# print(y)
# print('*-' * 50)

df = pd.concat([X, y.rename('class')], axis=1)
print(df)
node_names = df.columns.tolist()
print(node_names)

zeros = df.columns[df.var()==0]
print("zero var column：", zeros.tolist())

corr = df.corr().abs()
perfect_pairs = [(i,j) for i in corr.columns for j in corr.columns
                 if i!=j and corr.loc[i,j]==1.0]
print("corr column：", perfect_pairs)

            sendTime  senderPseudo  messageID       pos       spd       acl  \
656639   63353.20816          4435      59100  1.354733  0.061347  0.006894   
1026470  32982.84854          1677      22170  0.933590  1.149308  1.702268   
524084   53998.32004          3139      41541  1.448848  0.061388  0.087245   
19426    72339.34587          5769      70050  1.131498  0.862196  0.573807   
19598    72511.34587          5769      70977  1.834559  1.713286  0.733850   
...              ...           ...        ...       ...       ...       ...   
44377    72778.87973          5911      72217  0.709379  1.028168  2.227905   
607815   62642.26475          4141      55049  2.292424  0.623311  2.252333   
270668   27132.25916           770       9425  1.216551  1.573593  0.052511   
1020577  32769.49372          1648      20835  2.196679  1.732858  0.069524   
589186   62321.83816          4031      53523  1.094986  1.521279  3.147976   

              hed  class  
656639   1.320137      0

In [39]:
'''
2.  Background knowledge creation
'''
bk_pc = restriction.PC_BGKnowledge(df, X, 'class')
bk_DirectLiNGAM = restriction.DirectLiNGAM_BGKnowledge(node_names, 'class')
# print(bk_DirectLiNGAM)

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [40]:
'''
3.  Algorithm for causal discovery
'''

'''3.1 Constrained Based'''
# PC algorithm with Kernal-based independence test
cg_pc = cd.pc_algorithm(
    df,
    indep_test_func = kci,
    alpha = 0.01,
    uc_rule = 1,
    max_k = 2,
    background_knowledge = bk_pc,
    node_names = node_names
)

pdy = GraphUtils.to_pydot(cg_pc.G)
print(type(pdy))
pdy.write_png('PC.png')


# FCI algorithm with Kernal-based independence test
# cg_fci, edges = cd.fci_algorithm(
#     df,
#     indep_test_func=kci,
#     alpha=0.01,
#     depth=-1,
#     max_path_length=-1,
#     verbose=False,
#     show_progress=True,
#     background_knowledge = bk_pc,
#     node_names = node_names
# )
# pdy = GraphUtils.to_pydot(cg_fci)
# pdy.write_png('FCI.png')

  0%|          | 0/8 [00:00<?, ?it/s]

<class 'pydot.core.Dot'>


In [41]:
'''3.2 constrained functional'''
# LiNGAM
model_LiNGAM = lingam.ICALiNGAM(random_state=42)
model_LiNGAM.fit(df)
print(model_LiNGAM.adjacency_matrix_)
graph_dot_model_LiNGAM = make_dot(model_LiNGAM.adjacency_matrix_, labels=node_names)
graph_dot_model_LiNGAM.format = 'png'
output_path = graph_dot_model_LiNGAM.render(filename='LiNGAM',directory='.',cleanup=True)
#
#
# # Direct-LiNGAM
# model_DirectLiNGAM = lingam.DirectLiNGAM(
#     random_state=42,
#     prior_knowledge=None,
#     apply_prior_knowledge_softly=False,
#     measure='pwling',
# )
#
# model_DirectLiNGAM.fit(df)
# graph_dot_DirectLiNGAM = make_dot(model_DirectLiNGAM.adjacency_matrix_, labels=node_names)
# graph_dot_DirectLiNGAM.format = 'png'
# output_path = graph_dot_DirectLiNGAM.render(filename='DirectLiNGAM',directory='.',cleanup=True)

[[ 0.00000000e+00 -2.40988648e-01  7.37252725e-01  0.00000000e+00
  -2.37696740e+01  0.00000000e+00 -1.83910816e+02  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.44488656e+02
  -4.41373119e+01  0.00000000e+00  2.62060300e+02  5.23232426e+02]
 [ 0.00000000e+00  1.06364428e+01  0.00000000e+00 -3.70146732e+02
   0.00000000e+00  0.00000000e+00 -3.20367510e+03 -6.21376476e+03]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   3.67896168e-02  0.00000000e+00 -9.47790781e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -6.98498596e-02
  -1.70189040e-01  0.00000000e+00  3.18366650e-01 -7.63231179e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   3.89047394e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.33693381e-02
  -

In [42]:
'''3.4 Boss'''

# G = boss(df.to_numpy(), score_func='local_score_marginal_general', node_names=node_names)
G = cd.boss(df.to_numpy(), score_func='local_score_BIC', node_names=node_names)
pyd = GraphUtils.to_pydot(G)
pyd.write_png("BOSS.png")

order:[0, 1, 2, 3, 4, 5, 6, 7]
leaf_idx:7
gsts:[<causallearn.search.PermutationBased.gst.GST object at 0x32f352f80>, <causallearn.search.PermutationBased.gst.GST object at 0x32f353550>, <causallearn.search.PermutationBased.gst.GST object at 0x32f351900>, <causallearn.search.PermutationBased.gst.GST object at 0x351cba740>, <causallearn.search.PermutationBased.gst.GST object at 0x351cbb1c0>, <causallearn.search.PermutationBased.gst.GST object at 0x351cba4d0>, <causallearn.search.PermutationBased.gst.GST object at 0x351cbbfd0>, <causallearn.search.PermutationBased.gst.GST object at 0x351cbb760>]
=== GST #0 ===
vertex        : 0
forbidden     : [0, 7]
required      : []
root.grow_score : -1557844.0834874809
root.shrink_score: -1557844.0834874809
=== GST #1 ===
vertex        : 1
forbidden     : [1, 7]
required      : []
root.grow_score : -1212917.9297909713
root.shrink_score: -1212917.9297909713
=== GST #2 ===
vertex        : 2
forbidden     : [2, 7]
required      : []
root.grow_score : -16



In [43]:
'''3.5 NOTEARS'''

w = cd.notears_linear(df.values, lambda1= 0.5, loss_type='logistic')
print(w)
print(type(w))
NOTEARS_adjacency_matrix_ = pd.DataFrame(w, index=node_names, columns=node_names)
# print(NOTEARS_adjacency_matrix_)
graph_dot_NOTEARS = make_dot(w, labels=node_names)
graph_dot_NOTEARS.format = 'png'
output_path = graph_dot_NOTEARS.render(filename='NOTEARS',directory='.',cleanup=True)



  eAw = eAw @ eAw
  G_h = E.T * W * 2


[[ 0.00000000e+00  2.18422554e+02  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 6.76703577e+02  8.73079275e+00  0.00000000e+00 -9.41335933e-01
  -6.08063410e-01  0.00000000e+00 -1.47131015e+00  0.00000000e+00]
 [ 6.96749484e+00  4.48506213e-01  4.63757890e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 6.53292768e+00  4.18115529e-01  4.71956083e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 5.23799867e+00  3.37890919e-01  4.16918919e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 7.58280708e+00  4.90518701e-01  4.21249532e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 9.97381261e-01  0.00000000e+00  6.00413518e-01  0.00000000e+00
   

In [44]:
# Estimand and Estimate
import statsmodels.api as sm

# Using the Gaussian Family for multi class
method_params_gaussian = {
    "glm_family": sm.families.Gaussian()
}

# Using binomial for binary class
method_params_binomial = {
    "glm_family": sm.families.Binomial()
}


# Method
method_name = {
    'regression':'backdoor.generalized_linear_model',
    'distance_matching': 'backdoor.distance_matching',
    'propensity_score_stratification': 'backdoor.propensity_score_stratification',
    'propensity_score_matching': 'backdoor.propensity_score_matching',
    'frontdoor_regression': 'frontdoor.linear_regression'
}

graph = utils.make_graph(NOTEARS_adjacency_matrix_, labels=node_names)

causal_model, causal_estimand, causal_estimate = ide.estimate(
    df,
    treatment = 'messageID',
    outcome = 'class',
    method_params = method_params_binomial,
    method_name = 'frontdoor.linear_regression',
    graph = graph,
)
print(causal_estimand)
print('*-'*50)
print(causal_estimate)
graph_dot = utils.str_to_dot(graph.source)

No directed path from ['messageID'] to ['class'] in the causal graph.
Causal effect is zero.
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
*** Causal Estimate ***

## Identified estimand
No directed path from ['messageID'] to ['class'] in the causal graph.
Causal effect is zero.
## Realized estimand
None
## Estimate
Mean value: 0



In [45]:
# Refute estimand
refuter_list = ['bootstrap_refuter', "data_subset_refuter", 'dummy_outcome_refuter', 'placebo_treatment_refuter', 'random_common_cause']
refuter_method = 'bootstrap_refuter'

refuter_results = ref.causal_refuter(causal_model, causal_estimand, causal_estimate, refuter_method)
print(refuter_results)

AttributeError: 'CausalEstimate' object has no attribute 'estimator'