In [1]:
from gmpy2.gmpy2 import random_state

# Quick Start: Causal Analysis
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [1]:
import sys, os

# 1. Compute project root: one level up from the notebook folder
proj_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# 2. Prepend it to sys.path
if proj_root not in sys.path:
    sys.path.insert(0, proj_root)

In [2]:
import pandas as pd
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import identifier as ide
from causal import visualization as vis
from causal import utils
from causallearn.search.FCMBased.lingam.utils import make_dot
from causallearn.search.FCMBased import lingam
from dowhy import CausalModel
from causallearn.utils.GraphUtils import GraphUtils

In [17]:
if __name__ == '__main__':

    '''
    1. Data Loading and Preprocessing
    '''

    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path)
    print(f'total:{data_origin.shape}')
    # print(data_origin)

    # filter Ddos and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 11, 12, 13, 16, 17])]

    # filter fake data attack and normal data
    data_origin = data_origin[data_origin['class'].isin([0, 1, 2, 3, 4, 5, 6, 7, 8])]

    # filter sybil attack and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 14, 15, 16, 17])]
    # print(data_origin.head(5))

    # filter sybil disruptive/data reply and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 10])]
    # print(data_origin.head(5))

    data_origin = utils.collapse_classes(data_origin, 1)
    print(data_origin[data_origin['class'] == 1].shape)

    data_origin = utils.min_sample_retention(data_origin, test_size = 20000, random_state=42)
    # print(data_origin)

    print(data_origin[data_origin['class'] == 1].shape)
    print('*-' * 50)




total:(1048575, 20)
(112754, 20)
(3063, 20)
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


In [18]:
# Data Cleaning:
drop_column = ['type','Attack','Attack_type']
data_processed = preprocess.clean(data_origin, drop_column=drop_column, drop_na=True, data_numerical=True)

# Standardize features, target keep same as original data_processed:
data_processed = preprocess.standardize(data_processed, ['class','sendTime','sender','senderPseudo','messageID'])

# Combine axis related data such as pos, spd etc. by using M = \sqrt{X^2 + Y^2 + Z^2}
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['posx', 'posy', 'posz'], 'pos')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['spdx', 'spdy', 'spdz'], 'spd')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['aclx', 'acly', 'aclz'], 'acl')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['hedx', 'hedy', 'hedz'], 'hed')
data_processed.drop(
    columns=['posx', 'posy', 'posz', 'spdx', 'spdy', 'spdz', 'aclx', 'acly', 'aclz', 'hedx', 'hedy', 'hedz','sender'],
    inplace=True
)

# ID mapping to 0-N
data_processed['senderPseudo'] = data_processed['senderPseudo'].astype('category').cat.codes
data_processed['messageID'] = data_processed['messageID'].astype('category').cat.codes

# …run two separate CausalModel objects with *_z columns as treatment

with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

           sendTime  senderPseudo  messageID  class       pos       spd  \
777704  64770.68152          4913      17105      0  1.372042  0.137925   
755697  64497.01181          4791      16755      0  1.825699  1.546896   
952115  41856.61237          2070       7232      0  1.443544  1.487799   
47071   72759.86293          5244      18086      0  2.087577  1.485425   
971167  42701.72969          2181       7654      0  1.939775  1.356387   
...             ...           ...        ...    ...       ...       ...   
985798  43205.18483          2267       7865      1  1.408265  3.559244   
79607   73373.47047          5414      18686      0  0.928774  1.295812   
467487  52284.59261          2695       9400      0  1.963785  1.287427   
947042  41610.44452          2040       7130      0  0.953436  0.649031   
579052  62204.98480          3809      13393      0  1.416715  1.332581   

             acl       hed  
777704  0.012439  1.327697  
755697  1.234332  1.551161  
952115  0.24

In [19]:
# X = data_processed.iloc[:, 1:].copy()     # 8 features
# y = data_processed.iloc[:, 0].copy()
#
y = data_processed['class'].copy()
X = data_processed.drop(columns='class')

# print(X)
# print('*-' * 50)
# print(y)
# print('*-' * 50)

df = pd.concat([X, y.rename('class')], axis=1)
print(df)
node_names = df.columns.tolist()
print(node_names)

zeros = df.columns[df.var()==0]
print("zero var column：", zeros.tolist())

corr = df.corr().abs()
perfect_pairs = [(i,j) for i in corr.columns for j in corr.columns
                 if i!=j and corr.loc[i,j]==1.0]
print("corr column：", perfect_pairs)

           sendTime  senderPseudo  messageID       pos       spd       acl  \
777704  64770.68152          4913      17105  1.372042  0.137925  0.012439   
755697  64497.01181          4791      16755  1.825699  1.546896  1.234332   
952115  41856.61237          2070       7232  1.443544  1.487799  0.248442   
47071   72759.86293          5244      18086  2.087577  1.485425  0.139511   
971167  42701.72969          2181       7654  1.939775  1.356387  0.117084   
...             ...           ...        ...       ...       ...       ...   
985798  43205.18483          2267       7865  1.408265  3.559244  0.203661   
79607   73373.47047          5414      18686  0.928774  1.295812  0.170283   
467487  52284.59261          2695       9400  1.963785  1.287427  0.117453   
947042  41610.44452          2040       7130  0.953436  0.649031  1.940632   
579052  62204.98480          3809      13393  1.416715  1.332581  0.149151   

             hed  class  
777704  1.327697      0  
755697  1.5

In [21]:
'''
2.  Background knowledge creation
'''
bk_pc = restriction.PC_BGKnowledge(df, X, 'class')
bk_DirectLiNGAM = restriction.DirectLiNGAM_BGKnowledge(node_names, 'class')
# print(bk_DirectLiNGAM)

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [22]:
'''
3.  Algorithm for causal discovery
'''

'''3.1 Constrained Based'''
# PC algorithm with Kernal-based independence test
cg_pc = cd.pc_algorithm(
    df,
    indep_test_func=kci,
    alpha = 0.01,
    uc_rule = 1,
    max_k = 2,
    background_knowledge = bk_pc,
    node_names = node_names
)

pdy = GraphUtils.to_pydot(cg_pc.G)
print(type(pdy))
pdy.write_png('PC.png')


# FCI algorithm with Kernal-based independence test
# cg_fci, edges = cd.fci_algorithm(
#     df,
#     indep_test_func=kci,
#     alpha=0.01,
#     depth=-1,
#     max_path_length=-1,
#     verbose=False,
#     show_progress=True,
#     background_knowledge = bk_pc,
#     node_names = node_names
# )
# pdy = GraphUtils.to_pydot(cg_fci)
# pdy.write_png('FCI.png')

  0%|          | 0/8 [00:00<?, ?it/s]

<class 'pydot.core.Dot'>


In [23]:
'''3.2 constrained functional'''
# LiNGAM
model_LiNGAM = lingam.ICALiNGAM(random_state=42)
model_LiNGAM.fit(df)
print(model_LiNGAM.adjacency_matrix_)
graph_dot_model_LiNGAM = make_dot(model_LiNGAM.adjacency_matrix_, labels=node_names)
graph_dot_model_LiNGAM.format = 'png'
output_path = graph_dot_model_LiNGAM.render(filename='LiNGAM',directory='.',cleanup=True)
#
#
# # Direct-LiNGAM
# model_DirectLiNGAM = lingam.DirectLiNGAM(
#     random_state=42,
#     prior_knowledge=None,
#     apply_prior_knowledge_softly=False,
#     measure='pwling',
# )
#
# model_DirectLiNGAM.fit(df)
# graph_dot_DirectLiNGAM = make_dot(model_DirectLiNGAM.adjacency_matrix_, labels=node_names)
# graph_dot_DirectLiNGAM.format = 'png'
# output_path = graph_dot_DirectLiNGAM.render(filename='DirectLiNGAM',directory='.',cleanup=True)

[[ 0.00000000e+00 -5.17041827e+01  1.76793900e+01  1.65474586e+02
   0.00000000e+00 -4.73994207e+01  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.25980773e+02
  -3.81151385e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  3.48799551e+00  0.00000000e+00 -1.21043423e+01
   0.00000000e+00  0.00000000e+00 -3.14683356e+01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -2.40110348e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  4.55714293e-02
   0.00000000e+00  0.00000000e+00  6.19179862e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -7.34305861e-02
  -1.90515410e-01  0.00000000e+00 -6.02142859e-01  1.38077961e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  5.48923106e-02
   

In [24]:
'''3.4 Boss'''

# G = boss(df.to_numpy(), score_func='local_score_marginal_general', node_names=node_names)
G = cd.boss(df.to_numpy(), score_func='local_score_BIC', node_names=node_names)
pyd = GraphUtils.to_pydot(G)
pyd.write_png("BOSS.png")

BOSS edge count: 12    
BOSS completed in: 0.01s 




In [25]:
'''3.5 NOTEARS'''

w = cd.notears_linear(df.values, lambda1= 0.5, loss_type='l2')
print(w)
print(type(w))
NOTEARS_adjacency_matrix_ = pd.DataFrame(w, index=node_names, columns=node_names)
# print(NOTEARS_adjacency_matrix_)
graph_dot_NOTEARS = make_dot(w, labels=node_names)
graph_dot_NOTEARS.format = 'png'
output_path = graph_dot_NOTEARS.render(filename='NOTEARS',directory='.',cleanup=True)



[[  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [-51.56949502   0.           3.4763198    0.           0.
    0.           0.           0.        ]
 [ 17.6400642    0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]]
<class 'numpy.ndarray'>


In [26]:
# Estimand
import statsmodels.api as sm

# Using the Gaussian Family for multi class
method_params = {
    "glm_family": sm.families.Gaussian()          # ≡ linear regression
}

graph = utils.make_graph(model_LiNGAM.adjacency_matrix_, labels=node_names)

causal_estimand, causal_estimate = ide.estimate(df, 'acl','class', method_params, graph)
print(causal_estimand)
print(causal_estimate)
# graph_dot = utils.str_to_dot(graph.source)




No directed path from ['acl'] to ['class'] in the causal graph.
Causal effect is zero.
*** Causal Estimate ***

## Identified estimand
No directed path from ['acl'] to ['class'] in the causal graph.
Causal effect is zero.
## Realized estimand
None
## Estimate
Mean value: 0



In [12]:
# Define Causal Model
model=CausalModel(
        data = df,
        treatment='acl',
        outcome='class',
        graph=graph_dot)

# Identification
identified_estimand = model.identify_effect(proceed_when_unidentifiable=False)
print(identified_estimand)

# Estimation
estimate = model.estimate_effect(identified_estimand,
                                # method_name="backdoor.linear_regression",
                                method_name="backdoor.generalized_linear_model",
                                # control_value=0,
                                # treatment_value=1,
                                method_params=method_params,
                                confidence_intervals=True,
                                target_units='ate',
                                test_significance=True)
print("Causal Estimate is: " + str(estimate.value))
print(estimate)
print(type(estimate))



NameError: name 'graph_dot' is not defined

In [16]:
# Refute estimand
from dowhy.causal_refuter import CausalRefuter, CausalRefutation

refuter_list = ["data_subset_refuter", 'random_common_cause', 'dummy_outcome_refuter', 'placebo_treatment_refuter','bootstrap_refuter']


ref = model.refute_estimate(
        # df,
        identified_estimand,
        estimate,
        method_name="random_common_cause",
        show_progress_bar=True,
        # optional parameters ↓
        # placebo_type="permute",          # default: permute treatment column
        # subset_fraction = 0.85,
        num_simulations = 200,
        # random_state=42,
        n_job = -1,
        # noise = 0.1
)


TypeError: 'NoneType' object is not subscriptable

In [65]:
print(ref)

Refute: Add a random common cause
Estimated effect:-3.527382616219654e-06
New effect:-3.5287652550364257e-06
p value:0.94

