In [1]:
# NOTE(review): in the cells visible here `random_state` only ever appears as a
# keyword argument (e.g. `random_state=42`), never as this imported name. This
# looks like an accidental IDE auto-import that makes gmpy2 a hard dependency —
# confirm nothing else relies on it before removing.
from gmpy2.gmpy2 import random_state

# Quick Start: Causal Analysis
# The bare string below is the last expression of the cell, so Jupyter echoes it
# back as the cell output; it serves as the notebook's table of contents.
# NOTE(review): none of the visible cells call anything named IAMB — confirm the
# overview still matches the steps that are actually run.
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [2]:
import sys, os

# Resolve the project root: the parent of the directory the notebook is run from.
notebook_dir = os.getcwd()
proj_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Put the root at the very front of the module search path so the local
# `causal` package can be imported. The membership check keeps repeated runs
# of this cell from inserting the same entry again.
if proj_root not in sys.path:
    sys.path[:0] = [proj_root]

In [3]:
import pandas as pd
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import identifier as ide
from causal import visualization as vis
from causal import utils
from causal import refuter as ref
from causallearn.search.FCMBased.lingam.utils import make_dot
from causallearn.search.FCMBased import lingam
from dowhy import CausalModel
from causallearn.utils.GraphUtils import GraphUtils

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
if __name__ == '__main__':

    # ------------------------------------------------------------------
    # 1. Data Loading and Preprocessing
    # ------------------------------------------------------------------

    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path)
    print(f'total:{data_origin.shape}')

    # Keep one attack family together with the normal traffic (class 0).
    # Other class-id subsets that were used here, per the original notes:
    #   DDoS + normal:                          [0, 11, 12, 13, 16, 17]
    #   sybil + normal:                         [0, 14, 15, 16, 17]
    #   sybil disruptive/data reply + normal:   [0, 10]
    kept_classes = [0, 1, 2, 3, 4, 5, 6, 7, 8]  # fake data attack + normal
    keep_mask = data_origin['class'].isin(kept_classes)
    data_origin = data_origin[keep_mask]

    # NOTE(review): presumably relabels every kept attack class as 1 —
    # confirm against utils.collapse_classes.
    data_origin = utils.collapse_classes(data_origin, 1)
    is_positive = data_origin['class'] == 1
    print(data_origin[is_positive].shape)

    # Subsample with a fixed seed so the run is repeatable.
    # NOTE(review): the exact sampling policy lives in
    # utils.min_sample_retention — confirm what `test_size` means there.
    data_origin = utils.min_sample_retention(data_origin, test_size=50000, random_state=42)

    # Report how many class-1 rows survived the subsampling.
    is_positive = data_origin['class'] == 1
    print(data_origin[is_positive].shape)
    print('*-' * 50)




total:(1048575, 20)
(112754, 20)
(7656, 20)
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


In [5]:
# Data cleaning: remove the columns listed in `drop_column`; the remaining
# flags are forwarded unchanged to the project helper.
drop_column = ['type', 'Attack', 'Attack_type']
data_processed = preprocess.clean(data_origin, drop_column=drop_column, drop_na=True, data_numerical=True)

# Standardize the features. The columns listed here (target, timestamp and
# identifiers) still show raw values in the printed frame below.
# NOTE(review): presumably this list is the set of columns excluded from
# scaling — confirm against preprocess.standardize.
data_processed = preprocess.standardize(data_processed, ['class', 'sendTime', 'sender', 'senderPseudo', 'messageID'])

# Collapse each per-axis triple into a single magnitude column,
# M = sqrt(X^2 + Y^2 + Z^2). One mapping drives both the creation of the
# magnitude columns and the removal of their components, so the two steps
# cannot drift apart.
# NOTE(review): the magnitudes are computed after the standardize step above —
# confirm that this ordering is intended.
vector_components = {
    'pos': ['posx', 'posy', 'posz'],
    'spd': ['spdx', 'spdy', 'spdz'],
    'acl': ['aclx', 'acly', 'aclz'],
    'hed': ['hedx', 'hedy', 'hedz'],
}
for magnitude_name, component_cols in vector_components.items():
    data_processed = preprocess.add_vector_magnitude_column(data_processed, component_cols, magnitude_name)

# Drop the per-axis components (now summarised by the magnitudes) and `sender`.
# Reassigning instead of `inplace=True` avoids mutating any other frame that
# might share the same underlying object.
redundant_columns = [col for cols in vector_components.values() for col in cols] + ['sender']
data_processed = data_processed.drop(columns=redundant_columns)

# ID mapping to 0-N: replace raw identifiers with dense integer codes.
data_processed['senderPseudo'] = data_processed['senderPseudo'].astype('category').cat.codes
data_processed['messageID'] = data_processed['messageID'].astype('category').cat.codes

# NOTE(review): an earlier note at this point read "run two separate
# CausalModel objects with *_z columns as treatment"; no *_z columns are
# created in this cell — confirm whether that step is still planned.

# Show every column (no truncation) for a quick visual check.
with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

            sendTime  senderPseudo  messageID  class       pos       spd  \
451617   51985.73133          2886      22743      0  0.827389  1.537568   
298735   27499.58093           973       7018      0  1.656312  0.841002   
554824   61823.95331          4056      31851      0  1.334815  1.784739   
1016908  32808.52658          1702      13068      0  0.815839  1.455762   
642519   63105.75474          4614      36102      0  1.381778  0.826587   
...              ...           ...        ...    ...       ...       ...   
99779    74062.45545          6117      47765      0  1.862186  1.163106   
728606   64080.69903          5140      40137      0  0.822393  1.596405   
732069   64157.66261          5163      40435      0  1.098133  0.146566   
855588   56981.04996          3732      29409      0  0.685365  0.768358   
151887   11250.49351            47        338      0  0.554933  1.461002   

              acl       hed  
451617   0.630566  1.408862  
298735   2.542334  1.467555

In [6]:
# Separate the target from the features, then re-join them so that `class`
# ends up as the LAST column of `df` (the order `node_names` is taken from).
y = data_processed['class'].copy()
X = data_processed.drop(columns='class')

df = pd.concat([X, y.rename('class')], axis=1)
print(df)
node_names = df.columns.tolist()
print(node_names)

# Sanity check 1: constant columns.
# A column with at most one distinct value has zero variance. Counting
# distinct values is used instead of `df.var() == 0` because the computed
# variance of a constant float column can be a tiny non-zero number.
zeros = df.columns[df.nunique() <= 1]
print("zero var column：", zeros.tolist())

# Sanity check 2: perfectly (anti-)correlated column pairs.
# |r| is compared against 1 with a small tolerance: an exact `== 1.0` test
# misses pairs whose coefficient comes out as 0.9999999999999999 or
# 1.0000000000000002 due to floating-point rounding. NaN coefficients compare
# False and are therefore skipped, as before. Each pair is listed in both
# orders, as before.
CORR_TOL = 1e-9
corr = df.corr().abs()
perfect_pairs = [(i, j) for i in corr.columns for j in corr.columns
                 if i != j and corr.loc[i, j] >= 1.0 - CORR_TOL]
print("corr column：", perfect_pairs)

            sendTime  senderPseudo  messageID       pos       spd       acl  \
451617   51985.73133          2886      22743  0.827389  1.537568  0.630566   
298735   27499.58093           973       7018  1.656312  0.841002  2.542334   
554824   61823.95331          4056      31851  1.334815  1.784739  0.277195   
1016908  32808.52658          1702      13068  0.815839  1.455762  0.085157   
642519   63105.75474          4614      36102  1.381778  0.826587  2.221222   
...              ...           ...        ...       ...       ...       ...   
99779    74062.45545          6117      47765  1.862186  1.163106  0.005008   
728606   64080.69903          5140      40137  0.822393  1.596405  0.300590   
732069   64157.66261          5163      40435  1.098133  0.146566  0.009651   
855588   56981.04996          3732      29409  0.685365  0.768358  1.842533   
151887   11250.49351            47        338  0.554933  1.461002  0.555621   

              hed  class  
451617   1.408862      0

In [7]:
# ----------------------------------------------------------------------
# 2.  Background knowledge creation
# ----------------------------------------------------------------------
# Both helpers are told which node is the target.
# NOTE(review): the constraints they encode are defined in the project's
# `restriction` module — confirm there.
target_name = 'class'

bk_pc = restriction.PC_BGKnowledge(df, X, target_name)
bk_DirectLiNGAM = restriction.DirectLiNGAM_BGKnowledge(node_names, target_name)
# print(bk_DirectLiNGAM)

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [8]:
# ----------------------------------------------------------------------
# 3.  Algorithm for causal discovery
# ----------------------------------------------------------------------

# 3.1 Constrained Based

# PC algorithm with the kernel-based independence test (KCI).
# All tunables are gathered in one place and forwarded as keyword arguments.
pc_settings = {
    'indep_test_func': kci,
    'alpha': 0.01,
    'uc_rule': 1,
    'max_k': 2,
    'background_knowledge': bk_pc,
    'node_names': node_names,
}
cg_pc = cd.pc_algorithm(df, **pc_settings)

# Convert the learned graph to pydot and write it next to the notebook.
pdy = GraphUtils.to_pydot(cg_pc.G)
print(type(pdy))
pdy.write_png('PC.png')


# FCI algorithm with the kernel-based independence test (currently disabled)
# cg_fci, edges = cd.fci_algorithm(
#     df,
#     indep_test_func=kci,
#     alpha=0.01,
#     depth=-1,
#     max_path_length=-1,
#     verbose=False,
#     show_progress=True,
#     background_knowledge = bk_pc,
#     node_names = node_names
# )
# pdy = GraphUtils.to_pydot(cg_fci)
# pdy.write_png('FCI.png')

  0%|          | 0/8 [00:00<?, ?it/s]

<class 'pydot.core.Dot'>


In [9]:
# 3.2 constrained functional

# ICA-LiNGAM, seeded so repeated runs use the same random state.
model_LiNGAM = lingam.ICALiNGAM(random_state=42)
model_LiNGAM.fit(df)

# Show the estimated weighted adjacency matrix.
lingam_adjacency = model_LiNGAM.adjacency_matrix_
print(lingam_adjacency)

# Render the graph to LiNGAM.png in the current directory; `cleanup=True`
# is passed so graphviz removes its intermediate source file.
graph_dot_model_LiNGAM = make_dot(lingam_adjacency, labels=node_names)
graph_dot_model_LiNGAM.format = 'png'
output_path = graph_dot_model_LiNGAM.render(
    filename='LiNGAM',
    directory='.',
    cleanup=True,
)

# Direct-LiNGAM variant (currently disabled)
# model_DirectLiNGAM = lingam.DirectLiNGAM(
#     random_state=42,
#     prior_knowledge=None,
#     apply_prior_knowledge_softly=False,
#     measure='pwling',
# )
#
# model_DirectLiNGAM.fit(df)
# graph_dot_DirectLiNGAM = make_dot(model_DirectLiNGAM.adjacency_matrix_, labels=node_names)
# graph_dot_DirectLiNGAM.format = 'png'
# output_path = graph_dot_DirectLiNGAM.render(filename='DirectLiNGAM',directory='.',cleanup=True)

[[ 0.00000000e+00 -5.19599367e+01  7.75275195e+00  2.42643401e+02
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.35060400e+02
  -5.58300777e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  7.86555606e+00  0.00000000e+00 -3.18254266e+01
   0.00000000e+00  0.00000000e+00 -5.85921260e+01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -3.42995289e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  6.30711332e-01  6.48916554e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -6.35371020e-02
  -1.76313124e-01  0.00000000e+00 -6.74755698e-01  1.25526117e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  6.32451454e-02
   

In [10]:
# 3.4 Boss

# Score-based search on the raw numpy values using the BIC local score.
# Alternative score kept for reference: score_func='local_score_marginal_general'
boss_input = df.to_numpy()
G = cd.boss(boss_input, score_func='local_score_BIC', node_names=node_names)

# Export the resulting graph as an image next to the notebook.
pyd = GraphUtils.to_pydot(G)
pyd.write_png("BOSS.png")

BOSS edge count: 14    
BOSS completed in: 0.01s 




In [11]:
# 3.5 NOTEARS

# Linear NOTEARS on the raw values with an l2 loss; `w` is the estimated
# weight matrix (a numpy array, per the printed type).
w = cd.notears_linear(df.values, lambda1=0.5, loss_type='l2')
for shown in (w, type(w)):
    print(shown)

# Labelled copy of the weights, convenient for inspection by node name.
NOTEARS_adjacency_matrix_ = pd.DataFrame(data=w, index=node_names, columns=node_names)
# print(NOTEARS_adjacency_matrix_)

# Render the graph to NOTEARS.png in the current directory.
graph_dot_NOTEARS = make_dot(w, labels=node_names)
graph_dot_NOTEARS.format = 'png'
output_path = graph_dot_NOTEARS.render(
    filename='NOTEARS',
    directory='.',
    cleanup=True,
)



[[  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [-51.6918418    0.           7.85920255   0.           0.
    0.           0.           0.        ]
 [  7.71828506   0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]]
<class 'numpy.ndarray'>


In [12]:
# Estimand and Estimate
import statsmodels.api as sm

# Estimator options: a GLM with the Gaussian family, i.e. ordinary linear
# regression (the original note says this was chosen for multi class).
method_params = dict(glm_family=sm.families.Gaussian())

# Build the causal graph from the ICA-LiNGAM adjacency matrix.
graph = utils.make_graph(model_LiNGAM.adjacency_matrix_, labels=node_names)

# Identify and estimate the effect of treatment 'pos' on outcome 'class'.
estimation_result = ide.estimate(df, 'pos', 'class', method_params, graph)
causal_model, causal_estimand, causal_estimate = estimation_result

print(causal_estimand)
print('*-'*50)
print(causal_estimate)

# Keep a dot representation of the graph that was used.
graph_dot = utils.str_to_dot(graph.source)

Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
  d             
──────(E[class])
d[pos]          
Estimand assumption 1, Unconfoundedness: If U→{pos} and U→class then P(class|pos,,U) = P(class|pos,)

### Estimand : 2
Estimand name: iv
Estimand expression:
 ⎡                             -1⎤
 ⎢  d           ⎛  d          ⎞  ⎥
E⎢──────(class)⋅⎜──────([pos])⎟  ⎥
 ⎣d[hed]        ⎝d[hed]       ⎠  ⎦
Estimand assumption 1, As-if-random: If U→→class then ¬(U →→{hed})
Estimand assumption 2, Exclusion: If we remove {hed}→{pos}, then ¬({hed}→class)

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
  d             
──────(E[class])
d[pos]          
Estimand assump

In [16]:
# Refute estimand
# Refuter names listed by the original cell; only `refuter_method` is passed on.
refuter_list = [
    'bootstrap_refuter',
    'data_subset_refuter',
    'dummy_outcome_refuter',
    'placebo_treatment_refuter',
    'random_common_cause',
]
refuter_method = refuter_list[0]  # 'bootstrap_refuter'

refuter_results = ref.causal_refuter(
    causal_model,
    causal_estimand,
    causal_estimate,
    refuter_method,
)
print(refuter_results)

  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercep

Refute: Bootstrap Sample Dataset
Estimated effect:0.06373737162472634
New effect:0.06475616608610603
p value:0.97

