In [39]:
# NOTE(review): `random_state` is never referenced as a name in the visible
# cells (later cells only pass `random_state=` as a keyword argument), so this
# looks like an accidental IDE auto-import -- confirm and remove if unused.
from gmpy2.gmpy2 import random_state

# Quick Start: Causal Analysis
# The bare string below is the last expression of the cell, so the notebook
# echoes its repr as the cell output.
# NOTE(review): the outline lists an IAMB feature-selection step, but no IAMB
# call appears in the visible cells -- confirm the outline is still current.
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [40]:
import sys, os

# The project root is the parent of the current working directory (the
# notebook folder).  `abspath` resolves the relative '..' against the cwd.
proj_root = os.path.abspath(os.pardir)

# Put the root at the front of the import search path, but only when it is
# not listed yet, so re-running this cell never stacks duplicate entries.
if sys.path.count(proj_root) == 0:
    sys.path[:0] = [proj_root]

In [41]:
import warnings
import webbrowser
import os
import pandas as pd
import numpy as np
import shap
import networkx as nx
import inspect
from causallearn.search.ConstraintBased.FCI import fci
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import visualization as vis
from causal import utils
from causallearn.search.FCMBased.lingam.utils import make_dot
from causallearn.search.FCMBased import lingam
from dowhy import CausalModel
import itertools
from causallearn.utils.GraphUtils import GraphUtils
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import io
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import mutual_info_regression
import scipy as sp
import pydot


In [42]:
# In a notebook kernel `__name__` is '__main__', so this guard always runs
# here; it is kept so the cell behaves as before if exported as a script.
if __name__ == '__main__':

    '''
    1. Data Loading and Preprocessing
    '''

    # Load the raw CSV; the path is relative to the notebook's working directory.
    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path)

    # Work on a reproducible random subsample of at most 50,000 rows.
    # Capping the request at the frame length keeps `DataFrame.sample` from
    # raising ValueError when the CSV holds fewer rows than requested; for
    # larger files the result is identical to the previous fixed n=50000.
    sample_size = min(50000, len(data_origin))
    data_origin = data_origin.sample(n=sample_size, random_state=42)

    # Optional class filters, currently disabled:
    # filter dos and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 11, 12])]

    # filter sybil and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 14, 15, 16, 17])]

    # Preview the first rows, followed by a visual separator line.
    print(data_origin.head(5))
    print('*-' * 50)


        type     sendTime  sender  senderPseudo  messageID  class  \
781974     4  64786.79042  102705     101027056  314608216      0   
937737     4  41423.20153   53703      10537034  162475454      0   
907828     4  40404.51425   52437      10524374  159087544      0   
784628     4  64742.75607  102843     101028436  314056141      0   
662460     4  63359.25486   97329      10973296  291942663      0   

               posx        posy  posz       spdx      spdy  spdz      aclx  \
781974   766.006969  388.626455     0 -11.730578 -4.089087     0 -0.020099   
937737   937.440493  885.596937     0   7.065199  6.790865     0  0.046714   
907828   212.555763  393.407409     0  -1.114930  6.797425     0 -0.272466   
784628  1265.321975  975.176831     0 -11.239370  0.950841     0  0.235476   
662460   202.301959  558.407352     0   0.000346  0.000346     0  0.000602   

            acly  aclz      hedx      hedy  hedz  Attack      Attack_type  
781974 -0.005977     0 -0.999999  0.0011

In [43]:
# --- Data cleaning ---
# Columns removed up front; the remaining flags are forwarded unchanged to the
# project's `preprocess.clean` helper.
drop_column = ['type', 'Attack', 'Attack_type']
data_processed = preprocess.clean(
    data_origin,
    drop_column=drop_column,
    drop_na=True,
    data_numerical=True,
)

# --- Standardization ---
# NOTE(review): the listed columns appear to be the ones left unscaled (the
# printed frame still shows sendTime and class in raw units) -- confirm
# against `preprocess.standardize`.
data_processed = preprocess.standardize(
    data_processed,
    ['class', 'sendTime', 'sender', 'senderPseudo'],
)

# --- Vector magnitudes ---
# Collapse every x/y/z triple into a single magnitude column,
# M = sqrt(X^2 + Y^2 + Z^2), then discard the per-axis components.
axis_columns = []
for prefix in ('pos', 'spd', 'acl', 'hed'):
    components = [prefix + axis for axis in 'xyz']
    data_processed = preprocess.add_vector_magnitude_column(data_processed, components, prefix)
    axis_columns.extend(components)
data_processed.drop(columns=axis_columns, inplace=True)

# --- Identifier encoding ---
# Replace the raw sender identifiers with their category codes.
for id_column in ('sender', 'senderPseudo'):
    data_processed[id_column] = data_processed[id_column].astype('category').cat.codes

# Show the processed frame with every column visible, then its type.
with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

             sendTime  sender  senderPseudo  messageID  class       pos  \
781974   64786.790420    6483          6792   0.772846      0  1.022659   
937737   41423.201530    2622          2519  -0.394801      0  1.096027   
907828   40404.514250    2423          2330  -0.420804      0  1.352391   
784628   64742.756070    6504          6812   0.768609      0  1.949102   
662460   63359.254860    5638          5402   0.598884      0  1.085600   
...               ...     ...           ...        ...    ...       ...   
412557   51001.319270    3154          3027   0.097283      0  1.712751   
148718    6078.303069      30            30  -1.638168      0  0.561974   
1008472  32627.751570    1973          6359  -0.814320     18  2.296967   
873928   57558.785960    4568         10089   0.253301     19  1.190406   
217021   26351.621540     552           546  -1.436946      0  0.968191   

              spd       acl       hed  
781974   1.474932  0.118041  1.384319  
937737   0.650514  

In [44]:
# Separate the target label from the feature columns.
y = data_processed.loc[:, 'class'].copy()
X = data_processed.drop(columns=['class'])

# Re-assemble one frame with the features first and 'class' as the last
# column; this column order is also what `node_names` records below.
df = pd.concat((X, y.rename('class')), axis='columns')
print(df)

# Column labels, in frame order, used as graph node names by later cells.
node_names = list(df.columns)
print(node_names)

             sendTime  sender  senderPseudo  messageID       pos       spd  \
781974   64786.790420    6483          6792   0.772846  1.022659  1.474932   
937737   41423.201530    2622          2519  -0.394801  1.096027  0.650514   
907828   40404.514250    2423          2330  -0.420804  1.352391  0.513848   
784628   64742.756070    6504          6812   0.768609  1.949102  1.286390   
662460   63359.254860    5638          5402   0.598884  1.085600  0.310177   
...               ...     ...           ...        ...       ...       ...   
412557   51001.319270    3154          3027   0.097283  1.712751  1.068474   
148718    6078.303069      30            30  -1.638168  0.561974  0.642490   
1008472  32627.751570    1973          6359  -0.814320  2.296967  2.243776   
873928   57558.785960    4568         10089   0.253301  1.190406  0.328512   
217021   26351.621540     552           546  -1.436946  0.968191  1.704689   

              acl       hed  class  
781974   0.118041  1.38431

In [45]:
'''
2.  Background knowledge creation
'''
# Build the background-knowledge object that the PC and FCI calls receive via
# their `background_knowledge` argument.  The project helper is given the full
# frame, the feature-only frame and the name of the target column.
# NOTE(review): the orientation log printed by the discovery cell suggests the
# knowledge encodes "feature --> class" constraints -- confirm against the
# `restriction` module.
bk_pc = restriction.PC_BGKnowledge(df, X, 'class')

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [46]:
'''
3.  Algorithm for causal discovery
'''

'''3.1 Constrained Based'''
# --- PC algorithm with the kernel-based independence test (kci) ---
pc_options = {
    'alpha': 0.01,
    'uc_rule': 1,
    'max_k': 2,
    'background_knowledge': bk_pc,
    'node_names': node_names,
}
cg_pc = cd.pc_algorithm(df, kci, **pc_options)

# Export the PC graph as an image via pydot.
# (Alternative renderer, disabled: vis.causal_graph(cg_pc, 'PC'))
pdy = GraphUtils.to_pydot(cg_pc.G)
print(type(pdy))
pdy.write_png('PC.png')


# --- FCI algorithm with the kernel-based independence test (kci) ---
fci_options = {
    'indep_test_func': kci,
    'alpha': 0.01,
    'depth': -1,
    'max_path_length': -1,
    'verbose': False,
    'show_progress': True,
    'background_knowledge': bk_pc,
    'node_names': node_names,
}
cg_fci, edges = cd.fci_algorithm(df, **fci_options)

# Export the FCI graph as an image via pydot (re-uses the `pdy` name).
pdy = GraphUtils.to_pydot(cg_fci)
pdy.write_png('FCI.png')

Depth=4, working on node 8: 100%|██████████| 9/9 [00:00<00:00, 2231.67it/s]

<class 'pydot.core.Dot'>



Depth=0, working on node 8: 100%|██████████| 9/9 [00:00<00:00, 1444.60it/s]


Starting BK Orientation.
Orienting edge (Knowledge): senderPseudo --> class
Orienting edge (Knowledge): pos --> class
Orienting edge (Knowledge): spd --> class
Orienting edge (Knowledge): acl --> class
Orienting edge (Knowledge): hed --> class
Finishing BK Orientation.
Starting BK Orientation.
Orienting edge (Knowledge): senderPseudo --> class
Orienting edge (Knowledge): pos --> class
Orienting edge (Knowledge): spd --> class
Orienting edge (Knowledge): acl --> class
Orienting edge (Knowledge): hed --> class
Finishing BK Orientation.
senderPseudo --> class
pos --> spd
pos --> class
spd --> class


In [47]:
'''3.2 constrained functional'''


def _render_adjacency_png(adjacency, stem):
    """Draw `adjacency` with `make_dot` (labelled by `node_names`) and render it.

    The graph's format is set to 'png' and it is rendered with
    `filename=stem` into directory '.', with `cleanup=True`.
    Returns the graph object and the value returned by `render`.
    """
    dot = make_dot(adjacency, labels=node_names)
    dot.format = 'png'
    return dot, dot.render(filename=stem, directory='.', cleanup=True)


# --- ICA-LiNGAM ---
model_LiNGAM = lingam.ICALiNGAM(random_state=42)
model_LiNGAM.fit(df)
print(model_LiNGAM.adjacency_matrix_)
graph_dot_model_LiNGAM, output_path = _render_adjacency_png(
    model_LiNGAM.adjacency_matrix_, 'LiNGAM'
)


# --- DirectLiNGAM ---
direct_lingam_options = {
    'random_state': 42,
    'prior_knowledge': None,
    'apply_prior_knowledge_softly': False,
    'measure': 'pwling',
}
model_DirectLiNGAM = lingam.DirectLiNGAM(**direct_lingam_options)

model_DirectLiNGAM.fit(df)
graph_dot_DirectLiNGAM, output_path = _render_adjacency_png(
    model_DirectLiNGAM.adjacency_matrix_, 'DirectLiNGAM'
)

[[ 0.00000000e+00  3.49169239e+00 -1.32768531e-01  9.46641359e+03
   6.14773096e+01  5.95169997e+01  2.77325640e+01 -1.93874612e+02
   1.02038522e+01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.14019162e+03
   1.15946382e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 0.00000000e+00  8.68108602e-01  0.00000000e+00  1.90808814e+02
  -2.98479979e+01  1.00405696e+02  1.78582012e+01  0.00000000e+00
   1.03910318e+02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -2.16296559e-02
   0.00000000e+00  0.00000000e+00  0.00000000e+00 -6.80278192e-01
   0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   8.28039098e-02  0.00000000e+00 -1.89937963e-02 -7.67663340e-01
   0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -5.75172718e-02  0.0000000

In [48]:
# '''3.3 RCD'''


In [49]:
'''3.4 Boss'''
from causallearn.search.PermutationBased.BOSS import boss
from causallearn.utils.GraphUtils import GraphUtils

# Run BOSS on the raw value matrix using the BIC local score.
# (Alternative score, disabled: 'local_score_marginal_general')
boss_options = {'score_func': 'local_score_BIC', 'node_names': node_names}
G = boss(df.to_numpy(), **boss_options)

# Export the resulting graph as an image via pydot.
pyd = GraphUtils.to_pydot(G)
pyd.write_png("BOSS.png")

BOSS edge count: 19    
BOSS completed in: 0.03s 




In [50]:
'''3.5 NOTEARS'''

# Signature reminder:
# notears_linear(X, lambda1, loss_type, max_iter=100, h_tol=1e-8, rho_max=1e+16, w_threshold=0.3)
w = cd.notears_linear(df.values, lambda1=0.5, loss_type='l2')

# Show the returned matrix, then its type.
for shown in (w, type(w)):
    print(shown)

# Labelled copy of the matrix, with rows and columns named after the nodes.
NOTEARS_adjacency_matrix_ = pd.DataFrame(data=w, index=node_names, columns=node_names)

# Draw the matrix as a graph and render it with format 'png'.
graph_dot_NOTEARS = make_dot(w, labels=node_names)
graph_dot_NOTEARS.format = 'png'
output_path = graph_dot_NOTEARS.render(
    filename='NOTEARS',
    directory='.',
    cleanup=True,
)



[[  0.           0.           0.           0.           0.
    0.           0.           0.           0.        ]
 [  7.63648756   0.           0.87480105   0.           0.
    0.           0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.           0.        ]
 [  9.24616396   0.60954622   0.           0.           0.
    0.           0.           0.           0.        ]
 [  0.           0.           1.24702188   0.           0.
    0.           0.           0.           0.        ]
 [  0.81840374  -0.37434358   3.99013381   0.           0.
    0.           0.           0.           0.        ]
 [  0.44610063   0.           1.00770876   0.           0.
    0.           0.           0.           0.        ]
 [  0.           0.          -0.83189466   0.           0.
    0.           0.           0.           0.        ]
 [  4.07665369  -7.76216792 106.87619634   0.           0.
    0.           0.          

In [51]:
# Causal effect identification and estimation with DoWhy.
import statsmodels.api as sm

# GLM settings: the Gaussian family is used for the multi-class outcome,
# which makes the GLM equivalent to linear regression.
method_params = dict(glm_family=sm.families.Gaussian())

# Obtain the graph in a valid dot format.
# NOTE(review): despite the "LiNGAM" in the names below, the graph is built
# from `w`, which the NOTEARS cell assigned -- confirm which adjacency matrix
# is intended here.
graph_dot_LiNGAM = utils.make_graph(w, labels=node_names)

# Causal model: effect of 'sender' on 'class' under the graph above.
model = CausalModel(
    data=df,
    treatment='sender',
    outcome='class',
    graph=utils.str_to_dot(graph_dot_LiNGAM.source),
)

# Identification step.
identified_estimand_LiNGAM = model.identify_effect(proceed_when_unidentifiable=False)
print(identified_estimand_LiNGAM)

# Estimation step.
# Disabled alternatives: method_name="backdoor.linear_regression",
# control_value=0, treatment_value=1.
estimation_options = {
    'method_name': "backdoor.generalized_linear_model",
    'method_params': method_params,
    'confidence_intervals': True,
    'target_units': 'ate',
    'test_significance': True,
}
estimate = model.estimate_effect(identified_estimand_LiNGAM, **estimation_options)
print("Causal Estimate is: " + str(estimate.value))
print(estimate)

Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
    d                                    
─────────(E[class|senderPseudo,sendTime])
d[sender]                                
Estimand assumption 1, Unconfoundedness: If U→{sender} and U→class then P(class|sender,senderPseudo,sendTime,U) = P(class|sender,senderPseudo,sendTime)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!



  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercep

Causal Estimate is: -0.0020426098646808555
*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
    d                                    
─────────(E[class|senderPseudo,sendTime])
d[sender]                                
Estimand assumption 1, Unconfoundedness: If U→{sender} and U→class then P(class|sender,senderPseudo,sendTime,U) = P(class|sender,senderPseudo,sendTime)

## Realized estimand
b: class~Sigmoid(sender+senderPseudo+sendTime)
Target units: ate

## Estimate
Mean value: -0.0020426098646808555
p-value: [0, 0.001]
95.0% confidence interval: (-0.0021829520492495647, -0.0019167450584252066)



  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
