In [1]:
# Quick Start: Causal Analysis
# NOTE(review): the bare string below is the last expression of the cell, so
# Jupyter echoes its repr (with literal "\n" escapes) as the cell output; a
# markdown cell would render this outline more readably.
# NOTE(review): steps 2 (IAMB) and 4 (ATE) are not visible in the cells shown
# here - TODO confirm the outline still matches the notebook contents.
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [2]:
import os
import sys

# The project root is the parent of the current working directory (the
# notebook is expected to be launched from its own folder).
proj_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make the project importable: put the root at the front of the module search
# path, but only when it is not already listed, so re-running the cell does
# not add duplicates.
if not any(entry == proj_root for entry in sys.path):
    sys.path.insert(0, proj_root)

In [3]:
import warnings
import webbrowser
import os
import pandas as pd
import numpy as np
import shap
import networkx as nx
import inspect
from causallearn.search.ConstraintBased.FCI import fci
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import visualization as vis
from causal import utils
import itertools
from causallearn.utils.GraphUtils import GraphUtils
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import io
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import mutual_info_regression
import scipy as sp


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
if __name__ == '__main__':

    # 1. Data loading
    #
    # Read the CSV at `path` and keep a reproducible random subset of its rows
    # (fixed random_state) for the rest of the notebook.
    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path)

    # DataFrame.sample draws without replacement by default and raises
    # ValueError when n exceeds the number of rows, so cap the request at the
    # population size. For files with at least 50000 rows this is unchanged.
    sample_size = min(50000, len(data_origin))
    data_origin = data_origin.sample(n=sample_size, random_state=42)

    # Optional class filters (uncomment one to restrict the analysis):
    #   DoS + normal:
    #     data_origin = data_origin[data_origin['class'].isin([0, 11, 12])]
    #   Sybil + normal:
    #     data_origin = data_origin[data_origin['class'].isin([0, 14, 15, 16, 17])]
    print(data_origin.head(5))
    print('*-' * 50)


        type     sendTime  sender  senderPseudo  messageID  class  \
781974     4  64786.79042  102705     101027056  314608216      0   
937737     4  41423.20153   53703      10537034  162475454      0   
907828     4  40404.51425   52437      10524374  159087544      0   
784628     4  64742.75607  102843     101028436  314056141      0   
662460     4  63359.25486   97329      10973296  291942663      0   

               posx        posy  posz       spdx      spdy  spdz      aclx  \
781974   766.006969  388.626455     0 -11.730578 -4.089087     0 -0.020099   
937737   937.440493  885.596937     0   7.065199  6.790865     0  0.046714   
907828   212.555763  393.407409     0  -1.114930  6.797425     0 -0.272466   
784628  1265.321975  975.176831     0 -11.239370  0.950841     0  0.235476   
662460   202.301959  558.407352     0   0.000346  0.000346     0  0.000602   

            acly  aclz      hedx      hedy  hedz  Attack      Attack_type  
781974 -0.005977     0 -0.999999  0.0011

In [5]:
# Data cleaning: remove the columns listed in `drop_column`.
# NOTE(review): `drop_na=True` / `data_numerical=True` presumably drop rows with
# missing values and keep numeric data only - confirm in causal.preprocess.
drop_column = ['type', 'Attack', 'Attack_type']
data_processed = preprocess.clean(
    data_origin,
    drop_column=drop_column,
    drop_na=True,
    data_numerical=True,
)

# Standardize the features. The columns passed here appear to be excluded from
# scaling: the saved output shows `class` and `sendTime` on their raw scale.
data_processed = preprocess.standardize(data_processed, ['class', 'sendTime'])

# Collapse each per-axis triple into a single magnitude column,
# M = sqrt(X^2 + Y^2 + Z^2). Keys are the new column names, values are the
# axis columns they summarise; dict order fixes the processing order.
vector_groups = {
    'pos': ['posx', 'posy', 'posz'],
    'spd': ['spdx', 'spdy', 'spdz'],
    'acl': ['aclx', 'acly', 'aclz'],
    'hed': ['hedx', 'hedy', 'hedz'],
}
for magnitude_name, axis_columns in vector_groups.items():
    data_processed = preprocess.add_vector_magnitude_column(
        data_processed, list(axis_columns), magnitude_name
    )

# The per-axis columns are no longer needed once the magnitudes exist.
data_processed.drop(
    columns=[axis for axis_columns in vector_groups.values() for axis in axis_columns],
    inplace=True,
)

with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

             sendTime    sender  senderPseudo  messageID  class       pos  \
781974   64786.790420  0.813130      0.081318   0.772846      0  1.022659   
937737   41423.201530 -0.423231     -0.235959  -0.394801      0  1.096027   
907828   40404.514250 -0.455173     -0.236003  -0.420804      0  1.352391   
784628   64742.756070  0.816612      0.081323   0.768609      0  1.949102   
662460   63359.254860  0.677489     -0.234429   0.598884      0  1.085600   
...               ...       ...           ...        ...    ...       ...   
412557   51001.319270  0.086331     -0.235250   0.097283      0  1.712751   
148718    6078.303069 -1.765410     -0.272535  -1.638168      0  0.561974   
1008472  32627.751570 -0.874661      0.078972  -0.814320     18  2.296967   
873928   57558.785960  0.309775      2.990765   0.253301     19  1.190406   
217021   26351.621540 -1.438570     -0.237369  -1.436946      0  0.968191   

              spd       acl       hed  
781974   1.474932  0.118041  1.3843

In [6]:
# Split the label from the features, then rebuild a single frame in which
# `class` is the LAST column; `node_names` follows that column order.
X = data_processed.drop(columns=['class'])
y = data_processed['class'].copy()

df = pd.concat([X, y.to_frame(name='class')], axis=1)
print(df)

node_names = list(df.columns)
print(node_names)

             sendTime    sender  senderPseudo  messageID       pos       spd  \
781974   64786.790420  0.813130      0.081318   0.772846  1.022659  1.474932   
937737   41423.201530 -0.423231     -0.235959  -0.394801  1.096027  0.650514   
907828   40404.514250 -0.455173     -0.236003  -0.420804  1.352391  0.513848   
784628   64742.756070  0.816612      0.081323   0.768609  1.949102  1.286390   
662460   63359.254860  0.677489     -0.234429   0.598884  1.085600  0.310177   
...               ...       ...           ...        ...       ...       ...   
412557   51001.319270  0.086331     -0.235250   0.097283  1.712751  1.068474   
148718    6078.303069 -1.765410     -0.272535  -1.638168  0.561974  0.642490   
1008472  32627.751570 -0.874661      0.078972  -0.814320  2.296967  2.243776   
873928   57558.785960  0.309775      2.990765   0.253301  1.190406  0.328512   
217021   26351.621540 -1.438570     -0.237369  -1.436946  0.968191  1.704689   

              acl       hed  class  
78

In [7]:
'''
2.  Background knowledge creation
'''
# Build the background-knowledge object that is later passed to both PC and
# FCI as `background_knowledge=bk_pc`. Per the saved output the helper prints
# the causal-learn BackgroundKnowledge type of what it created.
# NOTE(review): presumably this constrains edges between the feature columns
# of X and the 'class' column (the discovery log orients every such edge as
# `feature --> class`) - confirm in causal.restriction.
bk_pc = restriction.PC_BGKnowledge(df, X, 'class')

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [8]:
# 3. Algorithms for causal discovery
# 3.1 Constraint-based methods

# Settings shared by both searches: significance level, background knowledge
# and the column names used to label graph nodes.
shared_kwargs = dict(
    alpha=0.01,
    background_knowledge=bk_pc,
    node_names=node_names,
)

# --- PC algorithm with the kernel-based independence test (KCI) ---
cg_pc = cd.pc_algorithm(df, kci, uc_rule=1, max_k=2, **shared_kwargs)

# Render the PC graph to PC.png through pydot.
# (alternative renderer, currently unused: vis.causal_graph(cg_pc, 'PC'))
pdy = GraphUtils.to_pydot(cg_pc.G)
pdy.write_png('PC.png')

# --- FCI algorithm with the kernel-based independence test (KCI) ---
# The wrapper returns the graph together with its edge list.
cg_fci, edges = cd.fci_algorithm(
    df,
    indep_test_func=kci,
    depth=-1,
    max_path_length=-1,
    verbose=False,
    show_progress=True,
    **shared_kwargs,
)

# Render the FCI graph to FCI.png; unlike the PC result, the FCI graph object
# is passed to pydot directly.
pdy = GraphUtils.to_pydot(cg_fci)
pdy.write_png('FCI.png')

Depth=5, working on node 8: 100%|██████████| 9/9 [00:00<00:00, 2759.21it/s]
Depth=0, working on node 8: 100%|██████████| 9/9 [00:00<00:00, 1686.34it/s]


Starting BK Orientation.
Orienting edge (Knowledge): senderPseudo --> class
Orienting edge (Knowledge): pos --> class
Orienting edge (Knowledge): spd --> class
Orienting edge (Knowledge): acl --> class
Orienting edge (Knowledge): hed --> class
Finishing BK Orientation.
Starting BK Orientation.
Orienting edge (Knowledge): senderPseudo --> class
Orienting edge (Knowledge): pos --> class
Orienting edge (Knowledge): spd --> class
Orienting edge (Knowledge): acl --> class
Orienting edge (Knowledge): hed --> class
Finishing BK Orientation.
sender --> sendTime
messageID --> sendTime
senderPseudo --> class
pos --> spd
pos --> class
spd --> class


In [9]:
# 3.2 Constrained functional causal models (LiNGAM family)
from causallearn.search.FCMBased import lingam

# --- ICA-LiNGAM ---
model_LiNGAM = lingam.ICALiNGAM(random_state=42)
model_LiNGAM.fit(df)

# Translate the estimated causal order from column indices to column names.
order_idx = model_LiNGAM.causal_order_
ordered_features = [node_names[col] for col in order_idx]
print("Causal order by name:", ordered_features)

# Label the estimated adjacency matrix with the column names and persist it.
# NOTE(review): in the saved output the `sendTime` row holds the coefficients
# of `sender`/`messageID`, which come earlier in the causal order, so rows
# look like effects and columns like causes - confirm against causal-learn.
B = model_LiNGAM.adjacency_matrix_
B_df = pd.DataFrame(B, index=node_names, columns=node_names)
print(B_df)
B_df.to_csv("lingam.csv", index=True)


# --- DirectLiNGAM ---
# NOTE(review): the variable and CSV names say "var", but the estimator fitted
# here is DirectLiNGAM, not VAR-LiNGAM; names are kept for downstream users.
model_varLiNGAM = lingam.DirectLiNGAM(
    random_state=42,
    prior_knowledge=None,
    apply_prior_knowledge_softly=False,
    measure='pwling',
)
model_varLiNGAM.fit(df)

order_idx = model_varLiNGAM.causal_order_
ordered_features = [node_names[col] for col in order_idx]
print("Causal order by name:", ordered_features)

B1 = model_varLiNGAM.adjacency_matrix_
B1_df = pd.DataFrame(B1, index=node_names, columns=node_names)
print(B1_df)
B1_df.to_csv("var_lingam.csv", index=True)

Causal order by name: ['sender', 'messageID', 'senderPseudo', 'pos', 'spd', 'hed', 'acl', 'sendTime', 'class']
              sendTime        sender  senderPseudo     messageID       pos  \
sendTime      0.000000  51368.315014     37.387661 -34624.895007  0.000000   
sender        0.000000      0.000000      0.000000      0.000000  0.000000   
senderPseudo  0.000000     -1.803414      0.000000      1.991136  0.000000   
messageID     0.000000      0.998317      0.000000      0.000000  0.000000   
pos           0.000000      0.331286      0.034326     -0.359816  0.000000   
spd           0.000000      0.363612      0.097989     -0.391037  0.140690   
acl           0.000000      0.243030      0.028576     -0.255524 -0.060014   
hed           0.000000     -0.069491     -0.013571      0.071039 -0.076251   
class         0.000093     -1.980905      2.121065      0.000000  1.020063   

                   spd       acl        hed  class  
sendTime      0.000000  0.000000  62.827393    0.0  
se

In [10]:
# 3.3 Additive noise model (ANM)
from causallearn.search.FCMBased.ANM.ANM import ANM

anm = ANM()

# One single-column DataFrame per variable, each built from an independent
# copy of the corresponding `data_processed` column.
sendTime = data_processed['sendTime'].copy().to_frame()
sender = data_processed['sender'].copy().to_frame()
senderPseudo = data_processed['senderPseudo'].copy().to_frame()
messageID = data_processed['messageID'].copy().to_frame()
pos = data_processed['pos'].copy().to_frame()
spd = data_processed['spd'].copy().to_frame()
hed = data_processed['hed'].copy().to_frame()
acl = data_processed['acl'].copy().to_frame()
y_test = data_processed['class'].copy().to_frame()

# Example pairwise direction tests (currently disabled):
#   print(anm.cause_or_effect(sendTime, sender))
#   print(anm.cause_or_effect(sender, messageID))
#   print(anm.cause_or_effect(sendTime, messageID))



In [11]:
'''3.4 GRaSP/Boss'''
# Permutation-based search. NOTE(review): `grasp` is imported but only `boss`
# is actually run in this cell.
from causallearn.search.PermutationBased.GRaSP import grasp
from causallearn.search.PermutationBased.BOSS import boss
from causallearn.utils.GraphUtils import GraphUtils

# Run BOSS on the full frame using the 'local_score_BIC' score function;
# `node_names` supplies the column names (presumably used as node labels).
G = boss(df, score_func='local_score_BIC', node_names=node_names)

# Render the resulting graph to boss.png through pydot.
pyd = GraphUtils.to_pydot(G)
pyd.write_png("boss.png")

BOSS edge count: 20    
BOSS completed in: 0.02s 




In [17]:
# 3.5 NOTEARS (linear)
#
# Reference signature, as recorded by the original author:
#   notears_linear(X, lambda1, loss_type, max_iter=100, h_tol=1e-8,
#                  rho_max=1e+16, w_threshold=0.3)
#
# NOTE(review): `sendTime` is left unscaled upstream and the saved output shows
# weights in the thousands in its column plus warning fragments after the
# matrix - the estimate may be dominated by scale; consider verifying on
# standardized input.
w = cd.notears_linear(df.to_numpy(), lambda1=0.5, loss_type='l2')
print(w)
print(w.shape)

# Attach the column names to both axes of the weight matrix and persist it.
results = pd.DataFrame(w, index=node_names, columns=node_names)
print(results)
results.to_csv("notears_linear.csv", index=True)


[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 8.21399601e+03  0.00000000e+00  0.00000000e+00 -1.89121455e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 1.40063230e+03  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   8.61925146e-01]
 [ 8.14131403e+03 -1.88800850e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-1.54146003e+02  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-7.15840500e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   3.02362595e-01]
 [-3.15016870e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.0000000

  eAw = eAw @ eAw
  G_h = E.T * W * 2


In [45]:
# Causal effect identification with DoWhy
import networkx as nx
from dowhy import CausalModel
from dowhy.causal_model import BackdoorAdjustment

# Hand-specified causal chain: sendTime -> senderPseudo -> class.
causal_graph = nx.DiGraph()
causal_graph.add_edges_from([
    ('sendTime', 'senderPseudo'),
    ('senderPseudo', 'class'),
])

# DoWhy receives the graph as a GML string.
# (equivalent DOT form, unused:
#  "digraph { sendTime -> senderPseudo; senderPseudo -> class; }")
graph_gml = "\n".join(nx.generate_gml(causal_graph))

model = CausalModel(
    data=df,
    treatment="sendTime",
    outcome="class",
    graph=graph_gml,
)
print(model)

# List the backdoor-adjustment strategies this DoWhy version knows about.
print(BackdoorAdjustment.__members__)

# NOTE(review): the saved output ends with a TypeError about a
# `backdoor_adjustment` keyword that this code does not pass - the output looks
# stale; re-run the cell to refresh it.
estimand = model.identify_effect(method_name="minimal-adjustment")
print(estimand)

<dowhy.causal_model.CausalModel object at 0x3666131f0>
{'BACKDOOR_DEFAULT': <BackdoorAdjustment.BACKDOOR_DEFAULT: 'default'>, 'BACKDOOR_EXHAUSTIVE': <BackdoorAdjustment.BACKDOOR_EXHAUSTIVE: 'exhaustive-search'>, 'BACKDOOR_MIN': <BackdoorAdjustment.BACKDOOR_MIN: 'minimal-adjustment'>, 'BACKDOOR_MAX': <BackdoorAdjustment.BACKDOOR_MAX: 'maximal-adjustment'>, 'BACKDOOR_EFFICIENT': <BackdoorAdjustment.BACKDOOR_EFFICIENT: 'efficient-adjustment'>, 'BACKDOOR_MIN_EFFICIENT': <BackdoorAdjustment.BACKDOOR_MIN_EFFICIENT: 'efficient-minimal-adjustment'>, 'BACKDOOR_MINCOST_EFFICIENT': <BackdoorAdjustment.BACKDOOR_MINCOST_EFFICIENT: 'efficient-mincost-adjustment'>}
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
     d               
───────────(E[class])
d[sendTime]          
Estimand assumption 1, Unconfoundedness: If U→{sendTime} and U→class then P(class|sendTime,,U) = P(class|sendTime,)

### Estimand : 2
Estimand name: iv
No such variab

TypeError: CausalModel.identify_effect() got an unexpected keyword argument 'backdoor_adjustment'