In [1]:
# NOTE(review): the imported name `random_state` is not referenced by any cell
# visible here (later cells only pass `random_state=` as a keyword argument).
# This looks like an accidental IDE auto-import — confirm before removing.
from gmpy2.gmpy2 import random_state

# Quick Start: Causal Analysis
# The bare string below is the last expression of the cell, so the notebook
# echoes it back as the cell output.
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [2]:
import os
import sys

# Make the project-local packages importable from this notebook.
# The project root is taken to be the parent of the current working directory
# (assumes the kernel was started inside the notebook folder — TODO confirm).
notebook_dir = os.getcwd()
proj_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Put the root at the front of the search path, but only once, so that
# re-running the cell does not keep adding duplicate entries.
already_registered = proj_root in sys.path
if not already_registered:
    sys.path.insert(0, proj_root)

In [3]:
import warnings
import webbrowser
import os
import pandas as pd
import numpy as np
import shap
import networkx as nx
import inspect
from causallearn.search.ConstraintBased.FCI import fci
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import visualization as vis
from causal import utils
from causallearn.search.FCMBased.lingam.utils import make_dot
from causallearn.search.FCMBased import lingam
from dowhy import CausalModel
import itertools
from causallearn.utils.GraphUtils import GraphUtils
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import io
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import mutual_info_regression
import scipy as sp
import pydot


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
if __name__ == '__main__':

    '''
    1. Data Loading and Preprocessing
    '''

    # Read the CSV and immediately draw a reproducible 80,000-row subsample
    # (fixed seed), so every later step works on the same rows.
    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path).sample(n=80000, random_state=42)

    # Alternative subset, kept for reference: filter dos and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 11, 12])]

    # Keep only the class codes for sybil and normal data. The filter runs
    # after sampling, so fewer than 80,000 rows may remain.
    sybil_and_normal = [0, 14, 15, 16, 17]
    class_mask = data_origin['class'].isin(sybil_and_normal)
    data_origin = data_origin[class_mask]

    print(data_origin.head(5))
    print('*-' * 50)


        type     sendTime  sender  senderPseudo  messageID  class  \
781974     4  64786.79042  102705     101027056  314608216      0   
937737     4  41423.20153   53703      10537034  162475454      0   
907828     4  40404.51425   52437      10524374  159087544      0   
784628     4  64742.75607  102843     101028436  314056141      0   
662460     4  63359.25486   97329      10973296  291942663      0   

               posx        posy  posz       spdx      spdy  spdz      aclx  \
781974   766.006969  388.626455     0 -11.730578 -4.089087     0 -0.020099   
937737   937.440493  885.596937     0   7.065199  6.790865     0  0.046714   
907828   212.555763  393.407409     0  -1.114930  6.797425     0 -0.272466   
784628  1265.321975  975.176831     0 -11.239370  0.950841     0  0.235476   
662460   202.301959  558.407352     0   0.000346  0.000346     0  0.000602   

            acly  aclz      hedx      hedy  hedz  Attack      Attack_type  
781974 -0.005977     0 -0.999999  0.0011

In [5]:
# Data cleaning: the listed columns are handed to preprocess.clean as
# `drop_column`, together with the drop_na / data_numerical switches.
drop_column = ['type','Attack','Attack_type']
data_processed = preprocess.clean(
    data_origin,
    drop_column=drop_column,
    drop_na=True,
    data_numerical=True,
)

# Standardize features. The listed columns appear to be left on their original
# scale (target and identifiers) — confirm against preprocess.standardize.
unscaled_columns = ['class','sendTime','sender','senderPseudo']
data_processed = preprocess.standardize(data_processed, unscaled_columns)

# Collapse each per-axis triple into one magnitude column,
# M = \sqrt{X^2 + Y^2 + Z^2}. The mapping is: new column -> its x/y/z components.
axis_groups = {
    'pos': ['posx', 'posy', 'posz'],
    'spd': ['spdx', 'spdy', 'spdz'],
    'acl': ['aclx', 'acly', 'aclz'],
    'hed': ['hedx', 'hedy', 'hedz'],
}
# The per-axis components and the raw 'sender' column are dropped once the
# magnitudes exist. The list is built before the loop so it cannot be
# affected by the helper calls.
obsolete_columns = [axis for axes in axis_groups.values() for axis in axes] + ['sender']

for magnitude_name, axis_columns in axis_groups.items():
    data_processed = preprocess.add_vector_magnitude_column(data_processed, axis_columns, magnitude_name)

data_processed.drop(columns=obsolete_columns, inplace=True)

# data_processed['sender'] = data_processed['sender'].astype('category').cat.codes
# Replace the raw pseudonym values by integer category codes.
pseudo_codes = data_processed['senderPseudo'].astype('category').cat.codes
data_processed['senderPseudo'] = pseudo_codes

# Show the frame without column truncation, then its type.
with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

           sendTime  senderPseudo  messageID  class       pos       spd  \
781974  64786.79042          5388   0.769131      0  1.079276  1.487702   
937737  41423.20153          2017  -0.396491      0  1.150458  0.874873   
907828  40404.51425          1865  -0.422448      0  1.381748  0.616549   
784628  64742.75607          5405   0.764901      0  2.025261  1.309596   
662460  63359.25486          4317   0.595471      0  1.089473  0.187500   
...             ...           ...        ...    ...       ...       ...   
742322  64242.23069          7167   0.714441     17  1.772194  0.647911   
163266  25431.76876           121  -1.489540      0  1.367052  0.448768   
587034  62320.02272          3879   0.497931     15  1.611294  3.477933   
721179  64052.01338          4640   0.690117      0  0.459829  0.818588   
207811  26128.67298           377  -1.448845      0  1.381964  0.154662   

             acl       hed  
781974  0.094718  1.377483  
937737  0.016209  1.482461  
907828  1.42

In [6]:
# Split the target column from the predictors, then rebuild one frame with
# the target moved to the last column. `df` and `node_names` are the inputs
# used by the causal-discovery cells that follow.
y = data_processed['class'].copy()
X = data_processed.drop(columns='class')

df = pd.concat([X, y.rename('class')], axis=1)
print(df)
node_names = df.columns.tolist()
print(node_names)

# Sanity check 1: columns with zero variance carry no information.
zeros = df.columns[df.var()==0]
print("zero var column：", zeros.tolist())

# Sanity check 2: pairs of distinct columns whose absolute correlation is 1.
# A tolerance is used instead of `== 1.0` because linearly dependent columns
# often evaluate to 0.9999999999999999 in floating point, which the exact
# comparison silently misses. NaN correlations (e.g. from a constant column)
# never match. Each pair is reported in both orders, (i, j) and (j, i).
corr = df.corr().abs()
perfect_pairs = [
    (i, j)
    for i in corr.columns
    for j in corr.columns
    if i != j and np.isclose(corr.loc[i, j], 1.0, rtol=0.0, atol=1e-9)
]
print("corr column：", perfect_pairs)

           sendTime  senderPseudo  messageID       pos       spd       acl  \
781974  64786.79042          5388   0.769131  1.079276  1.487702  0.094718   
937737  41423.20153          2017  -0.396491  1.150458  0.874873  0.016209   
907828  40404.51425          1865  -0.422448  1.381748  0.616549  1.423330   
784628  64742.75607          5405   0.764901  2.025261  1.309596  0.184231   
662460  63359.25486          4317   0.595471  1.089473  0.187500  0.075269   
...             ...           ...        ...       ...       ...       ...   
742322  64242.23069          7167   0.714441  1.772194  0.647911  3.012952   
163266  25431.76876           121  -1.489540  1.367052  0.448768  4.314247   
587034  62320.02272          3879   0.497931  1.611294  3.477933  0.663925   
721179  64052.01338          4640   0.690117  0.459829  0.818588  2.037673   
207811  26128.67298           377  -1.448845  1.381964  0.154662  0.065822   

             hed  class  
781974  1.377483      0  
937737  1.4

In [7]:
'''
2.  Background knowledge creation
'''
# Both helpers receive the name of the outcome column; spell it once.
target_column = 'class'

# Background knowledge for the constraint-based searches (passed to PC / FCI).
bk_pc = restriction.PC_BGKnowledge(df, X, target_column)

# NOTE(review): in the cells visible here DirectLiNGAM is later built with
# prior_knowledge=None, so this object is not consumed — confirm whether it
# was meant to be passed in.
bk_DirectLiNGAM = restriction.DirectLiNGAM_BGKnowledge(node_names, target_column)
# print(bk_DirectLiNGAM)

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [8]:
'''
3.  Algorithm for causal discovery
'''

'''3.1 Constrained Based'''
# Settings shared by both constraint-based searches: the kernel-based (KCI)
# independence test, significance level 0.01, the background knowledge built
# earlier, and the column names used as node labels.
constraint_based_kwargs = dict(
    indep_test_func=kci,
    alpha=0.01,
    background_knowledge=bk_pc,
    node_names=node_names,
)

# PC algorithm with kernel-based independence test
cg_pc = cd.pc_algorithm(df, uc_rule=1, max_k=2, **constraint_based_kwargs)

# Visualize the PC graph: convert to pydot and write it as a PNG.
# vis.causal_graph(cg_pc, 'PC')
pdy = GraphUtils.to_pydot(cg_pc.G)
print(type(pdy))
pdy.write_png('PC.png')


# FCI algorithm with kernel-based independence test.
# Unlike the PC result, the FCI graph is passed to to_pydot directly.
cg_fci, edges = cd.fci_algorithm(
    df,
    depth=-1,
    max_path_length=-1,
    verbose=False,
    show_progress=True,
    **constraint_based_kwargs,
)
pdy = GraphUtils.to_pydot(cg_fci)
pdy.write_png('FCI.png')

Depth=4, working on node 7: 100%|██████████| 8/8 [00:00<00:00, 2515.89it/s]

<class 'pydot.core.Dot'>



Depth=0, working on node 7: 100%|██████████| 8/8 [00:00<00:00, 1430.16it/s]


Starting BK Orientation.
Orienting edge (Knowledge): senderPseudo --> class
Orienting edge (Knowledge): pos --> class
Orienting edge (Knowledge): spd --> class
Orienting edge (Knowledge): acl --> class
Orienting edge (Knowledge): hed --> class
Finishing BK Orientation.
Starting BK Orientation.
Orienting edge (Knowledge): senderPseudo --> class
Orienting edge (Knowledge): pos --> class
Orienting edge (Knowledge): spd --> class
Orienting edge (Knowledge): acl --> class
Orienting edge (Knowledge): hed --> class
Finishing BK Orientation.
messageID --> senderPseudo
senderPseudo --> class
pos --> class


In [9]:
'''3.2 constrained functional'''


def _render_adjacency_png(adjacency_matrix, filename):
    """Draw an adjacency matrix with ``make_dot`` and render it as a PNG.

    The graph is labelled with the notebook-level ``node_names`` and rendered
    into the current directory under ``filename`` with ``cleanup=True``.

    Returns:
        A ``(graph, rendered_path)`` tuple: the object returned by
        ``make_dot`` and the value returned by its ``render`` call.
    """
    graph = make_dot(adjacency_matrix, labels=node_names)
    graph.format = 'png'
    rendered_path = graph.render(filename=filename, directory='.', cleanup=True)
    return graph, rendered_path


# LiNGAM (ICA-based), fitted on the full frame with a fixed seed.
model_LiNGAM = lingam.ICALiNGAM(random_state=42)
model_LiNGAM.fit(df)
print(model_LiNGAM.adjacency_matrix_)
graph_dot_model_LiNGAM, output_path = _render_adjacency_png(
    model_LiNGAM.adjacency_matrix_, 'LiNGAM'
)


# Direct-LiNGAM, run without prior knowledge.
direct_lingam_options = dict(
    random_state=42,
    prior_knowledge=None,
    apply_prior_knowledge_softly=False,
    measure='pwling',
)
model_DirectLiNGAM = lingam.DirectLiNGAM(**direct_lingam_options)

model_DirectLiNGAM.fit(df)
graph_dot_DirectLiNGAM, output_path = _render_adjacency_png(
    model_DirectLiNGAM.adjacency_matrix_, 'DirectLiNGAM'
)

[[ 0.00000000e+00  4.16627585e-01  1.59864320e+04  1.87610355e+02
   0.00000000e+00  3.97950567e+01 -4.18100552e+02 -8.80144285e+00]
 [ 0.00000000e+00  0.00000000e+00  1.66096235e+03 -3.43803647e+01
  -7.28566394e+01  0.00000000e+00  3.00982959e+02  3.91564288e+01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00 -2.36761262e-02  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -5.92471947e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00 -1.61460216e-02  1.58353430e-01
   0.00000000e+00 -5.55507472e-02 -1.51722104e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -3.22312624e-02
   0.00000000e+00  0.00000000e+00 -5.00746117e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.20525000e+00
   

In [10]:
# '''3.3 RCD'''


In [11]:
'''3.4 Boss'''
from causallearn.search.PermutationBased.BOSS import boss
from causallearn.utils.GraphUtils import GraphUtils

# BOSS works on a plain array; the column names are supplied separately as
# node labels. The BIC local score is used here.
# Alternative score, kept for reference:
# G = boss(df.to_numpy(), score_func='local_score_marginal_general', node_names=node_names)
boss_input = df.to_numpy()
G = boss(boss_input, node_names=node_names, score_func='local_score_BIC')

# Export the resulting graph as an image.
pyd = GraphUtils.to_pydot(G)
pyd.write_png("BOSS.png")

BOSS edge count: 17    
BOSS completed in: 0.01s 




In [12]:
'''3.5 NOTEARS'''

# Signature reference:
# notears_linear(X, lambda1, loss_type, max_iter=100, h_tol=1e-8, rho_max=1e+16, w_threshold=0.3)
notears_options = {'lambda1': 0.5, 'loss_type': 'l2'}

# `w` is the matrix returned by NOTEARS. It is printed together with its type
# and also wrapped in a labelled DataFrame for inspection.
w = cd.notears_linear(df.values, **notears_options)
print(w)
print(type(w))
NOTEARS_adjacency_matrix_ = pd.DataFrame(data=w, index=node_names, columns=node_names)
# print(NOTEARS_adjacency_matrix_)

# Draw the matrix as a graph and render it as a PNG in the current directory.
graph_dot_NOTEARS = make_dot(w, labels=node_names)
graph_dot_NOTEARS.format = 'png'
output_path = graph_dot_NOTEARS.render(
    filename='NOTEARS',
    directory='.',
    cleanup=True,
)



  eAw = eAw @ eAw
  G_h = E.T * W * 2


[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 5.09861122e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 6.13102708e+03  6.44926973e+01  0.00000000e+00  0.00000000e+00
   4.74008811e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 2.07315063e+02 -4.06740149e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.15675001e+00]
 [ 6.51047757e+02 -9.65318291e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.22739923e+02 -1.89416051e+01  1.26969466e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-1.34008675e+02  1.55581758e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-2.91884749e+02  3.90789825e+01 -1.74856631e+01  0.00000000e+00
   

In [18]:
# causal graph identify
# Obtain valid dot format
import statsmodels.api as sm

# Using the Gaussian Family for multi class
# (a GLM with a Gaussian family ≡ linear regression)
method_params = dict(glm_family=sm.families.Gaussian())

# NOTE(review): despite the "LiNGAM" name, this graph is built from `w`, which
# the visible cells only assign from cd.notears_linear — confirm which model's
# matrix was intended here.
graph_dot_LiNGAM = utils.make_graph(w, labels=node_names)

# Print the DOT source and its type.
dot_source = graph_dot_LiNGAM.source
print(dot_source)
print(type(dot_source))

digraph {
	sendTime
	senderPseudo
	messageID
	pos
	spd
	acl
	hed
	class
	sendTime -> senderPseudo [label=5.098611222630016]
	sendTime -> messageID [label=6131.027083044324]
	senderPseudo -> messageID [label=64.49269728045238]
	spd -> messageID [label=0.47400881126200234]
	sendTime -> pos [label=207.31506279014667]
	senderPseudo -> pos [label=-40.674014857634546]
	class -> pos [label=1.1567500087322062]
	sendTime -> spd [label=651.0477573256712]
	senderPseudo -> spd [label=-96.53182909247958]
	sendTime -> acl [label=122.73992338489118]
	senderPseudo -> acl [label=-18.941605078017936]
	messageID -> acl [label=1.2696946622355787]
	sendTime -> hed [label=-134.00867495587124]
	senderPseudo -> hed [label=15.558175834921615]
	sendTime -> class [label=-291.88474883217987]
	senderPseudo -> class [label=39.07898252102976]
	messageID -> class [label=-17.485663118194577]
	acl -> class [label=-0.44530778746819105]
}

<class 'str'>


In [17]:
# Define Causal Model: effect of `senderPseudo` on `class`, using the graph
# whose DOT source comes from `graph_dot_LiNGAM`.
causal_graph = utils.str_to_dot(graph_dot_LiNGAM.source)
model = CausalModel(
    data=df,
    treatment='senderPseudo',
    outcome='class',
    graph=causal_graph,
)

# Identification (do not proceed if the effect is unidentifiable)
identified_estimand_LiNGAM = model.identify_effect(proceed_when_unidentifiable=False)
print(identified_estimand_LiNGAM)

# Estimation: backdoor adjustment through a generalized linear model whose
# family is supplied via `method_params`.
# Alternatives kept for reference:
#   method_name="backdoor.linear_regression"
#   control_value=0, treatment_value=1
estimation_options = dict(
    method_name="backdoor.generalized_linear_model",
    method_params=method_params,
    confidence_intervals=True,
    target_units='ate',
    test_significance=True,
)
estimate = model.estimate_effect(identified_estimand_LiNGAM, **estimation_options)
print(f"Causal Estimate is: {estimate.value!s}")
print(estimate)



Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
       d                          
───────────────(E[class|sendTime])
d[senderPseudo]                   
Estimand assumption 1, Unconfoundedness: If U→{senderPseudo} and U→class then P(class|senderPseudo,sendTime,U) = P(class|senderPseudo,sendTime)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!



  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercep

Causal Estimate is: 0.0015000733796277643
*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
       d                          
───────────────(E[class|sendTime])
d[senderPseudo]                   
Estimand assumption 1, Unconfoundedness: If U→{senderPseudo} and U→class then P(class|senderPseudo,sendTime,U) = P(class|senderPseudo,sendTime)

## Realized estimand
b: class~Sigmoid(senderPseudo+sendTime)
Target units: ate

## Estimate
Mean value: 0.0015000733796277643
p-value: [0, 0.001]
95.0% confidence interval: (0.001409057352296239, 0.0015903390133482631)



  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
