In [1]:
from gmpy2.gmpy2 import random_state

# Quick Start: Causal Analysis
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [2]:
import sys, os

# 1. Compute project root: one level up from the notebook folder
proj_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# 2. Prepend it to sys.path
if proj_root not in sys.path:
    sys.path.insert(0, proj_root)

In [3]:
import pandas as pd
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import identifier as ide
from causal import visualization as vis
from causal import utils
from causal import refuter as ref
from causallearn.search.FCMBased.lingam.utils import make_dot
from causallearn.search.FCMBased import lingam
from dowhy import CausalModel
from causallearn.utils.GraphUtils import GraphUtils

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [38]:
if __name__ == '__main__':

    '''
    1. Data Loading and Preprocessing
    '''

    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path)
    print(f'total:{data_origin.shape}')
    # print(data_origin)

    # filter Ddos and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 11, 12, 13, 16, 17])]

    # filter fake data attack and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 1, 2, 3, 4, 5, 6, 7, 8])]

    # filter sybil attack and normal data
    # data_origin = data_origin[data_origin['class'].isin([0, 14, 15, 16, 17])]
    # print(data_origin.head(5))

    # filter sybil disruptive/data reply and normal data
    data_origin = data_origin[data_origin['class'].isin([0, 10])]
    # print(data_origin.head(5))

    data_origin = utils.collapse_classes(data_origin, 1)
    print(data_origin[data_origin['class'] == 1].shape)

    data_origin = utils.min_sample_retention(data_origin, test_size = 100000, random_state=42)
    # print(data_origin)

    print(data_origin[data_origin['class'] == 1].shape)
    print('*-' * 50)




total:(1048575, 20)
(13802, 20)
(2165, 20)
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


In [39]:
# Data Cleaning:
drop_column = ['type','Attack','Attack_type']
data_processed = preprocess.clean(data_origin, drop_column=drop_column, drop_na=True, data_numerical=True)

# Standardize features, target keep same as original data_processed:
data_processed = preprocess.standardize(data_processed, ['class','sendTime','sender','senderPseudo','messageID'])

# Combine axis related data such as pos, spd etc. by using M = \sqrt{X^2 + Y^2 + Z^2}
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['posx', 'posy', 'posz'], 'pos')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['spdx', 'spdy', 'spdz'], 'spd')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['aclx', 'acly', 'aclz'], 'acl')
data_processed = preprocess.add_vector_magnitude_column(data_processed, ['hedx', 'hedy', 'hedz'], 'hed')
data_processed.drop(
    columns=['posx', 'posy', 'posz', 'spdx', 'spdy', 'spdz', 'aclx', 'acly', 'aclz', 'hedx', 'hedy', 'hedz','sender'],
    inplace=True
)

# ID mapping to 0-N
data_processed['senderPseudo'] = data_processed['senderPseudo'].astype('category').cat.codes
data_processed['messageID'] = data_processed['messageID'].astype('category').cat.codes

# …run two separate CausalModel objects with *_z columns as treatment

with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

            sendTime  senderPseudo  messageID  class       pos       spd  \
925697   41041.42605          1912      33576      0  0.472010  1.237448   
75404    73303.64786          5296      92947      0  1.360402  1.666388   
568079   61985.43366          3677      64796      0  1.391491  0.059565   
889719   40011.78985          1737      30275      0  1.194195  1.761321   
1043804  33143.64933          1660      28609      0  2.274738  1.907587   
...              ...           ...        ...    ...       ...       ...   
275039   27174.64178           740      12090      0  1.635080  1.499221   
274970   27285.85018           739      12705      0  1.373019  0.059558   
1002619  32648.17180          1465      24740      0  1.105423  1.682762   
838791   56385.73064          3242      57220      0  1.706922  2.160870   
680047   63581.53831          4304      75902      0  0.911132  0.918071   

              acl       hed  
925697   2.507596  1.375185  
75404    0.319907  1.470566

In [40]:
# X = data_processed.iloc[:, 1:].copy()     # 8 features
# y = data_processed.iloc[:, 0].copy()
#
y = data_processed['class'].copy()
X = data_processed.drop(columns='class')

# print(X)
# print('*-' * 50)
# print(y)
# print('*-' * 50)

df = pd.concat([X, y.rename('class')], axis=1)
print(df)
node_names = df.columns.tolist()
print(node_names)

zeros = df.columns[df.var()==0]
print("zero var column：", zeros.tolist())

corr = df.corr().abs()
perfect_pairs = [(i,j) for i in corr.columns for j in corr.columns
                 if i!=j and corr.loc[i,j]==1.0]
print("corr column：", perfect_pairs)

            sendTime  senderPseudo  messageID       pos       spd       acl  \
925697   41041.42605          1912      33576  0.472010  1.237448  2.507596   
75404    73303.64786          5296      92947  1.360402  1.666388  0.319907   
568079   61985.43366          3677      64796  1.391491  0.059565  0.008020   
889719   40011.78985          1737      30275  1.194195  1.761321  0.935157   
1043804  33143.64933          1660      28609  2.274738  1.907587  0.618158   
...              ...           ...        ...       ...       ...       ...   
275039   27174.64178           740      12090  1.635080  1.499221  0.119040   
274970   27285.85018           739      12705  1.373019  0.059558  0.007224   
1002619  32648.17180          1465      24740  1.105423  1.682762  0.821832   
838791   56385.73064          3242      57220  1.706922  2.160870  0.565142   
680047   63581.53831          4304      75902  0.911132  0.918071  2.337264   

              hed  class  
925697   1.375185      0

In [42]:
'''
2.  Background knowledge creation
'''
bk_pc = restriction.PC_BGKnowledge(df, X, 'class')
bk_DirectLiNGAM = restriction.DirectLiNGAM_BGKnowledge(node_names, 'class')
# print(bk_DirectLiNGAM)

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [44]:
'''
3.  Algorithm for causal discovery
'''

'''3.1 Constrained Based'''
# PC algorithm with Kernal-based independence test
cg_pc = cd.pc_algorithm(
    df,
    indep_test_func = kci,
    alpha = 0.01,
    uc_rule = 1,
    max_k = 2,
    background_knowledge = bk_pc,
    node_names = node_names
)

pdy = GraphUtils.to_pydot(cg_pc.G)
print(type(pdy))
pdy.write_png('PC.png')


# FCI algorithm with Kernal-based independence test
# cg_fci, edges = cd.fci_algorithm(
#     df,
#     indep_test_func=kci,
#     alpha=0.01,
#     depth=-1,
#     max_path_length=-1,
#     verbose=False,
#     show_progress=True,
#     background_knowledge = bk_pc,
#     node_names = node_names
# )
# pdy = GraphUtils.to_pydot(cg_fci)
# pdy.write_png('FCI.png')

  0%|          | 0/8 [00:00<?, ?it/s]

<class 'pydot.core.Dot'>


In [45]:
'''3.2 constrained functional'''
# LiNGAM
model_LiNGAM = lingam.ICALiNGAM(random_state=42)
model_LiNGAM.fit(df)
print(model_LiNGAM.adjacency_matrix_)
graph_dot_model_LiNGAM = make_dot(model_LiNGAM.adjacency_matrix_, labels=node_names)
graph_dot_model_LiNGAM.format = 'png'
output_path = graph_dot_model_LiNGAM.render(filename='LiNGAM',directory='.',cleanup=True)
#
#
# # Direct-LiNGAM
# model_DirectLiNGAM = lingam.DirectLiNGAM(
#     random_state=42,
#     prior_knowledge=None,
#     apply_prior_knowledge_softly=False,
#     measure='pwling',
# )
#
# model_DirectLiNGAM.fit(df)
# graph_dot_DirectLiNGAM = make_dot(model_DirectLiNGAM.adjacency_matrix_, labels=node_names)
# graph_dot_DirectLiNGAM.format = 'png'
# output_path = graph_dot_DirectLiNGAM.render(filename='DirectLiNGAM',directory='.',cleanup=True)

[[ 0.00000000e+00 -5.76508157e+01  3.82534037e+00  2.90661068e+02
   0.00000000e+00  0.00000000e+00 -4.40485147e+02 -2.93574059e+02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.28655475e+02
  -5.23282284e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  1.77317194e+01  0.00000000e+00 -6.78381677e+01
   0.00000000e+00  0.00000000e+00 -7.07278917e+01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  7.00091643e-02
   0.00000000e+00  0.00000000e+00  6.50200488e-01  4.64088389e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.35519612e-02
  -2.08625219e-01  0.00000000e+00 -5.78747703e-01  1.76678410e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.30083558e-02
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.38987029e-02
   

In [46]:
'''3.4 Boss'''

# G = boss(df.to_numpy(), score_func='local_score_marginal_general', node_names=node_names)
G = cd.boss(df.to_numpy(), score_func='local_score_BIC', node_names=node_names)
pyd = GraphUtils.to_pydot(G)
pyd.write_png("BOSS.png")

BOSS edge count: 17    
BOSS completed in: 0.02s 




In [47]:
'''3.5 NOTEARS'''

w = cd.notears_linear(df.values, lambda1= 0.5, loss_type='logistic')
print(w)
print(type(w))
NOTEARS_adjacency_matrix_ = pd.DataFrame(w, index=node_names, columns=node_names)
# print(NOTEARS_adjacency_matrix_)
graph_dot_NOTEARS = make_dot(w, labels=node_names)
graph_dot_NOTEARS.format = 'png'
output_path = graph_dot_NOTEARS.render(filename='NOTEARS',directory='.',cleanup=True)



  eAw = eAw @ eAw
  G_h = E.T * W * 2


[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -2.01610281e+01
  -1.33763662e+01  0.00000000e+00 -5.05290376e+01  0.00000000e+00]
 [ 4.04918311e+02  0.00000000e+00  0.00000000e+00 -1.29781522e+00
  -8.27944296e-01  0.00000000e+00 -2.89026630e+00  0.00000000e+00]
 [ 1.38405706e+03 -3.02612623e+01  0.00000000e+00 -3.96764056e-01
   0.00000000e+00  0.00000000e+00 -7.35880770e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   

In [20]:
# Estimand and Estimate
import statsmodels.api as sm

# Using the Gaussian Family for multi class
method_params_gaussian = {
    "glm_family": sm.families.Gaussian()
}

# Using binomial for binary class
method_params_binomial = {
    "glm_family": sm.families.Binomial()
}


# Method
method_name = {
    'regression':'backdoor.generalized_linear_model',
    'distance_matching': 'backdoor.distance_matching',
    'propensity_score_stratification': 'backdoor.propensity_score_stratification',
    'propensity_score_matching': 'backdoor.propensity_score_matching',
    'frontdoor_regression': 'frontdoor.linear_regression'
}

graph = utils.make_graph(NOTEARS_adjacency_matrix_, labels=node_names)

causal_model, causal_estimand, causal_estimate = ide.estimate(
    df,
    treatment = 'messageID',
    outcome = 'class',
    method_params = method_params_binomial,
    method_name = 'frontdoor.linear_regression',
    graph = graph,
)
print(causal_estimand)
print('*-'*50)
print(causal_estimate)
graph_dot = utils.str_to_dot(graph.source)

Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
     d                
────────────(E[class])
d[messageID]          
Estimand assumption 1, Unconfoundedness: If U→{messageID} and U→class then P(class|messageID,,U) = P(class|messageID,)

### Estimand : 2
Estimand name: iv
Estimand expression:
 ⎡                                                                         -1⎤
 ⎢            d                    ⎛            d                         ⎞  ⎥
E⎢─────────────────────────(class)⋅⎜─────────────────────────([messageID])⎟  ⎥
 ⎣d[senderPseudo  sendTime]        ⎝d[senderPseudo  sendTime]             ⎠  ⎦
Estimand assumption 1, As-if-random: If U→→class then ¬(U →→{senderPseudo,sendTime})
Estimand assumption 2, Exclusion: If we remove {senderPseudo,sendTime}→{messageID}, then ¬({senderPseudo,sendTime}→class)

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

In [46]:
# Refute estimand
refuter_list = ['bootstrap_refuter', "data_subset_refuter", 'dummy_outcome_refuter', 'placebo_treatment_refuter', 'random_common_cause']
refuter_method = 'bootstrap_refuter'

refuter_results = ref.causal_refuter(causal_model, causal_estimand, causal_estimate, refuter_method)
print(refuter_results)

  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercept_parameter = self.model.params[0]
  intercep

Refute: Bootstrap Sample Dataset
Estimated effect:-0.4737965998472773
New effect:-0.45405432568795917
p value:0.8

