In [131]:
# NOTE(review): `random_state` is never referenced by name in the visible cells
# (it only appears as a keyword argument, e.g. `random_state=42`). This looks
# like an accidental IDE auto-import that makes gmpy2 a hard dependency of the
# notebook — confirm and remove if nothing else relies on it.
from gmpy2.gmpy2 import random_state

# Quick Start: Causal Analysis
# The bare string below is only a banner describing the notebook's stages.
'''
This notebook demonstrates:
1. Load & clean data
2. Feature selection (IAMB)
3. Causal discovery (PC)
4. Effect estimation (ATE)
5. Visualization (DAG)
'''

'\nThis notebook demonstrates:\n1. Load & clean data\n2. Feature selection (IAMB)\n3. Causal discovery (PC)\n4. Effect estimation (ATE)\n5. Visualization (DAG)\n'

In [132]:
import sys, os

# The project root is taken to be the parent of the current working directory
# (the notebook is expected to be started from its own sub-folder).
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
proj_root = os.path.abspath(parent_of_cwd)

# Put the root at the front of the module search path so the local `causal`
# package can be imported; skip the insert when it is already present so that
# re-running the cell does not pile up duplicates.
if sys.path.count(proj_root) == 0:
    sys.path.insert(0, proj_root)

In [133]:
import pandas as pd
import numpy as np
from causallearn.utils.cit import kci
from causal import preprocess
from causal import restriction
from causal import causal_discovery as cd
from causal import identifier as ide
from causal import visualization as vis
from causal import utils
from causal import refuter as ref
from causallearn.search.FCMBased.lingam.utils import make_dot
from causallearn.search.FCMBased import lingam
from dowhy import CausalModel
from causallearn.utils.GraphUtils import GraphUtils
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [134]:
if __name__ == '__main__':

    '''
    1. Data Loading and Preprocessing
    '''

    # Read the raw CSV; the path is relative to the notebook's working directory.
    path = '../Dataset/veremi_extension_simple.csv'
    data_origin = pd.read_csv(path)
    print(f'total:{data_origin.shape}')

    # Keep only the rows whose label belongs to the selected subset. The active
    # subset is the one the author labels "DDoS and normal data"; the other
    # subsets the author experimented with are kept here for reference:
    #   fake data attack + normal            : [0, 1, 2, 3, 4, 5, 6, 7, 8]
    #   sybil attack + normal                : [0, 14, 15, 16, 17]
    #   sybil disruptive/data reply + normal : [0, 10]
    selected_labels = [0, 11, 12, 13, 16, 17]
    keep_mask = data_origin['class'].isin(selected_labels)
    data_origin = data_origin.loc[keep_mask]

    # Presumably relabels every remaining attack class to 1 (the next line
    # counts the rows with class == 1) — confirm in utils.collapse_classes.
    data_origin = utils.collapse_classes(data_origin, 1)
    attack_rows = data_origin.loc[data_origin['class'] == 1]
    print(attack_rows.shape)

    # Subsample with a fixed seed; the class-1 row count is printed again
    # afterwards so the effect of the sampling can be compared with the line above.
    data_origin = utils.min_sample_retention(data_origin, test_size = 100000, random_state=42)

    attack_rows = data_origin.loc[data_origin['class'] == 1]
    print(attack_rows.shape)
    print('*-' * 50)




total:(1048575, 20)
(143130, 20)
(18668, 20)
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


In [135]:
# Data Cleaning:
# Data cleaning: the listed columns are passed as columns to drop, together with
# the drop_na / data_numerical flags (exact semantics live in preprocess.clean).
drop_column = ['type','Attack','Attack_type']
data_processed = preprocess.clean(
    data_origin,
    drop_column=drop_column,
    drop_na=True,
    data_numerical=True,
)

# Standardize features. The second argument names the target, time and ID
# columns — presumably the columns left unscaled (TODO confirm against
# preprocess.standardize).
data_processed = preprocess.standardize(data_processed, ['class','sendTime','sender','senderPseudo','messageID'])

# Collapse each per-axis triple into a single magnitude column,
# M = \sqrt{X^2 + Y^2 + Z^2}. Keys are the new column names.
axis_groups = {
    'pos': ['posx', 'posy', 'posz'],
    'spd': ['spdx', 'spdy', 'spdz'],
    'acl': ['aclx', 'acly', 'aclz'],
    'hed': ['hedx', 'hedy', 'hedz'],
}
# Collected up front: the per-axis components plus the 'sender' ID are removed
# once the magnitude columns exist.
obsolete_columns = [column for triple in axis_groups.values() for column in triple] + ['sender']

for magnitude_name, axis_columns in axis_groups.items():
    data_processed = preprocess.add_vector_magnitude_column(data_processed, axis_columns, magnitude_name)

data_processed.drop(columns=obsolete_columns, inplace=True)

# ID mapping to 0-N
# data_processed['senderPseudo'] = data_processed['senderPseudo'].astype('category').cat.codes
# data_processed['messageID'] = data_processed['messageID'].astype('category').cat.codes

# …run two separate CausalModel objects with *_z columns as treatment

# Show every column (no horizontal truncation) for this one printout only.
with pd.option_context('display.max_columns', None):
    print(data_processed)
    print(type(data_processed))
print('*-' * 50)

           sendTime  senderPseudo  messageID  class       pos       spd  \
28678   72524.93554     101314037  427449123      0  1.177110  1.905615   
200107  25997.45431      10126152   24069906      1  2.795565  0.062502   
430083  51456.19907      10747875  227970587      0  0.889361  1.033074   
672050  63495.90880      10977316  294191998      1  1.685966  1.864182   
598240  62513.11618      10942036  281163403      0  1.820064  2.141214   
...             ...           ...        ...    ...       ...       ...   
867493  57363.97413      10824615  246269618      0  1.185938  1.932187   
611642  62679.81393      10948636  282448261      0  1.032245  1.947845   
255116  26914.40500      10152432   32398993      0  1.403602  0.342722   
907233  40395.11123      20524134  159076506      1  1.373236  0.062503   
807673  54925.09253      10797675  240617007      0  1.541126  1.599667   

             acl       hed  
28678   0.053295  1.402316  
200107  0.005667  0.124483  
430083  1.86

In [136]:
# Split the processed frame into the feature matrix X and the target y.
y = data_processed['class'].copy()
X = data_processed.drop(columns='class')

# Re-assemble with 'class' as the LAST column; node_names follows that order
# and is what the discovery cells below pass as node labels.
df = pd.concat([X, y.rename('class')], axis=1)
print(df)
node_names = df.columns.tolist()
print(node_names)

# Sanity check 1: columns whose variance is exactly zero.
zeros = df.columns[df.var()==0]
print("zero var column：", zeros.tolist())

# Sanity check 2: pairs of distinct columns that are perfectly (anti-)correlated.
# The coefficient is compared with a small absolute tolerance instead of
# `== 1.0`: floating-point rounding can leave an exactly collinear pair at
# 0.9999999999999999, which an equality test silently misses. NaN entries
# (e.g. from constant columns) compare as not-close and are skipped.
# Each pair is reported in both orders, (i, j) and (j, i), as before.
corr = df.corr().abs()
perfect_pairs = [(i,j) for i in corr.columns for j in corr.columns
                 if i!=j and np.isclose(corr.loc[i,j], 1.0, rtol=0.0, atol=1e-9)]
print("corr column：", perfect_pairs)

           sendTime  senderPseudo  messageID       pos       spd       acl  \
28678   72524.93554     101314037  427449123  1.177110  1.905615  0.053295   
200107  25997.45431      10126152   24069906  2.795565  0.062502  0.005667   
430083  51456.19907      10747875  227970587  0.889361  1.033074  1.867734   
672050  63495.90880      10977316  294191998  1.685966  1.864182  0.313507   
598240  62513.11618      10942036  281163403  1.820064  2.141214  0.343610   
...             ...           ...        ...       ...       ...       ...   
867493  57363.97413      10824615  246269618  1.185938  1.932187  0.352998   
611642  62679.81393      10948636  282448261  1.032245  1.947845  0.062069   
255116  26914.40500      10152432   32398993  1.403602  0.342722  1.299022   
907233  40395.11123      20524134  159076506  1.373236  0.062503  0.005626   
807673  54925.09253      10797675  240617007  1.541126  1.599667  0.088822   

             hed  class  
28678   1.402316      0  
200107  0.1

In [137]:
'''
2.  Background knowledge creation
'''
# Prior knowledge for the PC run, built from the full frame, the feature frame
# and the name of the target column. It is passed to cd.pc_algorithm below as
# `background_knowledge`. Which edges it forbids or requires is decided inside
# restriction.PC_BGKnowledge — presumably it constrains edges around 'class';
# TODO confirm.
bk_pc = restriction.PC_BGKnowledge(df, X, 'class')
# NOTE(review): bk_DirectLiNGAM is not referenced by any other visible cell
# (the commented-out Direct-LiNGAM code passes prior_knowledge=None).
bk_DirectLiNGAM = restriction.DirectLiNGAM_BGKnowledge(node_names, 'class')
# print(bk_DirectLiNGAM)

<class 'causallearn.utils.PCUtils.BackgroundKnowledge.BackgroundKnowledge'>


In [138]:
'''
3.  Algorithm for causal discovery
'''

'''3.1 Constrained Based'''
# PC algorithm with the kernel-based independence test (kci) imported from
# causal-learn. The settings are gathered in one mapping so they can be read
# and tuned in a single place.
pc_settings = {
    'indep_test_func': kci,
    'alpha': 0.01,
    'uc_rule': 1,
    'max_k': 2,
    'background_knowledge': bk_pc,
    'node_names': node_names,
}
cg_pc = cd.pc_algorithm(df, **pc_settings)

# Convert the learned graph to pydot, show it, and save it as an image.
pdy_PC = GraphUtils.to_pydot(cg_pc.G)
print(type(pdy_PC))
print(pdy_PC)
pdy_PC.write_png('PC.png')

# Adjacency matrix with rows/columns arranged in node_names order.
adj_matrix_PC = utils.dot_to_adj(pdy_PC, desired_order = node_names)
for shown in (adj_matrix_PC, type(adj_matrix_PC)):
    print(shown)

# FCI algorithm with kernel-based independence test
# cg_fci, edges = cd.fci_algorithm(
#     df,
#     indep_test_func=kci,
#     alpha=0.01,
#     depth=-1,
#     max_path_length=-1,
#     verbose=False,
#     show_progress=True,
#     background_knowledge = bk_pc,
#     node_names = node_names
# )
# pdy = GraphUtils.to_pydot(cg_fci)
# pdy.write_png('FCI.png')

  0%|          | 0/8 [00:00<?, ?it/s]

<class 'pydot.core.Dot'>
digraph {
fontsize=18;
dpi=200;
0 [label=sendTime];
0 [label=sendTime];
1 [label=senderPseudo];
1 [label=senderPseudo];
2 [label=messageID];
2 [label=messageID];
3 [label=pos];
3 [label=pos];
4 [label=spd];
4 [label=spd];
5 [label=acl];
5 [label=acl];
6 [label=hed];
6 [label=hed];
7 [label=class];
7 [label=class];
1 -> 0 [dir=both, arrowtail=none, arrowhead=normal];
0 -> 2 [dir=both, arrowtail=none, arrowhead=none];
3 -> 0 [dir=both, arrowtail=none, arrowhead=normal];
4 -> 0 [dir=both, arrowtail=none, arrowhead=normal];
1 -> 2 [dir=both, arrowtail=none, arrowhead=normal];
1 -> 3 [dir=both, arrowtail=none, arrowhead=normal];
1 -> 7 [dir=both, arrowtail=none, arrowhead=normal];
3 -> 2 [dir=both, arrowtail=none, arrowhead=normal];
4 -> 2 [dir=both, arrowtail=none, arrowhead=normal];
5 -> 3 [dir=both, arrowtail=none, arrowhead=normal];
6 -> 3 [dir=both, arrowtail=none, arrowhead=normal];
3 -> 7 [dir=both, arrowtail=none, arrowhead=normal];
4 -> 5 [dir=both, arrowta

In [139]:
'''3.2 constrained functional'''
# LiNGAM
# model_LiNGAM = lingam.ICALiNGAM(random_state=42)
# model_LiNGAM.fit(df)
# print(model_LiNGAM.adjacency_matrix_)
# print(type(model_LiNGAM.adjacency_matrix_))
# graph_dot_model_LiNGAM = make_dot(model_LiNGAM.adjacency_matrix_, labels=node_names)
# graph_dot_model_LiNGAM.format = 'png'
# output_path = graph_dot_model_LiNGAM.render(filename='LiNGAM',directory='.',cleanup=True)
#
#
# # Direct-LiNGAM
# model_DirectLiNGAM = lingam.DirectLiNGAM(
#     random_state=42,
#     prior_knowledge=None,
#     apply_prior_knowledge_softly=False,
#     measure='pwling',
# )
#
# model_DirectLiNGAM.fit(df)
# graph_dot_DirectLiNGAM = make_dot(model_DirectLiNGAM.adjacency_matrix_, labels=node_names)
# graph_dot_DirectLiNGAM.format = 'png'
# output_path = graph_dot_DirectLiNGAM.render(filename='DirectLiNGAM',directory='.',cleanup=True)

'3.2 constrained functional'

In [140]:
'''3.4 Boss'''

# BOSS is given the data as a plain array; the column labels are passed
# separately through node_names.
boss_input = df.to_numpy()
G = cd.boss(boss_input, score_func='local_score_BIC', node_names=node_names)

# Convert to pydot, report the object type, and save the rendered graph.
pdy_BOSS = GraphUtils.to_pydot(G)
print(type(pdy_BOSS))
pdy_BOSS.write_png("BOSS.png")

# Adjacency matrix with rows/columns arranged in node_names order.
adj_matrix_BOSS = utils.dot_to_adj(pdy_BOSS, desired_order = node_names)
for shown in (adj_matrix_BOSS, type(adj_matrix_BOSS)):
    print(shown)


order:[0, 1, 2, 3, 4, 5, 6, 7]
leaf_idx:7
gsts:[<causallearn.search.PermutationBased.gst.GST object at 0x35eb14970>, <causallearn.search.PermutationBased.gst.GST object at 0x35eb14790>, <causallearn.search.PermutationBased.gst.GST object at 0x35eb14f70>, <causallearn.search.PermutationBased.gst.GST object at 0x35eb144f0>, <causallearn.search.PermutationBased.gst.GST object at 0x35eb14d90>, <causallearn.search.PermutationBased.gst.GST object at 0x35eb15090>, <causallearn.search.PermutationBased.gst.GST object at 0x35eb157b0>, <causallearn.search.PermutationBased.gst.GST object at 0x35eb16b90>]
=== GST #0 ===
vertex        : 0
forbidden     : [0, 7]
required      : []
root.grow_score : -1947164.500571181
root.shrink_score: -1947164.500571181
=== GST #1 ===
vertex        : 1
forbidden     : [1, 7]
required      : []
root.grow_score : -3755885.846594797
root.shrink_score: -3755885.846594797
=== GST #2 ===
vertex        : 2
forbidden     : [2, 7]
required      : []
root.grow_score : -373649



In [141]:
'''3.5 NOTEARS'''

# NOTE(review): the solver receives df.values as-is. The frame printed above
# still holds sendTime / senderPseudo / messageID at raw magnitudes (up to ~1e8),
# and the recorded output of this cell shows warnings raised from inside the
# solver plus a weight above 100. The column-wise standardization below was
# left disabled by the author — confirm whether it should be re-enabled, and
# whether loss_type='logistic' is intended for non-binary columns.
# X = df.values.astype(float)
# X = (X - X.mean(0, keepdims=True)) / (X.std(0, keepdims=True) + 1e-8)
# print(X)

notears_options = {
    'lambda1': 0.5,
    'loss_type': 'logistic',
}  # further options the author noted: penalty_factor=2.0, discourage_parents=7
adj_matrix_NOTEARS = cd.notears_linear(df.values, **notears_options)

for shown in (adj_matrix_NOTEARS, type(adj_matrix_NOTEARS)):
    print(shown)

# NOTEARS_adjacency_matrix = pd.DataFrame(w, index=node_names, columns=node_names)
# print(NOTEARS_adjacency_matrix)
# print(type(NOTEARS_adjacency_matrix))
#
# Render the weighted adjacency matrix as NOTEARS.png in the current directory;
# cleanup=True removes the intermediate dot source after rendering.
graph_dot_NOTEARS = make_dot(adj_matrix_NOTEARS, labels=node_names)
graph_dot_NOTEARS.format = 'png'
output_path = graph_dot_NOTEARS.render(filename='NOTEARS',directory='.',cleanup=True)

  eAw = eAw @ eAw
  G_h = E.T * W * 2
  obj = loss + 0.5 * rho * h * h + alpha * h + lambda1 * w.sum()
  G_smooth = G_loss + (rho * h + alpha) * G_h


[[  0.           0.          -0.5920529    0.           0.
    0.           0.           0.        ]
 [  0.16608836   0.         111.22887716   0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.        ]]
<class 'numpy.ndarray'>


In [142]:
# # Estimand and Estimate of causality
# import statsmodels.api as sm
# from dowhy import CausalModel
#
# # Using the Gaussian Family for multi class
# method_params_gaussian = {
#     "glm_family": sm.families.Gaussian()
# }
#
# # Using binomial for binary class
# method_params_binomial = {
#     "glm_family": sm.families.Binomial()
# }
#
#
# # Method
# method_name = {
#     'regression':'backdoor.generalized_linear_model',
#     'distance_matching': 'backdoor.distance_matching',
#     'propensity_score_stratification': 'backdoor.propensity_score_stratification',
#     'propensity_score_matching': 'backdoor.propensity_score_matching',
#     'frontdoor_regression': 'frontdoor.linear_regression'
# }
#
# graph = utils.make_graph(adj_matrix_NOTEARS, labels=node_names)
# print(graph)
# print(type(graph))
#
# # Total effect
# # causal_model, causal_estimand, causal_estimate = ide.estimate(
# #     df,
# #     treatment = 'acl',
# #     outcome = 'class',
# #     method_params = method_params_binomial,
# #     method_name = 'backdoor.linear_regression',
# #     graph = graph,
# # )
# # print(causal_model)
# # print('*-'*50)
# # print(causal_estimand)
# # print('*-'*50)
# # print(causal_estimate)
# # print('*-'*50)
#
# graph_dot = utils.str_to_dot(graph.source)
#
# model=CausalModel(
#             data = df,
#             treatment='hed',
#             outcome='class',
#             graph=graph_dot
# )
#
# # Identification
# estimand = model.identify_effect(proceed_when_unidentifiable=False)
# print(estimand)

In [143]:
# # Estimation
# estimate = model.estimate_effect(
#     estimand,
#     method_params=method_params_binomial,
#     method_name='backdoor.linear_regression',
#     confidence_intervals=True,
#     test_significance=True,
#     target_units='ate'
# )
# print(estimate)

In [144]:
# Natural direct effect (nde)
# identified_estimand_nde = causal_model.identify_effect(estimand_type="nonparametric-nde",
#                                             proceed_when_unidentifiable=True)
# print(identified_estimand_nde)
#
#
# # Natural indirect effect (nie)
# identified_estimand_nie = causal_model.identify_effect(estimand_type="nonparametric-nie",
#                                             proceed_when_unidentifiable=True)
# print(identified_estimand_nie)
#
#
# import dowhy.causal_estimators.linear_regression_estimator
# causal_estimate_nie = causal_model.estimate_effect(identified_estimand_nie,
#                                         method_name="mediation.two_stage_regression",
#                                        confidence_intervals=True,
#                                        test_significance=True,
#                                         method_params = {
#                                             'first_stage_model': dowhy.causal_estimators.linear_regression_estimator.LinearRegressionEstimator,
#                                             'second_stage_model': dowhy.causal_estimators.linear_regression_estimator.LinearRegressionEstimator
#                                         }
#                                        )
# print(causal_estimate_nie)

In [145]:
# # Refute estimand
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning, module="dowhy")
#
#
# refuter_list = ['bootstrap_refuter', "data_subset_refuter", 'dummy_outcome_refuter', 'placebo_treatment_refuter', 'random_common_cause']
# # r1 = model.refute_estimate(estimand, estimate, method_name="bootstrap_refuter")
# # r2 = model.refute_estimate(estimand, estimate, method_name="data_subset_refuter")
# # r3 = model.refute_estimate(estimand, estimate, method_name="dummy_outcome_refuter")
# # r4 = model.refute_estimate(estimand, estimate, method_name="placebo_treatment_refuter")
# # r5 = model.refute_estimate(estimand, estimate, method_name="random_common_cause")
# for refuters in refuter_list:
#     refute_results = ref.causal_refuter(model, estimand, estimate, method_name=refuters)
#     print(refute_results)
#     print('*'*100)

In [151]:
print(df)

# Feature subset compared against the full frame in the evaluation below
# ('class' must stay in the list — it is the target column).
# Other subsets the author tried:
# filter_col = ['acl', 'spd', 'class']
# filter_col = ['sendTime', 'messageID', 'class']
# filter_col = ['senderPseudo', 'class']
# filter_col = ['senderPseudo', 'acl', 'spd', 'class']
filter_col = ['senderPseudo','pos', 'acl', 'spd', 'hed', 'class']
df_filtered = df[filter_col].copy()
print(df_filtered)

# Probe for XGBoost: any failure while importing it just disables that model
# instead of aborting the cell.
HAS_XGB = False
try:
    from xgboost import XGBClassifier
except Exception:
    print("No XGBoost available")
else:
    HAS_XGB = True

def evaluate_plain(df_like, name, models, n_splits=5):
    """Cross-validate every estimator in ``models`` on ``df_like`` and tabulate the scores.

    Rows whose ``'class'`` value is missing are dropped, the target is cast to
    ``int``, and every remaining column is used as a feature. Each estimator is
    scored from out-of-fold predictions produced by ``cross_val_predict`` over a
    shuffled ``StratifiedKFold`` (seed 42). A per-model report (accuracy,
    recall, F1, confusion matrix, classification report) is printed as a side
    effect.

    Parameters
    ----------
    df_like : pandas.DataFrame
        Feature columns plus a ``'class'`` target column.
    name : str
        Label for this feature set; used in the printed headers, in the error
        message, and as the ``FeatureSet`` index level of the result.
    models : dict
        Mapping of display name -> unfitted scikit-learn style estimator.
        May be empty, in which case an empty summary is returned.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by ``(FeatureSet, Model)``, with columns
        ``Accuracy``, ``Recall`` and ``F1``.

    Raises
    ------
    ValueError
        If ``df_like`` has no ``'class'`` column.
    """
    if 'class' not in df_like.columns:
        raise ValueError(f"{name} No target col 'class'")

    data = df_like.dropna(subset=['class']).copy()
    y = data['class'].astype(int).values
    X = data.drop(columns=['class'])

    # One splitter shared by all models so every estimator sees the same folds.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    rows = []
    for model_name, est in models.items():
        # Out-of-fold predictions: every sample is predicted by a model that
        # did not see it during fitting.
        y_pred = cross_val_predict(est, X, y, cv=cv, method='predict', n_jobs=-1)

        acc = accuracy_score(y, y_pred)
        # zero_division=0 reports 0 instead of warning when a class is never predicted.
        rec = recall_score(y, y_pred, zero_division=0)
        f1  = f1_score(y, y_pred, zero_division=0)

        cm = confusion_matrix(y, y_pred)
        print(f"\n[{name}] {model_name}")
        print(f"Accuracy: {acc:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
        print("Confusion Matrix [[TN, FP], [FN, TP]]:")
        print(cm)
        print("\nClassification Report:")
        print(classification_report(y, y_pred, digits=4, zero_division=0))

        rows.append({
            'FeatureSet': name,
            'Model': model_name,
            'Accuracy': acc,
            'Recall': rec,
            'F1': f1
        })

    # Name the columns explicitly: with an empty `models` mapping the frame
    # would otherwise have no columns at all and set_index would raise KeyError.
    summary_columns = ['FeatureSet', 'Model', 'Accuracy', 'Recall', 'F1']
    return pd.DataFrame(rows, columns=summary_columns).set_index(['FeatureSet','Model'])

# Registry of classifiers to compare, keyed by the name shown in the reports.
models = {}
models['LogisticRegression'] = LogisticRegression(max_iter=1000, solver='lbfgs')
models['DecisionTree(max_depth=6)'] = DecisionTreeClassifier(max_depth=6, random_state=42)

# XGBoost joins the comparison only when the import probe above succeeded.
if HAS_XGB:
    xgb_params = dict(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.08,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
    )
    models['XGBoost'] = XGBClassifier(**xgb_params)

# Evaluate the same models on all columns and on the reduced feature subset,
# then print both summary tables.
res_full = evaluate_plain(df, name='Full', models=models)
res_flt  = evaluate_plain(df_filtered, name='Filtered', models=models)
for summary_table in (res_full, res_flt):
    print(summary_table)

# summary = pd.concat([res_full, res_flt]).reset_index()

# print("\n=== Summary Table ===")
# print(summary.pivot(index='Model', columns='FeatureSet', values=['Accuracy','Recall','F1']).round(4))


           sendTime  senderPseudo  messageID       pos       spd       acl  \
28678   72524.93554     101314037  427449123  1.177110  1.905615  0.053295   
200107  25997.45431      10126152   24069906  2.795565  0.062502  0.005667   
430083  51456.19907      10747875  227970587  0.889361  1.033074  1.867734   
672050  63495.90880      10977316  294191998  1.685966  1.864182  0.313507   
598240  62513.11618      10942036  281163403  1.820064  2.141214  0.343610   
...             ...           ...        ...       ...       ...       ...   
867493  57363.97413      10824615  246269618  1.185938  1.932187  0.352998   
611642  62679.81393      10948636  282448261  1.032245  1.947845  0.062069   
255116  26914.40500      10152432   32398993  1.403602  0.342722  1.299022   
907233  40395.11123      20524134  159076506  1.373236  0.062503  0.005626   
807673  54925.09253      10797675  240617007  1.541126  1.599667  0.088822   

             hed  class  
28678   1.402316      0  
200107  0.1