In [None]:
# Mount Google Drive so files under /content/drive are accessible from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import sys
import pickle as pickle
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn import tree
import re
from statistics import mode
import random
import warnings
#-------------------------------------------------------------------------------------------------
# 關掉新版 (>=1.5) 的 PerformanceWarning
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

# 關掉舊版 (<1.5) 的 PerformanceWarning
try:
    from pandas.core.common import PerformanceWarning   # 對舊版才 import 得到
    warnings.filterwarnings("ignore", category=PerformanceWarning)
except ImportError:
    # 新版 pandas 沒這個 symbol，忽略即可
    pass
#--------------------------------------------------------------------------------------------------
# Fix NumPy's global random seed.
np.random.seed(42)

## Load the trained model ##
# NOTE(review): read_pickle unpickles arbitrary Python objects; only load model files you trust.
clf = pd.read_pickle('/content/drive/MyDrive/my_method/My_method_model.sav') ## replace with saved RF model file in .sav format

## List the features the model was fitted with
feature_names = clf.feature_names_in_
print(feature_names)

## 定義函式
### 從模型中，萃取出每棵決策樹中所有的分裂規則（split conditions），整理成一個pandas DataFrame供後續使用。
def get_splits(forest, feature_names):
    """Collect the split nodes of every tree in *forest* into one DataFrame.

    Args:
        forest: Fitted ensemble exposing ``estimators_``; each estimator must
            expose a scikit-learn style ``tree_`` structure.
        feature_names: Sequence mapping a feature index to its name.

    Returns:
        DataFrame with columns ``Tree``, ``NodeID``, ``LeftID``, ``RightID``,
        ``Threshold`` and ``Feature``; one row per internal (split) node.
        The frame is empty, but still has these columns, when no tree splits.
    """
    columns = ["Tree", "NodeID", "LeftID", "RightID", "Threshold", "Feature"]
    rows = []
    for tree_id, estimator in enumerate(forest.estimators_):
        tree = estimator.tree_
        for node_id in range(tree.node_count):
            left_id = tree.children_left[node_id]
            right_id = tree.children_right[node_id]
            # A leaf has identical child ids.  Testing the children instead of
            # the threshold sentinel keeps a genuine split at -2.0 and avoids
            # looking up a feature name for leaves (whose feature index is -2).
            if left_id == right_id:
                continue
            rows.append([
                tree_id,
                node_id,
                left_id,
                right_id,
                tree.threshold[node_id],
                feature_names[tree.feature[node_id]],
            ])
    # Passing the columns to the constructor also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

## 根據特定特徵在多棵決策樹中的切分點，產生對應區間的二進位編碼表，用於後續在 P4 中作為特徵匹配的 codeword。
def get_feature_table(splits_data, feature_name):
    """Build the per-range bit table for one feature across all trees.

    Every split on *feature_name* becomes a column ``s<tree>_<node>``.  Every
    distinct integer threshold becomes a row, plus one extra row
    (largest threshold + 1) standing for values above every split.  A cell is
    0 when the row's threshold is <= the column's split threshold (the value
    goes left at that split) and 1 otherwise.

    Args:
        splits_data: DataFrame with at least the columns ``Tree``, ``NodeID``,
            ``Threshold`` and ``Feature``.
        feature_name: Name of the feature to tabulate.

    Returns:
        DataFrame whose first column is ``Threshold`` (int64), followed by one
        0/1 column per split, ordered by ascending threshold.

    Raises:
        ValueError: If no split uses *feature_name*.
    """
    feature_data = (
        splits_data[splits_data["Feature"] == feature_name]
        .sort_values(by="Threshold")
        .reset_index(drop=True)
    )
    if feature_data.empty:
        raise ValueError(
            f"Feature {feature_name!r} is not used by any split; "
            "no code table can be built."
        )

    # For integer-valued features `x <= t` is equivalent to `x <= floor(t)`.
    # floor() (not truncation toward zero) keeps this true for negative
    # thresholds, and int64 is explicit so large values do not depend on the
    # platform's default integer width.
    thresholds = np.floor(
        feature_data["Threshold"].to_numpy(dtype=float)
    ).astype(np.int64)

    # Extra last row: values above the largest threshold.
    row_thresholds = np.append(thresholds, thresholds.max() + 1)

    # Assemble all columns first and build the frame once; inserting columns
    # one by one fragments the frame.
    columns = {"Threshold": row_thresholds}
    for tree_id, node, split_threshold in zip(
            feature_data["Tree"], feature_data["NodeID"], thresholds):
        columns[f"s{tree_id}_{node}"] = np.where(
            row_thresholds <= split_threshold, 0, 1)

    code_table = pd.DataFrame(columns)
    # Splits sharing an integer threshold produce identical rows; keep one.
    code_table = code_table.drop_duplicates(subset=["Threshold"])
    return code_table.reset_index(drop=True)

### 根據特徵切分表產生每棵樹的 codeword 編碼，並為每一筆資料對應出其數值範圍（Range），以利在 P4 中建構 table_entry。
def get_feature_codes_with_ranges(feature_table, num_of_trees):
    """Derive per-tree codewords and the value range of every table row.

    Args:
        feature_table: Table with a ``Threshold`` column and 0/1 split columns
            named ``s<tree>_<node>``.  It is not modified.
        num_of_trees: Number of trees; one ``code<tree>`` column is produced
            for each id in ``range(num_of_trees)``.

    Returns:
        Tuple ``(ranges, codes)``:
          * ``ranges`` - Series named ``Range`` of ``"lo,hi"`` strings.  The
            first row starts at 0, middle rows start at the previous
            threshold + 1, and the last row is the single point ``"hi,hi"``.
          * ``codes`` - DataFrame with one ``"0b..."`` string column per tree,
            built by concatenating that tree's split bits in column order.
            A tree with no split on the feature gets the bare prefix ``"0b"``.
        Both share the index of *feature_table*.
    """
    codes = pd.DataFrame(index=feature_table.index)
    for tree_id in range(num_of_trees):
        prefix = f"s{tree_id}_"
        # Prefix match: a column belongs to this tree only if its name
        # starts with "s<tree_id>_".
        tree_cols = [col for col in feature_table.columns
                     if str(col).startswith(prefix)]
        col_name = f"code{tree_id}"
        if not tree_cols:
            codes[col_name] = "0b"
            continue
        codes[col_name] = (
            feature_table[tree_cols]
            .apply(lambda row: "".join(row.dropna().astype(str)), axis=1)
            .radd("0b")
        )

    # Work positionally on plain values so the result does not depend on the
    # table having a default 0..n-1 index.
    thresholds = feature_table["Threshold"].tolist()
    last_pos = len(thresholds) - 1
    range_strings = []
    for pos, hi in enumerate(thresholds):
        if pos == 0:
            range_strings.append(f"0,{hi}")
        elif pos == last_pos:
            # NOTE(review): the last row is emitted as a single point; a
            # consumer that wants "everything above" must widen it itself.
            range_strings.append(f"{hi},{hi}")
        else:
            range_strings.append(f"{thresholds[pos - 1] + 1},{hi}")

    ranges = pd.Series(range_strings, index=feature_table.index,
                       name="Range", dtype=object)
    return ranges, codes

### 從一棵決策樹 (estimator) 中提取所有從根節點到葉節點的路徑，每條路徑表示一條決策規則（branch）。
def retrieve_branches(estimator):
    """Yield every root-to-leaf path of a fitted decision tree.

    Paths are yielded in ascending order of the leaf's node id.  Each path is
    a list of node ids starting at the root (0) and ending at the leaf.  A
    tree consisting of a single leaf yields the one-node path ``[0]``.

    Args:
        estimator: Object exposing ``tree_`` with ``node_count``,
            ``children_left`` and ``children_right``.

    Yields:
        list[int]: Node ids along one branch, root first.
    """
    tree = estimator.tree_
    children_left = tree.children_left
    children_right = tree.children_right
    node_count = tree.node_count

    # One pass to record each node's parent; a path is then rebuilt by walking
    # upwards from the leaf instead of rescanning all partial paths per node.
    parent_of = {}
    for node in range(node_count):
        left, right = children_left[node], children_right[node]
        if left != right:  # internal node (a leaf has identical child ids)
            parent_of[int(left)] = node
            parent_of[int(right)] = node

    for node in range(node_count):
        if children_left[node] != children_right[node]:
            continue
        path = [node]
        while path[-1] in parent_of:
            path.append(parent_of[path[-1]])
        if path[-1] != 0:
            # Leaf that is not connected to the root: not part of any branch.
            continue
        path.reverse()
        yield path

## 取得分類結果 & 分類的可信度
def get_classes(clf):
    """Return the predicted class and its certainty for every leaf of a tree.

    Leaves are visited in the order produced by ``retrieve_branches``.

    Args:
        clf: Fitted tree estimator exposing ``tree_``.

    Returns:
        Tuple ``(classes, certainties)`` of equal-length lists: the index of
        the largest entry of the leaf's value vector, and that entry's share
        of the vector's sum as a whole percentage (0 when the sum is zero).
    """
    tree = clf.tree_
    classes = []
    certainties = []
    for branch in retrieve_branches(clf):
        leaf = branch[-1]
        if tree.n_outputs == 1:
            value = tree.value[leaf][0]
        else:
            # NOTE(review): multi-output handling kept as in the original;
            # confirm that the first column is the intended slice.
            value = tree.value[leaf].T[0]
        total = float(np.sum(value))
        top = float(np.max(value))
        # Round the percentage itself: round(p, 2) * 100 can land just below
        # the integer (e.g. 0.57 -> 56.999...) and then truncate to 56.
        certainty = int(round(100.0 * top / total)) if total else 0
        classes.append(np.argmax(value))
        certainties.append(certainty)
    return classes, certainties

## 將每一條「從 root 到葉節點的路徑」轉換成由左右子樹構成的「bit 路徑碼」（0=左、1=右），可用來還原 decision tree 中的路徑結構，通常搭配 ternary 編碼。
def get_leaf_paths(clf):
    """Encode every root-to-leaf branch as a list of direction bits.

    Args:
        clf: Fitted tree estimator exposing ``tree_``.

    Returns:
        One list per branch (same order as ``retrieve_branches``); entry *i*
        is 0 when the step from node *i* to node *i + 1* of the branch goes to
        the left child and 1 when it goes to the right child.
    """
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    branch_codes = []
    for branch in retrieve_branches(clf):
        bits = []
        for parent, child in zip(branch, branch[1:]):
            # Left is checked first, so anything that is not a right-child
            # step defaults to 0.
            went_right = (child != children_left[parent]
                          and child == children_right[parent])
            bits.append(1 if went_right else 0)
        branch_codes.append(bits)
    return branch_codes

## 依據特徵名稱的順序，整理所有出現在決策樹中的節點（node）ID，並依據 threshold 由小到大排序。
def get_order_of_splits(data, feature_names):
    """List split node ids grouped by feature and sorted by threshold.

    Args:
        data: DataFrame with at least the columns ``NodeID``, ``Threshold``
            and ``Feature``.
        feature_names: Features to visit, in the desired output order.

    Returns:
        list: Node ids; for each feature in *feature_names* (in that order)
        the ids of its splits in ascending threshold order.
    """
    splits_order = []
    for feature_name in feature_names:
        # Select columns by name rather than by position so the result does
        # not depend on the column layout of the incoming frame.
        feature_rows = data[data["Feature"] == feature_name]
        feature_rows = feature_rows.sort_values(by="Threshold")
        splits_order.extend(feature_rows["NodeID"].tolist())
    return splits_order

### 用來取得單棵決策樹中所有分裂節點的資訊，以便後續做轉換、對應、bit mapping、P4 table 建構等用途。
def get_splits_per_tree(clf, feature_names):
    """Collect the split nodes of a single decision tree into a DataFrame.

    Args:
        clf: Fitted tree estimator exposing a scikit-learn style ``tree_``.
        feature_names: Sequence mapping a feature index to its name.

    Returns:
        DataFrame with columns ``NodeID``, ``LeftID``, ``RightID``,
        ``Threshold`` and ``Feature``; one row per internal (split) node.
        The frame is empty, but still has these columns, for a tree without
        splits.
    """
    columns = ["NodeID", "LeftID", "RightID", "Threshold", "Feature"]
    tree = clf.tree_
    rows = []
    for node_id in range(tree.node_count):
        left_id = tree.children_left[node_id]
        right_id = tree.children_right[node_id]
        # A leaf has identical child ids.  Testing the children instead of the
        # threshold sentinel keeps a genuine split at -2.0 and avoids looking
        # up a feature name for leaves (whose feature index is -2).
        if left_id == right_id:
            continue
        rows.append([
            node_id,
            left_id,
            right_id,
            tree.threshold[node_id],
            feature_names[tree.feature[node_id]],
        ])
    # Passing the columns to the constructor also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

### 給定一棵決策樹中的某個 內部節點編號，回傳該節點所使用的 特徵在原始特徵清單中的索引（0-based）。
def get_feature_index(node_id, feature_names, estimator):
    """Return the 0-based feature index used by the split at *node_id*.

    Args:
        node_id: Id of an internal node of the tree.
        feature_names: Unused; kept for interface compatibility.
        estimator: Object exposing ``tree_.feature``.

    Returns:
        The entry of ``estimator.tree_.feature`` for *node_id*.

    Raises:
        ValueError: If the entry is -2, i.e. the node is a leaf.
    """
    raw_index = estimator.tree_.feature[node_id]
    if raw_index != -2:
        return raw_index
    # -2 marks a leaf; callers are expected to pass split nodes only.
    raise ValueError(f"Node {node_id} is a leaf; it has no split feature.")

# 比較MASK的精度
def bitcnt(bitstr: str) -> int:
    """Return how many '1' bits the integer literal *bitstr* contains.

    The string is parsed with automatic base detection, so ``0x``, ``0b`` and
    ``0o`` prefixes as well as plain decimal are accepted.
    """
    value = int(bitstr, 0)
    return format(value, "b").count("1")

## 取得拼接好的Codes & Masks
def get_codes_and_masks(est, feature_names, slice_map_t, n_bits=256):
    """Build the ternary (value, mask) pair of every leaf of one tree.

    Each split of the tree owns one bit of the tree's codeword: the splits of
    a feature are ranked by ascending threshold and the split of rank *r*
    sits at bit ``hi - r`` of that feature's ``(hi, lo)`` slice.  This matches
    a codeword string whose first character (most significant bit) belongs
    to the smallest threshold.  For every split node on a branch the mask bit
    is set and the value bit holds the direction taken at that node
    (0 = left, 1 = right); all other bits are wildcards.

    Args:
        est: Fitted tree estimator exposing ``tree_``.
        feature_names: Sequence mapping a feature index to its name.
        slice_map_t: Sequence indexed by feature index giving the inclusive
            ``(hi, lo)`` bit slice reserved for that feature in this tree.
        n_bits: Total width of the codeword; defaults to 256.

    Returns:
        Tuple ``(codes, masks, classes)``: two lists of ``"0b"``-prefixed
        strings of *n_bits* digits (most significant bit first) and the class
        of each leaf, all in ``retrieve_branches`` order.

    Raises:
        ValueError: If a split on a branch has no known rank or does not fit
            into its feature's slice.
    """
    tree = est.tree_
    splits = get_order_of_splits(
        get_splits_per_tree(est, feature_names), feature_names)

    # Rank of every split among this tree's splits on the same feature.
    # `splits` is grouped by feature and ordered by ascending threshold.
    rank_of_split = {}
    next_rank = {}
    for node in splits:
        feat_idx = int(tree.feature[node])
        rank = next_rank.get(feat_idx, 0)
        rank_of_split[int(node)] = rank
        next_rank[feat_idx] = rank + 1

    codes = []
    masks = []
    for branch in retrieve_branches(est):
        code_bits = ["0"] * n_bits
        mask_bits = ["0"] * n_bits

        # Pair every internal node with the child actually taken, so each
        # decision (including the root's) is stored under its own node.
        for parent, child in zip(branch, branch[1:]):
            parent = int(parent)
            if parent not in rank_of_split:
                raise ValueError(
                    f"Split node {parent} is missing from the split order.")
            feat_idx = int(tree.feature[parent])
            hi, lo = slice_map_t[feat_idx]
            bit = hi - rank_of_split[parent]
            if bit < lo:
                raise ValueError(
                    f"Split node {parent} does not fit into slice "
                    f"[{hi}:{lo}] of feature {feat_idx}.")
            mask_bits[bit] = "1"
            code_bits[bit] = (
                "1" if child == tree.children_right[parent] else "0")

        # Index 0 of the lists is bit 0, so reverse to print the MSB first.
        codes.append("0b" + "".join(reversed(code_bits)))
        masks.append("0b" + "".join(reversed(mask_bits)))

    classes, _ = get_classes(est)
    return codes, masks, classes

## End of model manipulation ##

# ──────────────────────────────────────────────
# A. Count how many codeword bits (one per split) every tree needs
n_trees = len(clf.estimators_)
bits_per_tree = [
    len(get_order_of_splits(
        get_splits_per_tree(tree_est, feature_names), feature_names))
    for tree_est in clf.estimators_
]
print("[INFO] bits_per_tree =", bits_per_tree)      # e.g. [112, 132, 112]

# B. Hand out bit slices from the most significant bit downwards
MAX_BITS   = 256           # width of the codeword field reserved per tree
high_start = MAX_BITS - 1  # 255
offsets    = [0 for _ in bits_per_tree]        # bits already used per tree (counted from the MSB)
slice_map  = {t: [] for t, _ in enumerate(bits_per_tree)}

all_splits = get_splits(clf, feature_names)    # computed once, reused below

for feat_id, feat_name in enumerate(feature_names):
    feature_table = get_feature_table(all_splits, feat_name)
    ranges, codes = get_feature_codes_with_ranges(feature_table, n_trees)
    # Codeword length of this feature in each tree ("0b" prefix excluded)
    lengths = [len(codes.iat[0, col]) - 2 for col in range(n_trees)]
    for t, ln in enumerate(lengths):
        hi = high_start - offsets[t]
        lo = hi - ln + 1
        slice_map[t].append((hi, lo))
        offsets[t] += ln
        if offsets[t] > MAX_BITS:
            raise ValueError(f"Tree-{t} 已用 {offsets[t]} bits (>256)，"
                             "請改雙 key 或哈希壓縮")

print("===== slice_map (hi:lo, 高→低) =====")
for t, tree_slices in slice_map.items():
    for feat_id, (hi, lo) in enumerate(tree_slices):
        print(f"Tree-{t:<2}  feature{feat_id:<2} → [{hi}:{lo}]  ({hi - lo + 1} bit)")
print("===== end slice_map =====\n")

# C. Emit the matching SetCode actions (to be pasted into the P4 program).
#    Works for any number of trees in slice_map, not only three.
print("===== SetCode actions  (copy to P4) =====")
for feat_id in range(len(feature_names)):
    # (tree id, hi, lo) of every tree that reserves at least one bit for this
    # feature; zero-width slices are skipped because `bit<0>` is not a usable
    # parameter and the table entries omit zero-length codes as well.
    used_slices = [
        (t, slice_map[t][feat_id][0], slice_map[t][feat_id][1])
        for t in slice_map
        if slice_map[t][feat_id][0] - slice_map[t][feat_id][1] + 1 > 0
    ]
    params = ", ".join(f"bit<{hi - lo + 1}> code{t}" for t, hi, lo in used_slices)
    print(f"action SetCode{feat_id}({params}) {{")
    for t, hi, lo in used_slices:
        print(f"    meta.codeword{t}[{hi}:{lo}] = code{t};")
    print("}\n")
print("===== end SetCode =====\n")
# ──────────────────────────────────────────────
import itertools

priority_feature = 1
priority_code    = 1
priority_vote    = 1

entries_path = "/content/drive/MyDrive/my_method/My_method_entries.txt"
n_trees = len(clf.estimators_)

with open(entries_path, "w") as f:
    # ── 1. Clear all tables ──
    for feat_id in range(len(feature_names)):
        print(f"table_clear MyIngress.table_feature{feat_id}", file=f)
    for tree_id in range(n_trees):
        print(f"table_clear MyIngress.code_table{tree_id}", file=f)
    print("table_clear MyIngress.voting_table\n", file=f)

    # ── 2. Feature → codeword tables ──
    all_splits = get_splits(clf, feature_names)

    for feat_id, feat_name in enumerate(feature_names):
        ranges, codes = get_feature_codes_with_ranges(
            get_feature_table(all_splits, feat_name),
            n_trees
        )

        # One table_add per value range; each row carries one code per tree.
        for seg, row_codes in zip(ranges, codes.itertuples(index=False, name=None)):
            lo, hi = seg.split(",")
            # NOTE(review): the last range comes back as a single point
            # ("hi,hi"); values above it match no entry unless the P4 side
            # provides a default - confirm whether it should extend to the
            # field's maximum value.
            key = f"{lo}->{hi}"

            # Pass a code only for trees that actually use this feature
            # (a bare "0b" means zero bits).
            cd_parts = [code_bin for code_bin in row_codes if len(code_bin) > 2]
            cd_str = " ".join(cd_parts)

            print(f"table_add MyIngress.table_feature{feat_id} "
                  f"SetCode{feat_id} {key} => {cd_str} {priority_feature}",
                  file=f)
        print("", file=f)

    # ── 3. Classify the assembled codeword of every tree ──
    for tree_id, est in enumerate(clf.estimators_):
        vals, masks, classes = get_codes_and_masks(
            est, feature_names, slice_map[tree_id])          # slices of this tree

        # Order by mask population: the more bits fixed, the earlier the entry.
        entries = sorted(zip(vals, masks, classes),
                         key=lambda entry: bitcnt(entry[1]),
                         reverse=True)

        # Priorities count down from len(entries) to 1 in that order.
        prio = len(entries)
        for v, m, c in entries:
            print(f"table_add MyIngress.code_table{tree_id} "
                  f"SetClass{tree_id} {v}&&&{m} => {c} {prio}",
                  file=f)
            prio -= 1
        print("", file=f)

    # ── 4. Voting table (binary classes): majority over every vote combination ──
    for votes in itertools.product((0, 1), repeat=n_trees):
        res = mode(votes)
        vote_key = " ".join(str(v) for v in votes)
        print(f"table_add MyIngress.voting_table set_final_class {vote_key} => {res}", file=f)

# Report the file that was actually written.
print(f"[OK] BMv2 CLI commands written to {entries_path}")

['dst_ip' 'pkt_len' 'dstport' 'srcport' 'ack_flag' 'syn_flag']
[INFO] bits_per_tree = [88, 74, 130]
===== slice_map (hi:lo, 高→低) =====
Tree-0   feature0  → [255:247]  (9 bit)
Tree-0   feature1  → [246:232]  (15 bit)
Tree-0   feature2  → [231:204]  (28 bit)
Tree-0   feature3  → [203:177]  (27 bit)
Tree-0   feature4  → [176:172]  (5 bit)
Tree-0   feature5  → [171:168]  (4 bit)
Tree-1   feature0  → [255:247]  (9 bit)
Tree-1   feature1  → [246:233]  (14 bit)
Tree-1   feature2  → [232:207]  (26 bit)
Tree-1   feature3  → [206:192]  (15 bit)
Tree-1   feature4  → [191:185]  (7 bit)
Tree-1   feature5  → [184:182]  (3 bit)
Tree-2   feature0  → [255:235]  (21 bit)
Tree-2   feature1  → [234:207]  (28 bit)
Tree-2   feature2  → [206:167]  (40 bit)
Tree-2   feature3  → [166:138]  (29 bit)
Tree-2   feature4  → [137:132]  (6 bit)
Tree-2   feature5  → [131:126]  (6 bit)
===== end slice_map =====

===== SetCode actions  (copy to P4) =====
action SetCode0(bit<9> code0, bit<9> code1, bit<21> code2) {
    m

In [None]:
# Show which P4 feature table index corresponds to which model feature name.
for idx, name in enumerate(feature_names):
    print(f"[MAP] table_feature{idx}  <->  {name}")

[MAP] table_feature0  <->  dst_ip
[MAP] table_feature1  <->  pkt_len
[MAP] table_feature2  <->  dstport
[MAP] table_feature3  <->  srcport
[MAP] table_feature4  <->  ack_flag
[MAP] table_feature5  <->  syn_flag


In [None]:
# ──────────────────────────────────────────────
# 🛠  列印每棵樹的完整決策規則
def print_tree_rules(forest, feature_names, max_rules=None):
    """Print every root-to-leaf decision rule of every tree in *forest*.

    Args:
        forest: Fitted ensemble exposing ``estimators_``.
        feature_names: Sequence mapping a feature index to its name.
        max_rules: If truthy, stop after this many rules per tree and print a
            truncation marker.
    """
    for tid, est in enumerate(forest.estimators_):
        print(f"\n🌳 Tree {tid}")
        tree = est.tree_
        classes, certs = get_classes(est)

        rules = zip(retrieve_branches(est), get_leaf_paths(est), classes, certs)
        for rule_no, (branch, path_bits, cls, cf) in enumerate(rules, start=1):
            # One condition per internal node: bit 0 means the left child
            # (feature <= threshold), bit 1 the right child.
            conds = []
            for node, bit in zip(branch[:-1], path_bits):   # leaf excluded
                feat = feature_names[tree.feature[node]]
                thr = tree.threshold[node]
                op = "≤" if bit == 0 else ">"
                conds.append(f"{feat} {op} {thr:g}")

            rule_txt = " AND ".join(conds)
            print(f"Rule {rule_no:2}: {rule_txt:<60} ⇒ class {cls} ({cf} %)")

            if max_rules and rule_no >= max_rules:
                print("… (truncated)")
                break

# ────────────────  Print the rules of the loaded model  ────────────────
print_tree_rules(clf, feature_names)



🌳 Tree 0
Rule  1: pkt_len > 66.5 AND dst_ip ≤ 3.23224e+09                      ⇒ class 0 (100 %)
Rule  2: pkt_len ≤ 66.5 AND dst_ip ≤ 3.23224e+09                      ⇒ class 0 (100 %)
Rule  3: pkt_len > 66.5 AND dst_ip > 3.23224e+09 AND dst_ip > 3.23224e+09 ⇒ class 0 (100 %)
Rule  4: pkt_len > 66.5 AND dst_ip > 3.23224e+09 AND dst_ip ≤ 3.23224e+09 AND syn_flag ≤ 0.5 AND srcport ≤ 3263 AND srcport ≤ 416 AND pkt_len ≤ 382.5 AND ack_flag > 0.5 AND pkt_len ≤ 380 ⇒ class 0 (70 %)
Rule  5: pkt_len > 66.5 AND dst_ip > 3.23224e+09 AND dst_ip ≤ 3.23224e+09 AND syn_flag ≤ 0.5 AND srcport ≤ 3263 AND srcport ≤ 416 AND pkt_len ≤ 382.5 AND ack_flag > 0.5 AND pkt_len > 380 ⇒ class 1 (100 %)
Rule  6: pkt_len > 66.5 AND dst_ip > 3.23224e+09 AND dst_ip ≤ 3.23224e+09 AND syn_flag ≤ 0.5 AND srcport ≤ 3263 AND srcport ≤ 416 AND pkt_len ≤ 382.5 AND ack_flag ≤ 0.5 AND pkt_len ≤ 136.5 ⇒ class 1 (59 %)
Rule  7: pkt_len > 66.5 AND dst_ip > 3.23224e+09 AND dst_ip ≤ 3.23224e+09 AND syn_flag ≤ 0.5 AND srcport ≤ 