In [37]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [30]:
def dtw_distance(sequence1, sequence2):
    """Return the dynamic time warping (DTW) distance between two sequences.

    The local cost of aligning two elements is their absolute difference, and
    the result is the minimum accumulated cost of a warping path that starts
    at the first elements and ends at the last elements of both sequences.

    Args:
        sequence1: Indexable sequence of numbers (supports ``len`` and ``[]``).
        sequence2: Indexable sequence of numbers (supports ``len`` and ``[]``).

    Returns:
        The accumulated alignment cost as a NumPy float. Two empty sequences
        give ``0.0``; exactly one empty sequence gives ``inf`` because no
        warping path exists.
    """
    rows, cols = len(sequence1), len(sequence2)

    # Accumulated-cost table with an extra border row/column. The border is
    # +inf everywhere except the origin, so every path must start at (0, 0).
    acc = np.full((rows + 1, cols + 1), np.inf)
    acc[0, 0] = 0

    # Fill the table row by row.
    for r in range(1, rows + 1):
        left_value = sequence1[r - 1]
        for c in range(1, cols + 1):
            step_cost = abs(left_value - sequence2[c - 1])
            cheapest_predecessor = min(
                acc[r - 1, c],      # advance in sequence1 only ("insertion")
                acc[r, c - 1],      # advance in sequence2 only ("deletion")
                acc[r - 1, c - 1],  # advance in both ("match")
            )
            acc[r, c] = step_cost + cheapest_predecessor
    return acc[rows, cols]

In [None]:
# Load the labeled flows. The "ts_combined" column is stored as text in the
# CSV, so each cell is parsed back into a Python literal with literal_eval
# (presumably a list of numbers -- confirm against the CSV producer).
df = pd.read_csv("tcp_flows_labeled.csv")
df["ts_combined"] = df["ts_combined"].apply(literal_eval)
flows = df["ts_combined"].tolist()

# Pairwise DTW distance matrix. Only pairs with row < col are computed and the
# value is mirrored into the lower triangle; the diagonal stays at 0.
# This performs N*(N-1)/2 DTW evaluations, so it is slow for large N.
N = len(flows)
dist_matrix = np.zeros((N, N))
for row, flow_a in enumerate(flows):
    for col in range(row + 1, N):
        pair_dist = dtw_distance(flow_a, flows[col])
        dist_matrix[row, col] = pair_dist
        dist_matrix[col, row] = pair_dist

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Cluster the flows directly from the precomputed DTW distance matrix using
# average-linkage agglomerative clustering.
n_clusters = 5
cluster_kwargs = dict(n_clusters=n_clusters, linkage="average")
try:
    # scikit-learn >= 1.2 calls the distance option `metric`; the old
    # `affinity` keyword was deprecated in 1.2 and removed in 1.4, where
    # passing it raises TypeError.
    clustering = AgglomerativeClustering(metric="precomputed", **cluster_kwargs)
except TypeError:
    # Older scikit-learn releases only understand the legacy `affinity` name.
    clustering = AgglomerativeClustering(affinity="precomputed", **cluster_kwargs)

# fit_predict returns one cluster label per row of dist_matrix; rows are in
# the same order as df, so the labels can be stored as a new column.
df["cluster"] = clustering.fit_predict(dist_matrix)

In [None]:
# Pick a leader for every cluster: the member whose summed DTW distance to all
# members of the same cluster is smallest (ties go to the earliest member).
# NOTE(review): index labels of df are used as positions into dist_matrix;
# this assumes df still has the default 0..N-1 index from read_csv -- confirm.
leaders = {}
for cluster_id in range(n_clusters):
    members = df.index[df["cluster"] == cluster_id].tolist()
    # Total distance from each member to every member of its cluster.
    totals = [sum(dist_matrix[a][b] for b in members) for a in members]
    medoid = members[totals.index(min(totals))]
    leaders[cluster_id] = df.loc[medoid, "ts_combined"]


In [None]:
# Build one feature vector per action: a histogram of how many of the action's
# flows fall closest to each cluster leader.
# NOTE(review): grouping by "action" yields a single row per distinct action
# value. If "action" is the class label (not a per-sample id), every class has
# exactly one sample for the classifier below -- confirm intended granularity.
X, y = [], []

grouped = df.groupby("action")
for action_label, action_flows in grouped:
    histogram = [0] * n_clusters
    for ts in action_flows["ts_combined"]:
        # Re-assign the flow via its nearest leader instead of reusing the
        # stored cluster label, to smooth over clustering errors.
        leader_distances = {
            cluster_id: dtw_distance(ts, leader_ts)
            for cluster_id, leader_ts in leaders.items()
        }
        nearest = min(leader_distances, key=leader_distances.get)
        histogram[nearest] += 1
    X.append(histogram)
    y.append(action_label)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# and the forest reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# plain-text table, so it is printed rather than shown as a cell repr.
report = classification_report(y_test, y_pred)
print(report)