In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def preprocess_data(df, label_col="Label"):
    df.columns = df.columns.str.strip()
    if label_col not in df.columns:
        raise ValueError(f"Label column '{label_col}' not found in dataframe!")

    X = df.drop(label_col, axis=1, errors="ignore")
    y = df[label_col].values

    # numeric conversion + cleaning
    X = X.apply(pd.to_numeric, errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

def enn_test(X, y, k=5):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    y_pred = knn.predict(X)

    mis_rate = np.mean(y_pred != y)
    per_class = {cls: np.mean(y_pred[y==cls] != cls) for cls in np.unique(y)}
    return mis_rate, per_class

BG_PORTS = {0, 53, 67, 68, 111, 123, 137, 161, 179, 389, 427, 520, 1723, 1900}
def port_test(df, label_col="Label", port_col="Destination Port"):
    if port_col not in df.columns:
        return {cls: np.nan for cls in df[label_col].unique()}  # skip if missing

    results = {}
    for cls, group in df.groupby(label_col):
        total = len(group)
        unclear = group[group[port_col].isin(BG_PORTS)]
        results[cls] = len(unclear)/total if total else 0
    return results


In [None]:
folder = "Awid_CSV" 
results = []

for subfolder in os.listdir(folder):
    subpath = os.path.join(folder, subfolder)
    if not os.path.isdir(subpath):
        continue

    for file in os.listdir(subpath):
        if not file.lower().endswith(".csv"):
            continue

        filepath = os.path.join(subpath, file)
        print(f"\n=== Processing {file} (folder: {subfolder}) ===")

        try:
            df = pd.read_csv(filepath)
            print("Shape:", df.shape)

            # Preprocess
            X_scaled, y = preprocess_data(df, label_col="Label")  # adjust if AWID uses "class"

            # ENN
            _, per_class = enn_test(X_scaled, y, k=5)

            # Port Test
            port_results = port_test(df, label_col="Label")

            # Store results
            for cls in np.unique(y):
                results.append({
                    "Folder": subfolder,
                    "File": file,
                    "Class": cls,
                    "ENN_misrate": per_class.get(cls, np.nan),
                    "PortTest_UGT_C": port_results.get(cls, np.nan),
                })

        except Exception as e:
            print(f" Error with {file}: {e}")


=== Processing Deauth_0.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_1.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_10.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_11.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_12.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_13.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_14.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_15.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_16.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_17.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_18.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_19.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_2.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_20.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_21.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_22.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_23.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_24.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_25.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_26.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_27.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_28.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_29.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)

=== Processing Deauth_3.csv (folder: 1.Deauth) ===


  df = pd.read_csv(filepath)


Shape: (50000, 254)


In [None]:
summary_df = pd.DataFrame(results)
print("\n=== Summary Results ===")
print(summary_df.head(20))

summary_df.to_csv("AWID_ENN_PortTest_Summary.csv", index=False, encoding="utf-8")
print("Saved AWID_ENN_PortTest_Summary.csv")


In [None]:
# 1. ENN misclassification
plt.figure(figsize=(12,6))
sns.barplot(data=summary_df, x="Class", y="ENN_misrate")
plt.title("AWID - ENN Misclassification Rate per Class")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Port Test
plt.figure(figsize=(12,6))
sns.barplot(data=summary_df, x="Class", y="PortTest_UGT_C")
plt.title("AWID - PortTest UGT_C per Class")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Heatmap
pivot = summary_df.pivot_table(index="Class", columns="File", values="ENN_misrate")
plt.figure(figsize=(12,8))
sns.heatmap(pivot, annot=True, fmt=".3f", cmap="viridis")
plt.title("AWID - ENN Misclassification Heatmap")
plt.tight_layout()
plt.show()