In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

def preprocess_data(df, label_col="Label"):
    """Split a raw frame into min-max-scaled features and a label array.

    Column names are whitespace-stripped before the label lookup. The
    stripping is applied to a relabelled frame, so the caller's DataFrame
    keeps its original column names.

    Every non-label column is coerced to numeric. Values that cannot be
    parsed, NaN and +/-inf are all replaced by 0 before scaling, so a purely
    textual feature column ends up constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data holding the label column plus the feature columns.
    label_col : str, default "Label"
        Name of the label column, matched against the stripped column names.

    Returns
    -------
    X_scaled : numpy.ndarray
        Feature matrix after ``MinMaxScaler().fit_transform``.
    y : numpy.ndarray
        Values of the label column, unmodified.

    Raises
    ------
    ValueError
        If ``label_col`` is not among the stripped column names.
    """
    # Relabel a new object rather than assigning to ``df.columns`` so the
    # caller's frame is not renamed as a hidden side effect.
    df = df.set_axis(df.columns.str.strip(), axis=1)
    if label_col not in df.columns:
        raise ValueError(f"Label column '{label_col}' not found in dataframe!")

    X = df.drop(columns=label_col)
    y = df[label_col].values

    # numeric conversion + cleaning: unparseable cells become NaN, then
    # NaN and infinities are all mapped to 0 so the scaler never sees them.
    X = X.apply(pd.to_numeric, errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

def enn_test(X, y, k=5):
    """Measure how often a sample disagrees with its k nearest neighbours.

    For every sample the k nearest *other* samples are looked up and their
    labels are majority-voted; the sample counts as misclassified when the
    vote differs from its own label. The sample itself is excluded from its
    neighbourhood, otherwise its own zero-distance vote would always support
    its label and bias the rate downward.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels aligned with the rows of ``X``.
    k : int, default 5
        Number of neighbours taking part in the vote.

    Returns
    -------
    mis_rate : float
        Fraction of all samples whose neighbourhood vote differs from their label.
    per_class : dict
        Maps each distinct label to the misclassification fraction among the
        samples carrying that label.

    Raises
    ------
    ValueError
        If there are not more than ``k`` samples, since each sample needs
        ``k`` neighbours other than itself.
    """
    y = np.asarray(y)
    n_samples = y.shape[0]
    if n_samples <= k:
        raise ValueError(
            f"enn_test needs more than k={k} samples, got {n_samples}"
        )

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)

    # With no query argument, kneighbors() returns the neighbours of each
    # fitted point and does not count the point as its own neighbour.
    neighbor_idx = knn.kneighbors(return_distance=False)

    # Majority vote over the neighbours' labels; argmax resolves ties in
    # favour of the class that sorts first.
    classes, y_codes = np.unique(y, return_inverse=True)
    neighbor_codes = y_codes[neighbor_idx]
    votes = np.stack(
        [(neighbor_codes == code).sum(axis=1) for code in range(len(classes))],
        axis=1,
    )
    y_pred = classes[votes.argmax(axis=1)]

    mis_rate = np.mean(y_pred != y)
    per_class = {cls: np.mean(y_pred[y == cls] != cls) for cls in classes}
    return mis_rate, per_class

# Batch driver: walk every CSV one directory level below `folder`, compute the
# per-class ENN misclassification rate and collect one summary row per
# (folder, file, class).
folder = "Awid_CSV"
MAX_ROWS = 200000      # files larger than this are down-sampled to bound runtime
SAMPLE_SEED = 42       # fixed seed so the down-sampling is repeatable
ENN_K = 5              # neighbours used in the ENN vote
SUMMARY_COLUMNS = ["Folder", "File", "Class", "ENN_misrate"]
results = []

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order and the row order of the summary CSV reproducible between runs.
for subfolder in sorted(os.listdir(folder)):
    subpath = os.path.join(folder, subfolder)
    if not os.path.isdir(subpath):
        continue

    for file in sorted(os.listdir(subpath)):
        if not file.lower().endswith(".csv"):
            continue

        filepath = os.path.join(subpath, file)
        print(f"\n=== Processing {file} (folder: {subfolder}) ===")

        try:
            df = pd.read_csv(filepath)

            # OPTIONAL: sample data to reduce runtime
            if len(df) > MAX_ROWS:
                df = df.sample(MAX_ROWS, random_state=SAMPLE_SEED)

            X_scaled, y = preprocess_data(df, label_col="Label")
            _, per_class = enn_test(X_scaled, y, k=ENN_K)

            for cls in np.unique(y):
                results.append({
                    "Folder": subfolder,
                    "File": file,
                    "Class": cls,
                    "ENN_misrate": per_class.get(cls, np.nan),
                })

        except Exception as e:
            # Deliberately best-effort: one unreadable or label-less file is
            # reported and skipped so the rest of the batch still runs.
            print(f" Error with {file}: {e}")

# Explicit columns keep the schema stable even when no file produced a row,
# so code reading `summary_df` does not hit a KeyError on an empty frame.
summary_df = pd.DataFrame(results, columns=SUMMARY_COLUMNS)
summary_df.to_csv("AWID_ENN_Summary.csv", index=False, encoding="utf-8")
print("Saved AWID_ENN_Summary.csv")




In [None]:
# Render three views of `summary_df` and store each as a PNG under `plot_dir`.
plot_dir = "awid_plots"
os.makedirs(plot_dir, exist_ok=True)


def _finish_figure(fig, filename):
    """Tighten the layout, save the figure into `plot_dir` at 300 dpi, then show it."""
    fig.tight_layout()
    fig.savefig(os.path.join(plot_dir, filename), dpi=300)
    plt.show()


# Mean misclassification per class (error bars: standard deviation)
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=summary_df, x="Class", y="ENN_misrate", errorbar="sd", ax=bar_ax)
bar_ax.set_title("AWID - Mean ENN Misclassification Rate per Class")
plt.setp(bar_ax.get_xticklabels(), rotation=45)
_finish_figure(bar_fig, "ENN_misrate_barplot.png")

# Heatmap (Class × Folder): mean rate for every class/folder combination
pivot = summary_df.pivot_table(
    index="Class",
    columns="Folder",
    values="ENN_misrate",
    aggfunc="mean",
)
heat_fig, heat_ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    pivot,
    annot=True,
    fmt=".2f",
    cmap="magma",
    cbar_kws={'label': 'Mean ENN Misrate'},
    ax=heat_ax,
)
heat_ax.set_title("AWID - ENN Misclassification Heatmap (by Attack Folder)")
_finish_figure(heat_fig, "ENN_heatmap.png")

# Boxplot (distribution across files per class)
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=summary_df, x="Class", y="ENN_misrate", ax=box_ax)
box_ax.set_title("AWID - Distribution of ENN Misclassification Rates per Class")
plt.setp(box_ax.get_xticklabels(), rotation=45)
_finish_figure(box_fig, "ENN_boxplot.png")
