In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the data
df = pd.read_csv("ecu_cluster_dataset.csv")

# Store ID and status separately
ecu_ids = df["ECU_ID"]
operational_state = df["Operational_State"]

# Features to drop from preprocessing (they're stored separately)
drop_columns = ["ECU_ID", "Operational_State"]

# Define categorical and numerical columns
categorical_cols = [
    "ECU_Type", "Protocol", "Redundancy",
    "Manufacturer", "Software_Version"
]

numerical_cols = [
    "CPU_Speed_MHz", "Memory_MB", "Power_Watts", "Message_Frequency_msgs",
    "Unique_Message_IDs", "Error_Rate_percent", "Response_Time_ms",
    "Network_Topology_Level", "Inter_ECU_Dependencies",
    "Mean_Voltage_V", "Max_Voltage_V", "Bit_Time_us", "Plateau_Time_us"
]

# Define transformers
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

# Apply transformation
X = preprocessor.fit_transform(df.drop(columns=drop_columns))

# `X` is now your feature matrix, ready for similarity search
# Keep original mapping:
processed_df = pd.DataFrame(X.toarray() if hasattr(X, 'toarray') else X)
processed_df["ECU_ID"] = ecu_ids
processed_df["Operational_State"] = operational_state

# Preview
print(processed_df.head())


          0         1         2         3         4         5         6  \
0  1.574289 -0.935771  0.557366  0.110024  0.137500  0.066816  1.479684   
1  0.148467  2.038078  1.469930  0.002157 -0.259135 -0.707272 -0.468978   
2  0.376404 -0.341001 -0.425395  1.404422  1.327406  0.637197 -1.638175   
3  1.423948 -0.341001 -0.565790 -0.483242 -0.655770 -0.951721 -0.079246   
4 -1.039719  0.452025  0.978549  0.919022 -0.655770  1.859442 -1.443309   

          7         8         9  ...   33   34   35   36   37   38   39   40  \
0 -1.320911 -1.487055 -0.357909  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0   
1  1.037859  1.447922 -0.814814  ...  1.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0   
2 -1.320911 -0.313064  1.469711  ...  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0   
3 -1.320911  0.860927  1.469711  ...  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0   
4 -1.320911  0.273931  0.670127  ...  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0   

   ECU_ID  Operational_State  
0  ECU001             Active  
1  ECU

In [4]:
from sklearn.model_selection import train_test_split

# X is already preprocessed above
# Drop ECU_ID and Operational_State before train-test split
X_features = processed_df.drop(columns=["ECU_ID", "Operational_State"])
y_ids = processed_df["ECU_ID"]
y_states = processed_df["Operational_State"]

# Train-test split (adjust test_size as needed)
X_train, X_test, y_train_ids, y_test_ids, y_train_states, y_test_states = train_test_split(
    X_features, y_ids, y_states, test_size=0.2, random_state=42
)

# For clustering/similarity: You often use the whole set, but can still test performance on known outputs
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (120, 41)
Test shape: (30, 41)


In [2]:
from sklearn.neighbors import NearestNeighbors

knn_model = NearestNeighbors(n_neighbors=len(df), metric="cosine")
knn_model.fit(X)

In [3]:
def preprocess_live_input(live_data, original_df):
    # Extract static data
    ecu_id = live_data["ECU_ID"]
    static_row = original_df[original_df["ECU_ID"] == ecu_id].iloc[0].to_dict()

    # Update dynamic fields
    dynamic_fields = [
        "CPU_Speed_MHz", "Power_Watts", "Message_Frequency_msgs",
        "Error_Rate_percent", "Response_Time_ms"
    ]
    for key in dynamic_fields:
        if key in live_data:
            static_row[key] = live_data[key]

    updated_df = pd.DataFrame([static_row])
    return preprocessor.transform(updated_df)

def suggest_from_live_data(live_ecu_data, top_n=3):
    X_live = preprocess_live_input(live_ecu_data, df)
    distances, indices = knn_model.kneighbors(X_live, n_neighbors=len(df))

    suggestions = []
    ecu_id = live_ecu_data["ECU_ID"]
    for dist, i in zip(distances[0], indices[0]):
        candidate_id = df.iloc[i]["ECU_ID"]
        if candidate_id == ecu_id:
            continue
        suggestions.append((candidate_id, dist))
        if len(suggestions) == top_n:
            break

    return suggestions


In [7]:
# Function to evaluate performance (with corrected accuracy calculation)
def evaluate_knn(X_test, y_test_ids, y_test_states, top_n=3):
    correct_same_type = 0
    correct_active = 0
    correct_same_network_type = 0
    total_test_cases = len(X_test)

    for i in range(total_test_cases):
        # Get the test sample and its ECU_ID
        test_sample = X_test.iloc[i].values.reshape(1, -1)
        test_ecu_id = y_test_ids.iloc[i]
        test_state = y_test_states.iloc[i]

        # Get the network type and ECU type of the test sample
        test_network_type = df[df["ECU_ID"] == test_ecu_id]["Network_Topology_Level"].values[0]
        test_ecu_type = df[df["ECU_ID"] == test_ecu_id]["ECU_Type"].values[0]

        # Get top-N nearest neighbors (ECU_ID and distance)
        distances, indices = knn_model.kneighbors(test_sample, n_neighbors=top_n)
        
        # Flags to check if we found a correct match
        matched_same_type = False
        matched_active = False
        matched_same_network_type = False

        # Check for the top-N ECUs if they match type, network type, and operational state
        for j in range(top_n):
            suggested_ecu_id = df.iloc[indices[0][j]]["ECU_ID"]
            suggested_state = df.iloc[indices[0][j]]["Operational_State"]
            suggested_type = df.iloc[indices[0][j]]["ECU_Type"]
            suggested_network_type = df.iloc[indices[0][j]]["Network_Topology_Level"]

            # Check if the suggested ECU has the same type
            if not matched_same_type and test_ecu_type == suggested_type:
                correct_same_type += 1
                matched_same_type = True

            # Check if the suggested ECU is active and has the same operational state
            if not matched_active and test_state == "Active" and suggested_state == "Active":
                correct_active += 1
                matched_active = True

            # Check if the suggested ECU has the same network topology level
            if not matched_same_network_type and test_network_type == suggested_network_type:
                correct_same_network_type += 1
                matched_same_network_type = True

    # Calculate accuracy metrics
    accuracy_same_type = correct_same_type / total_test_cases
    accuracy_active = correct_active / total_test_cases
    accuracy_same_network_type = correct_same_network_type / total_test_cases

    print(f"Accuracy for Same Type Match: {accuracy_same_type:.2f}")
    print(f"Accuracy for Active State Match: {accuracy_active:.2f}")
    print(f"Accuracy for Same Network Type Match: {accuracy_same_network_type:.2f}")

# Evaluate the model on the test data
evaluate_knn(X_test, y_test_ids, y_test_states, top_n=3)


Accuracy for Same Type Match: 1.00
Accuracy for Active State Match: 0.40
Accuracy for Same Network Type Match: 1.00


In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Load the data
df = pd.read_csv("ecu_cluster_dataset.csv")

# Preprocessing (same as before)
ecu_ids = df["ECU_ID"]
operational_state = df["Operational_State"]
drop_columns = ["ECU_ID", "Operational_State"]

categorical_cols = ["ECU_Type", "Protocol", "Redundancy", "Manufacturer", "Software_Version"]
numerical_cols = ["CPU_Speed_MHz", "Memory_MB", "Power_Watts", "Message_Frequency_msgs", 
                  "Unique_Message_IDs", "Error_Rate_percent", "Response_Time_ms", 
                  "Network_Topology_Level", "Inter_ECU_Dependencies", "Mean_Voltage_V", 
                  "Max_Voltage_V", "Bit_Time_us", "Plateau_Time_us"]

categorical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent")),
                                          ("encoder", OneHotEncoder(handle_unknown="ignore"))])

numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean")),
                                        ("scaler", StandardScaler())])

preprocessor = ColumnTransformer(transformers=[("num", numerical_transformer, numerical_cols),
                                               ("cat", categorical_transformer, categorical_cols)])

# Apply transformation
X = preprocessor.fit_transform(df.drop(columns=drop_columns))

# Hierarchical Clustering (Agglomerative Clustering)
clustering_model = AgglomerativeClustering(n_clusters=5, linkage='ward')  # You can change the number of clusters

# Fit the model
clustering_model.fit(X)

# Add cluster labels to the dataframe
df["Cluster_Label"] = clustering_model.labels_

# Now, you can suggest alternatives based on the cluster
def suggest_alternatives_hierarchical(ecu_id_under_attack, top_n=3, same_type=True, only_active=True):
    # Find the index of the ECU
    idx = df[df["ECU_ID"] == ecu_id_under_attack].index[0]
    cluster_label = df.iloc[idx]["Cluster_Label"]

    # Get all ECUs in the same cluster
    cluster_ecus = df[df["Cluster_Label"] == cluster_label]

    # Apply filtering criteria
    suggestions = []
    for _, row in cluster_ecus.iterrows():
        candidate_id = row["ECU_ID"]
        if candidate_id == ecu_id_under_attack:
            continue  # Skip the attacked ECU

        # Optional filters
        if only_active and row["Operational_State"] != "Active":
            continue
        if same_type and row["ECU_Type"] != df.iloc[idx]["ECU_Type"]:
            continue

        suggestions.append(candidate_id)

        if len(suggestions) == top_n:
            break

    return suggestions

# Example of suggesting alternatives for a specific ECU
ecu_under_attack = "ECU001"
alternatives = suggest_alternatives_hierarchical(ecu_under_attack, top_n=3)

print(f"Suggested alternatives for ECU under attack ({ecu_under_attack}):")
print(alternatives)

# Evaluate the clustering (Optional: check the Silhouette Score)
silhouette_avg = silhouette_score(X, clustering_model.labels_)
print(f"Silhouette Score: {silhouette_avg}")


Suggested alternatives for ECU under attack (ECU001):
['ECU109']
Silhouette Score: 0.039512756848427054


In [9]:
# Experiment with different numbers of clusters
cluster_options = [3, 4, 5, 6, 7]

for n_clusters in cluster_options:
    clustering_model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    clustering_model.fit(X)
    
    # Evaluate with silhouette score
    silhouette_avg = silhouette_score(X, clustering_model.labels_)
    print(f"Silhouette Score for {n_clusters} clusters: {silhouette_avg}")
    
    # Add cluster labels to the dataframe for checking suggestions
    df[f"Cluster_Label_{n_clusters}"] = clustering_model.labels_


Silhouette Score for 3 clusters: 0.03500502151634319
Silhouette Score for 4 clusters: 0.03432195561265878
Silhouette Score for 5 clusters: 0.039512756848427054
Silhouette Score for 6 clusters: 0.044328615324279695
Silhouette Score for 7 clusters: 0.04482531187151266
