# Anomaly Detection Model Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px

# Where the per-node CSV datasets live and how their file names are built
directory = '/Users/zxgan/FYP_Kubernetes/Dataset/'
file_prefix, file_suffix = 'node_node_', '_dataset.csv'
num_files = 50

# One summary record per dataset file is accumulated here
dataset_shapes = []

# Walk through node_node_0 ... node_node_{num_files - 1}
for i in range(num_files):
    file_path = directory + file_prefix + str(i) + file_suffix
    print(f"Processing file: {file_path}")

    # Read the CSV and discard the timestamp and the pod-status columns
    data = pd.read_csv(file_path)
    data = data.drop(columns=[
        'timestamp',
        'pod_status_Pending',
        'pod_status_Running',
        'pod_status_Succeeded',
        'pod_status_Failed',
        'pod_status_Unknown',
    ])

    # Only numeric columns are kept for the split
    numeric_data = data.select_dtypes(include=[np.number])

    # 80/20 split with a fixed seed so the partition is reproducible
    X_train, X_test = train_test_split(numeric_data, test_size=0.2, random_state=42)

    # NOTE: the two "... Set Features" entries hold ROW counts (shape[0]);
    # only "Total Features" is a column count (shape[1]).
    n_train_rows, n_test_rows = len(X_train), len(X_test)
    record = {}
    record["File"] = f"node_node_{i}"
    record["Training Set Features"] = n_train_rows
    record["Testing Set Features"] = n_test_rows
    record["Total Features"] = numeric_data.shape[1]
    dataset_shapes.append(record)

    # Report the (rows, columns) shape of both partitions for this file
    print(f"Training Set Features for file {i}: {X_train.shape}")
    print(f"Testing Set Features for file {i}: {X_test.shape}")

# Collect all per-file records into a single summary frame
shapes_df = pd.DataFrame(dataset_shapes)

# Render the summary as an interactive Plotly table
column_names = list(shapes_df.columns)
table_header = dict(values=column_names, fill_color='paleturquoise', align='left')
table_cells = dict(
    values=[shapes_df[name] for name in column_names],
    fill_color='lavender',
    align='left',
)
fig_table = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig_table.update_layout(title="Dataset Shapes Summary", title_x=0.5)
fig_table.show()

# Persist the summary as CSV for further analysis
summary_csv_path = "/Users/zxgan/FYP_Kubernetes/Anomaly_Detection_summary.csv"
shapes_df.to_csv(summary_csv_path, index=False)
print(f"Summary saved to: {summary_csv_path}")

## Isolation Forest

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
import plotly.graph_objects as go

# Define a custom scoring function
def custom_scorer(estimator, X, y=None):
    """Score a fitted outlier detector on ``X`` for use as a CV scorer.

    The estimator's -1/+1 predictions are mapped to 0 (anomaly) / 1 (normal)
    and compared, via F1, against a reference vector of all ones. No real
    labels are involved, so the score only rewards flagging fewer rows of
    ``X`` as anomalous.

    Args:
        estimator: An already fitted estimator exposing ``predict`` that
            returns -1 for anomalies and 1 for normal samples.
        X: Samples to score (the held-out fold during cross-validation).
        y: Ignored. Accepted so the function matches the
            ``scorer(estimator, X, y)`` callable signature.

    Returns:
        The F1 score of the binarised predictions against all-ones labels.
    """
    # Use predict(), not fit_predict(): refitting here would train the model
    # on the validation fold it is supposed to be evaluated on.
    y_pred = estimator.predict(X)
    predicted_normal = np.where(y_pred == -1, 0, 1)
    return f1_score(np.ones(len(predicted_normal)), predicted_normal)

# Hyper-parameter candidates explored when tuning the Isolation Forest
param_grid_if = dict(
    n_estimators=[50, 100, 200],
    max_samples=['auto', 0.8, 0.5],
    contamination=[0.05, 0.1, 0.2],
    max_features=[0.5, 0.75, 1.0],
)

# Dataset location and file naming scheme (node_node_<i>_dataset.csv)
directory = '/Users/zxgan/FYP_Kubernetes/Dataset/'
file_prefix, file_suffix = 'node_node_', '_dataset.csv'
num_files = 50

# train_test_split is used inside the loop below; import it once here instead
# of re-importing it on every iteration.
from sklearn.model_selection import train_test_split

# Initialize a summary list (one record per node dataset)
accuracy_summary = []

# Process each file: tune an Isolation Forest on the training split, evaluate
# it on the test split, pickle the tuned model and record the result.
for i in range(num_files):
    file_path = f"{directory}{file_prefix}{i}{file_suffix}"
    print(f"Processing file: {file_path}")

    # Load the dataset and keep only the numeric feature columns
    data = pd.read_csv(file_path)
    data.drop(columns=['timestamp', 'pod_status_Pending', 'pod_status_Running',
                       'pod_status_Succeeded', 'pod_status_Failed', 'pod_status_Unknown'], inplace=True)
    numeric_data = data.select_dtypes(include=[np.number])

    # Split the data (fixed seed keeps the partition reproducible)
    X_train, X_test = train_test_split(numeric_data, test_size=0.2, random_state=42)

    # Grid-search the Isolation Forest hyper-parameters.
    # `custom_scorer` already has the scorer signature (estimator, X), so it
    # is passed directly. Wrapping it in make_scorer() is wrong: make_scorer
    # expects a metric of the form (y_true, y_pred), and because the search is
    # fitted without y the wrapped scorer cannot be evaluated at all.
    # NOTE(review): the scorer compares against all-ones labels, so it favours
    # settings that flag fewer samples (likely the smallest `contamination`)
    # — confirm this is the intended selection criterion.
    base_model_if = IsolationForest(random_state=42)
    grid_search_if = GridSearchCV(
        estimator=base_model_if,
        param_grid=param_grid_if,
        cv=3,
        n_jobs=-1,
        scoring=custom_scorer
    )
    grid_search_if.fit(X_train)

    # Get the best parameters and model (already refitted on all of X_train)
    best_params_if = grid_search_if.best_params_
    best_if = grid_search_if.best_estimator_

    # Map -1/+1 predictions to 0 (anomaly) / 1 (normal). With an all-ones
    # reference vector, "accuracy" is the share of test rows predicted normal.
    y_test_pred_if = np.where(best_if.predict(X_test) == -1, 0, 1)
    accuracy_if = accuracy_score(np.ones(len(y_test_pred_if)), y_test_pred_if)

    # Save the Isolation Forest model (written to the current working directory)
    model_filename = f"Isolation_Forest_Model_Node_{i}.pkl"
    with open(model_filename, "wb") as f:
        pickle.dump(best_if, f)
    print(f"Model saved: {model_filename}")

    # Collect accuracy
    accuracy_summary.append({
        "Node": f"Node_{i}",
        "Best Parameters": best_params_if,
        "Accuracy": accuracy_if
    })

# Final summary of accuracy across all datasets
accuracy_df = pd.DataFrame(accuracy_summary)

# Display the accuracy summary.
# The parameter dicts are stringified for display only (accuracy_df itself is
# left untouched) so the table shows readable text, as the One-Class SVM
# summary does with str(best_params_svm).
summary_cells = [
    accuracy_df[col].astype(str) if col == "Best Parameters" else accuracy_df[col]
    for col in accuracy_df.columns
]
fig_summary = go.Figure(data=[go.Table(
    header=dict(values=list(accuracy_df.columns), fill_color='paleturquoise', align='left'),
    cells=dict(values=summary_cells, fill_color='lavender', align='left'))
])
fig_summary.update_layout(title="Isolation Forest Accuracy Summary Across Nodes", title_x=0.5)
fig_summary.show()

## One-Class SVM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, accuracy_score, f1_score, make_scorer
import pickle
import plotly.graph_objects as go

# Hyper-parameter candidates explored when tuning the One-Class SVM
param_grid_svm = dict(
    nu=[0.01, 0.05],
    kernel=['rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Custom scoring function
def custom_scorer(estimator, X, y=None):
    """Score a fitted outlier detector on ``X`` for use as a CV scorer.

    The estimator's -1/+1 predictions are mapped to 0 (anomaly) / 1 (normal)
    and compared, via F1, against a reference vector of all ones. No real
    labels are involved, so the score only rewards flagging fewer rows of
    ``X`` as anomalous.

    Args:
        estimator: An already fitted estimator exposing ``predict`` that
            returns -1 for anomalies and 1 for normal samples.
        X: Samples to score (the held-out fold during cross-validation).
        y: Ignored. Accepted so the function matches the
            ``scorer(estimator, X, y)`` callable signature.

    Returns:
        The F1 score of the binarised predictions against all-ones labels.
    """
    # Use predict(), not fit_predict(): refitting here would train the model
    # on the validation fold it is supposed to be evaluated on.
    y_pred = estimator.predict(X)
    predicted_normal = np.where(y_pred == -1, 0, 1)
    return f1_score(np.ones(len(predicted_normal)), predicted_normal)

# NOTE(review): X_train, X_test and numeric_data are not created in this cell;
# it uses whatever an earlier cell left in scope (in this notebook, the values
# from the final iteration of the Isolation Forest loop) — confirm intended.

# Perform grid search.
# `custom_scorer` already has the scorer signature (estimator, X), so it is
# passed directly. Wrapping it in make_scorer() is wrong: make_scorer expects
# a metric of the form (y_true, y_pred), and because the search is fitted
# without y the wrapped scorer cannot be evaluated at all.
grid_search_svm = GridSearchCV(
    estimator=OneClassSVM(),
    param_grid=param_grid_svm,
    cv=3,
    n_jobs=-1,
    scoring=custom_scorer
)
grid_search_svm.fit(X_train)

# Retrieve the best model and its parameters.
# No explicit refit is needed: with the default refit=True, GridSearchCV has
# already fitted best_estimator_ on the whole of X_train.
best_svm = grid_search_svm.best_estimator_
best_params_svm = grid_search_svm.best_params_
print("One-Class SVM - Best Parameters:", best_params_svm)

# Predict on the test set (-1 = anomaly, 1 = normal)
y_test_pred = best_svm.predict(X_test)

# Convert predictions to binary format (0 for anomaly, 1 for normal)
y_test_binary = np.where(y_test_pred == -1, 0, 1)

# The reference labels are all ones ("Normal"), so the metrics below only
# describe how many test rows the model flags, not agreement with real labels.
reference_labels = np.ones(len(y_test_binary))

# Calculate accuracy
accuracy_svm = accuracy_score(reference_labels, y_test_binary)
print("One-Class SVM - Accuracy:", accuracy_svm)

# Save the best model (written to the current working directory)
with open("best_one_class_svm.pkl", "wb") as f:
    pickle.dump(best_svm, f)

# Classification report
classification_rep = classification_report(
    reference_labels,
    y_test_binary,
    labels=[0, 1],
    target_names=["Anomaly", "Normal"],
    output_dict=True
)

# Extract precision, recall, and F1-score (one row per class / aggregate)
classification_results = pd.DataFrame(classification_rep).T

# Accuracy summary (metrics of the "Normal" class)
accuracy_summary = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-Score"],
    "Value": [
        accuracy_svm,
        classification_results.loc["Normal", "precision"],
        classification_results.loc["Normal", "recall"],
        classification_results.loc["Normal", "f1-score"]
    ]
})

# Display results in an interactive table using Plotly
fig_accuracy = go.Figure(data=[go.Table(
    header=dict(values=list(accuracy_summary.columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[accuracy_summary["Metric"], accuracy_summary["Value"]],
               fill_color='lavender',
               align='left'))
])

fig_accuracy.update_layout(title="One-Class SVM Accuracy Summary", title_x=0.5)
fig_accuracy.show()

# Model selection summary (parameters stringified so the table shows text)
model_selection = pd.DataFrame({
    "Model": ["One-Class SVM"],
    "Best Parameters": [str(best_params_svm)],
    "Accuracy": [accuracy_svm]
})

fig_model = go.Figure(data=[go.Table(
    header=dict(values=list(model_selection.columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[model_selection["Model"], model_selection["Best Parameters"], model_selection["Accuracy"]],
               fill_color='lavender',
               align='left'))
])

fig_model.update_layout(title="Model Selection Results", title_x=0.5)
fig_model.show()

# Count anomalies detected
num_anomalies = (y_test_binary == 0).sum()
print(f"Number of anomalies detected: {num_anomalies}")

# Scatter plot for anomalies visualization.
# The normal/anomaly partition of the test set does not depend on the feature
# being plotted, so it is computed once instead of once per feature.
normal_data = X_test[y_test_pred == 1]
anomaly_data = X_test[y_test_pred == -1]
normal_positions = list(range(len(normal_data)))
anomaly_positions = list(range(len(anomaly_data)))

for feature in numeric_data.columns:
    fig_scatter = go.Figure()

    # x is the running position within each group, not the original row index
    fig_scatter.add_trace(go.Scatter(
        x=normal_positions,
        y=normal_data[feature],
        mode='markers',
        marker=dict(color='blue'),
        name='Normal Data'
    ))

    fig_scatter.add_trace(go.Scatter(
        x=anomaly_positions,
        y=anomaly_data[feature],
        mode='markers',
        marker=dict(color='red'),
        name='Anomalies'
    ))

    fig_scatter.update_layout(
        title=f"Anomaly Detection Scatter Plot for {feature}",
        xaxis_title="Index",
        yaxis_title=feature,
        legend_title="Legend"
    )
    fig_scatter.show()

## Autoencoder

## Results

In [None]:
# Take the first five test rows, restricted to numeric columns
example_input = X_test.iloc[:5].select_dtypes(include=[np.number])
print("Example Input (Numeric Only):")
print(example_input)

# Run the same rows through each of the three models
isolation_forest_output = best_if.predict(example_input)
one_class_svm_output = best_svm.predict(example_input)
ae_predictions = best_autoencoder.predict(example_input)

# Autoencoder: squared reconstruction error per feature, then the mean
# across features as a single per-row loss
ae_feature_anomaly_scores = np.power(ae_predictions - example_input.values, 2)
ae_loss = np.mean(ae_feature_anomaly_scores, axis=1)

# Gather every model's output side by side
example_results = pd.DataFrame({
    "Index": example_input.index,
    "Isolation Forest": isolation_forest_output,
    "One-Class SVM": one_class_svm_output,
    "Autoencoder Loss": ae_loss
})

# Replace the numeric -1/+1 outputs with readable labels
label_names = {1: "Normal", -1: "Anomaly"}
for model_column in ("Isolation Forest", "One-Class SVM"):
    example_results[model_column] = example_results[model_column].replace(label_names)

# For each row, name the features whose squared error lies above that row's
# own 95th percentile (i.e. the top 5% of its per-feature scores)
example_results["Anomalous Features (AE)"] = [
    ", ".join(example_input.columns[np.where(row_scores > np.percentile(row_scores, 95))[0]])
    for row_scores in ae_feature_anomaly_scores
]

# Interactive table of the example inputs (columns shown as "Feature 1..N")
n_features = example_input.shape[1]
input_header = dict(
    values=[f"Feature {position}" for position in range(1, n_features + 1)],
    fill_color='paleturquoise',
    align='left',
)
input_cells = dict(
    values=[example_input[name].tolist() for name in example_input.columns],
    fill_color='lavender',
    align='left',
)
fig = go.Figure(data=[go.Table(header=input_header, cells=input_cells)])
fig.update_layout(title="Example Input Features", title_x=0.5)
fig.show()

# Interactive table of the model outputs and flagged features
result_columns = list(example_results.columns)
results_header = dict(values=result_columns, fill_color='paleturquoise', align='left')
results_cells = dict(
    values=[example_results[name] for name in result_columns],
    fill_color='lavender',
    align='left',
)
fig_results = go.Figure(data=[go.Table(header=results_header, cells=results_cells)])
fig_results.update_layout(title="Model Outputs and Anomalous Features", title_x=0.5)
fig_results.show()