<a href="https://colab.research.google.com/github/thegit-69/star-summit/blob/main/test_1_trad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab-only) so files under /content/drive, such as the
# dataset CSV read later in this notebook, become readable from the kernel.
from google.colab import drive
drive.mount('/content/drive')

In [16]:
#!/usr/bin/env python3
# Anomaly-Based Detection for UNSW_NB15 Dataset - Outputs 10 Accuracy Values

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import IsolationForest

def preprocess_data(df):
    """
    Split a UNSW_NB15 frame into features/target and build an unfitted preprocessor.

    The 'id' and 'attack_cat' columns are dropped when present. The
    'proto', 'service' and 'state' columns are routed to a one-hot encoder
    (unknown categories are ignored at transform time); every other
    non-'label' column is routed to a standard scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset containing a 'label' column.

    Returns
    -------
    tuple
        (X, y, preprocessor) where X is the feature frame without 'label',
        y is the 'label' column and preprocessor is an unfitted
        ColumnTransformer.
    """
    # Remove bookkeeping columns; silently skip any that are absent.
    frame = df.drop(columns=['id', 'attack_cat'], errors='ignore')

    # Features are everything except the target column.
    X = frame.drop(columns='label')
    y = frame['label']

    # Columns that get one-hot encoded; all remaining feature columns are scaled.
    onehot_columns = ['proto', 'service', 'state']
    scaled_columns = [name for name in X.columns if name not in onehot_columns]

    steps = [
        ('num', StandardScaler(), scaled_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), onehot_columns),
    ]
    preprocessor = ColumnTransformer(transformers=steps)

    return X, y, preprocessor

def run_anomaly_detection(
    random_state=42,
    data_path='/content/drive/MyDrive/data sets/UNSW_NB15_training-set.csv',
    contamination=0.1,
    df=None,
):
    """
    Train an Isolation Forest on normal traffic and score it on a held-out split.

    Parameters
    ----------
    random_state : int
        Seed used for both the train/test split and the Isolation Forest.
    data_path : str
        CSV file to load when ``df`` is not supplied.
    contamination : float
        Passed to ``IsolationForest``. The model is fitted on normal-only
        rows (label == 0), so this is NOT the attack proportion of the
        dataset: it sets the score threshold so that roughly this fraction
        of the normal training rows is flagged as outliers.
    df : pandas.DataFrame or None
        Already-loaded dataset. Supplying it avoids re-reading the CSV on
        every call; the frame is not modified.

    Returns
    -------
    tuple of float
        (accuracy, precision, recall, f1), each as a percentage.

    Raises
    ------
    ValueError
        If the training split contains no rows with label == 0.
    """
    # Load the dataset only when the caller did not provide one
    if df is None:
        df = pd.read_csv(data_path)

    # Preprocess data
    X, y, preprocessor = preprocess_data(df)

    # Split data into training and testing sets (70/30), keeping class balance
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )

    # Get normal data for training (label=0)
    X_train_normal = X_train[y_train == 0]
    if len(X_train_normal) == 0:
        raise ValueError(
            "No normal samples (label == 0) in the training split; "
            "cannot fit the anomaly detector."
        )

    # Fit the preprocessor on normal training data only, so scaling statistics
    # and known categories come from normal traffic
    X_train_normal_prep = preprocessor.fit_transform(X_train_normal)

    # Initialize and train anomaly detection model (Isolation Forest).
    # `contamination` fixes the decision threshold relative to the normal-only
    # training scores (see docstring); it does not describe the attack rate.
    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(X_train_normal_prep)

    # Preprocess test data with the statistics learned from normal traffic
    X_test_prep = preprocessor.transform(X_test)

    # Predict anomalies (1 for inliers, -1 for outliers)
    y_pred_raw = model.predict(X_test_prep)

    # Convert predictions to match our labels (0 for normal, 1 for attack)
    y_pred = np.where(y_pred_raw == 1, 0, 1)

    # Calculate metrics as percentages
    accuracy = accuracy_score(y_test, y_pred) * 100
    precision = precision_score(y_test, y_pred) * 100
    recall = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100

    return accuracy, precision, recall, f1

def main():
    """
    Run the anomaly-detection experiment once per seed and print a results table.

    For each of ten fixed seeds this calls run_anomaly_detection, prints one
    row of metrics, then prints the column-wise averages and finally the list
    of accuracy values. Nothing is returned.
    """
    heavy_rule = "="*60
    light_rule = "-"*60

    print("UNSW_NB15 Dataset - Anomaly-Based Detection Results")
    print(heavy_rule)
    print("Running 10 detection iterations with different random seeds...")

    # One run per seed gives ten different splits/models
    random_seeds = [42, 123, 456, 789, 101, 202, 303, 404, 505, 606]

    print("\nAccuracy values (%):")
    print(light_rule)
    print("   Run   |  Accuracy  |  Precision  |   Recall   |   F1-Score")
    print(light_rule)

    results = []
    for i, seed in enumerate(random_seeds, 1):
        row = run_anomaly_detection(random_state=seed)
        accuracy, precision, recall, f1 = row
        results.append(row)
        print(f"    {i:02d}    |   {accuracy:.2f}%   |   {precision:.2f}%    |   {recall:.2f}%   |   {f1:.2f}%")

    # Column-wise mean over all runs
    averages = np.mean(results, axis=0)
    avg_acc, avg_prec, avg_rec, avg_f1 = averages

    print(light_rule)
    print(f"  Average |   {avg_acc:.2f}%   |   {avg_prec:.2f}%    |   {avg_rec:.2f}%   |   {avg_f1:.2f}%")
    print(heavy_rule)

    # Print just the accuracy values as a plain list
    accuracies = [row[0] for row in results]
    print("\nList of 10 accuracy values (%):")
    print(accuracies)

# Entry point: run the full experiment when this cell/script is executed
# directly (skipped if the code is imported as a module).
if __name__ == "__main__":
    main()

UNSW_NB15 Dataset - Anomaly-Based Detection Results
Running 10 detection iterations with different random seeds...

Accuracy values (%):
------------------------------------------------------------
   Run   |  Accuracy  |  Precision  |   Recall   |   F1-Score
------------------------------------------------------------
    01    |   64.09%   |   83.39%    |   43.43%   |   57.12%
    02    |   65.64%   |   85.78%    |   45.06%   |   59.08%
    03    |   60.22%   |   81.00%    |   36.26%   |   50.09%
    04    |   65.06%   |   84.74%    |   44.57%   |   58.42%
    05    |   66.92%   |   84.95%    |   48.51%   |   61.76%
    06    |   65.06%   |   85.19%    |   44.24%   |   58.24%
    07    |   64.57%   |   84.45%    |   43.71%   |   57.61%
    08    |   67.45%   |   85.47%    |   49.26%   |   62.50%
    09    |   66.93%   |   85.63%    |   47.99%   |   61.51%
    10    |   67.37%   |   86.28%    |   48.45%   |   62.05%
------------------------------------------------------------
  Averag