<a href="https://colab.research.google.com/github/thegit-69/star-summit/blob/main/test_2_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#!/usr/bin/env python3
# SVM Classification for UNSW_NB15 Dataset

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import time
import warnings
warnings.filterwarnings('ignore')

def preprocess_data(df):
    """
    Separate features from the target and describe how to transform them.

    The returned transformer is only configured here, not fitted: it
    standard-scales the numerical columns and one-hot encodes the
    categorical ones, so the caller decides which rows it is fitted on.

    Args:
        df: DataFrame holding a 'label' column, the categorical columns
            'proto', 'service' and 'state', and any number of other
            feature columns. 'id' and 'attack_cat' are removed if present.

    Returns:
        A tuple (X, y, preprocessor) where X is the feature DataFrame,
        y is the 'label' Series and preprocessor is an unfitted
        ColumnTransformer.

    Raises:
        KeyError: If df has no 'label' column.
    """
    # 'id' is not useful for classification and 'attack_cat' is left out
    # because the binary 'label' column is the target; either may be absent.
    frame = df.drop(columns=['id', 'attack_cat'], errors='ignore')

    categorical_features = ['proto', 'service', 'state']

    # Every remaining column that is neither categorical nor the target is
    # treated as numerical, in the order it appears in the frame.
    not_numerical = (*categorical_features, 'label')
    numerical_features = [
        name for name in frame.columns if name not in not_numerical
    ]

    scaling_step = ('num', StandardScaler(), numerical_features)
    # handle_unknown='ignore' lets categories unseen during fitting pass
    # through transform instead of raising.
    encoding_step = (
        'cat',
        OneHotEncoder(handle_unknown='ignore'),
        categorical_features,
    )
    preprocessor = ColumnTransformer(
        transformers=[scaling_step, encoding_step],
        remainder='drop',  # anything not listed above is discarded
    )

    features = frame.drop(columns='label')
    target = frame['label']
    return features, target, preprocessor

def run_svm_classification(random_state=42, test_size=0.3, sample_fraction=0.1,
                           data_path=None):
    """
    Train and evaluate an RBF-kernel SVM on a sample of the dataset.

    The CSV is loaded, optionally down-sampled, split into stratified
    train/test sets, and fed through a pipeline that fits the
    preprocessing and the classifier on the training split only.
    Progress, accuracy and elapsed time are printed along the way.

    Args:
        random_state: Seed used for row sampling, the train/test split
            and the SVC, so one value reproduces a whole run.
        test_size: Fraction of the sampled rows held out for testing.
        sample_fraction: Fraction of the CSV rows to use. Values of 1.0
            or more use every row; values <= 0 are rejected.
        data_path: Location of the training CSV. Defaults to the
            Google Drive path this notebook was written against.

    Returns:
        Test-set accuracy as a percentage (0-100).

    Raises:
        ValueError: If sample_fraction is not positive.
    """
    # Fail early with a clear message rather than deep inside pandas or
    # train_test_split, which is where an empty sample used to blow up.
    if sample_fraction <= 0:
        raise ValueError(
            f"sample_fraction must be positive, got {sample_fraction!r}"
        )

    if data_path is None:
        data_path = '/content/drive/MyDrive/data sets/UNSW_NB15_training-set.csv'

    print(f"Run with random_state={random_state}:")
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # Load the dataset
    print("  Loading dataset...")
    df = pd.read_csv(data_path)

    # Take a sample if requested (to speed up processing)
    if sample_fraction < 1.0:
        df = df.sample(frac=sample_fraction, random_state=random_state)
        print(f"  Using {len(df)} samples ({sample_fraction*100:.1f}% of the dataset)")

    # Preprocess data
    print("  Preprocessing data...")
    X, y, preprocessor = preprocess_data(df)

    # Split data; stratify=y keeps the label proportions in both splits
    print("  Splitting data into train/test sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Build pipeline with preprocessing and SVM. Keeping the preprocessor
    # inside the pipeline means it is fitted on X_train only.
    print("  Building and training SVM model...")
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', SVC(kernel='rbf', C=1.0, gamma='scale', random_state=random_state))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    print("  Making predictions...")
    y_pred = pipeline.predict(X_test)

    # Calculate accuracy as a percentage
    accuracy = accuracy_score(y_test, y_pred) * 100

    elapsed_time = time.perf_counter() - start_time
    print(f"  Accuracy: {accuracy:.2f}%")
    print(f"  Time taken: {elapsed_time:.2f} seconds")

    return accuracy

def main():
    """
    Run the SVM experiment once per seed and print a summary report.

    Each of the ten fixed seeds drives one call to
    run_svm_classification on a 10% sample with a 30% test split. The
    per-run accuracies are then printed as a table, followed by their
    mean, standard deviation, minimum and maximum, and finally as a
    plain rounded list.
    """
    heavy_rule = "=" * 50
    light_rule = "-" * 50

    print("UNSW_NB15 Dataset - SVM Classification")
    print(heavy_rule)

    # One seed per run; varying it changes the sample, the split and the SVC.
    seeds = (42, 123, 456, 789, 101, 202, 303, 404, 505, 606)

    # 10% of the rows keeps each run fast; raise it if memory/time allow.
    fraction = 0.1

    scores = []
    for current_seed in seeds:
        scores.append(
            run_svm_classification(
                random_state=current_seed,
                test_size=0.3,
                sample_fraction=fraction,
            )
        )
        print()  # blank line between runs

    print(heavy_rule)
    print("Summary of SVM Classification Results:")
    print(light_rule)
    print("Run | Accuracy (%)")
    print(light_rule)

    for run_number, score in enumerate(scores, start=1):
        print(f"{run_number:3d} | {score:.2f}%")

    print(light_rule)
    summary = (
        ("Average accuracy", np.mean(scores)),
        ("Standard deviation", np.std(scores)),
        ("Min accuracy", min(scores)),
        ("Max accuracy", max(scores)),
    )
    for caption, value in summary:
        print(f"{caption}: {value:.2f}%")

    # Bare list of the rounded values, convenient for copy/paste.
    print("\nList of 10 accuracy values:")
    print([round(score, 2) for score in scores])

# Run the experiment only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

UNSW_NB15 Dataset - SVM Classification
Run with random_state=42:
  Loading dataset...
  Using 8233 samples (10.0% of the dataset)
  Preprocessing data...
  Splitting data into train/test sets...
  Building and training SVM model...
  Making predictions...
  Accuracy: 91.13%
  Time taken: 1.54 seconds

Run with random_state=123:
  Loading dataset...
  Using 8233 samples (10.0% of the dataset)
  Preprocessing data...
  Splitting data into train/test sets...
  Building and training SVM model...
  Making predictions...
  Accuracy: 91.30%
  Time taken: 1.43 seconds

Run with random_state=456:
  Loading dataset...
  Using 8233 samples (10.0% of the dataset)
  Preprocessing data...
  Splitting data into train/test sets...
  Building and training SVM model...
  Making predictions...
  Accuracy: 91.42%
  Time taken: 1.42 seconds

Run with random_state=789:
  Loading dataset...
  Using 8233 samples (10.0% of the dataset)
  Preprocessing data...
  Splitting data into train/test sets...
  Building