In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, precision_recall_curve, auc, average_precision_score,)
from sklearn.neighbors import NearestNeighbors
from imblearn.over_sampling import SMOTE

In [None]:
# Step 1: Load dataset
data = pd.read_csv("results/cicids2017_cleaned_numericable.csv")
masv_df = pd.read_csv("results/xgbmasv.csv")  

In [None]:
# Change infinity and NaN with median
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(data.median(), inplace=True)

In [None]:
# Step 2: Split feature and label
X = data.drop(columns=['Label']).values  # Fitur
y = data['Label'].values  # Label

In [None]:
# Normalize MASV
masv_values = masv_df['masv'].values  # Retrieve MASV values
masv_normalized = (masv_values - np.min(masv_values)) / (np.max(masv_values) - np.min(masv_values))

# Ensure the normalization index matches the feature names in `masv_df`
masv_series = pd.Series(masv_normalized, index=masv_df['feature'])

In [None]:
# Add the normalized column to the DataFrame
masv_df['masv_normalized'] = masv_normalized

# Save the DataFrame to an Excel file
output_file = "results/xgbmasv_normalized.xlsx"
masv_df.to_excel(output_file, index=False, engine='openpyxl')

print(f"Data with normalized MASV has been saved to the Excel file: {output_file}")

Data dengan MASV Normalized telah disimpan ke file Excel: XGBmasv_normalized_results.xlsx


In [15]:
# Step 2: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Step 3: Feature selection based on MASV
def select_top_features(X, masv_df, percentile=20):
    start_time = time.time()
    
    # Sort MASV values and select the top features based on the given percentile
    masv_df_sorted = masv_df.sort_values(by='masv', ascending=False)
    num_features_to_select = int(len(masv_df) * (percentile / 100))
    selected_features = masv_df_sorted['feature'].head(num_features_to_select).values
    
    print(f"Selected features based on top {percentile}% MASV:")
    print(list(selected_features))
    
    # Ensure X is a DataFrame. If not, convert it and assign column names according to masv_df
    if isinstance(X, np.ndarray):
        X_df = pd.DataFrame(X, columns=masv_df['feature'])
    else:
        X_df = X
    
    duration = time.time() - start_time
    print(f"\n⏱️ Feature selection process time: {duration:.4f} seconds")
    
    # Return the selected feature data and list of selected feature names
    return X_df[selected_features], list(selected_features)

# Step 5: Apply feature selection
if isinstance(X_train, np.ndarray):
    X_train = pd.DataFrame(X_train, columns=masv_df['feature'])

X_train_selected, selected_features = select_top_features(X_train, masv_df, percentile=20)

# If `X_test` is a NumPy array, convert it to a DataFrame
if isinstance(X_test, np.ndarray):
    X_test = pd.DataFrame(X_test, columns=masv_df['feature'])

# Apply the same feature selection to `X_test`
X_test_selected = X_test[selected_features]

# Get the integer indices of the selected features
selected_indices = [X_train.columns.get_loc(feature) for feature in selected_features]

# Use the integer indices to select the corresponding MASV weights
masv_weights_selected = masv_normalized[selected_indices]


Selected features based on top 20% MASV:
['Destination Port', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'Bwd Packets/s', 'min_seg_size_forward', 'Packet Length Mean', 'Flow IAT Min', 'Fwd IAT Min', 'Flow Bytes/s', 'Total Length of Fwd Packets', 'Bwd Packet Length Std', 'Average Packet Size', 'Flow IAT Max', 'Bwd Packet Length Mean', 'Fwd Header Length']

⏱️ Waktu proses seleksi fitur: 0.0157 detik


In [None]:
# 4. Function for Weighted SMOTE with Minority Class Adjustment
def weighted_smote_multiclass(X, y, masv_weights, k_neighbors=1, target_sample_size=10000):
    """
    Weighted SMOTE for multiclass datasets based on MASV values.
    
    Parameters:
    X: Original feature data
    y: Class labels
    masv_weights: Feature weights derived from MASV values
    k_neighbors: Number of nearest neighbors for interpolation
    target_sample_size: Desired total number of samples for each class
    
    Returns:
    X_resampled: Feature data after SMOTE
    y_resampled: Label data after SMOTE
    """
    start_time = time.time()  # Start timer
    unique_classes, class_counts = np.unique(y, return_counts=True)
    X_synthetic = []
    y_synthetic = []
    
    for class_label, class_count in zip(unique_classes, class_counts):
        # Calculate how many samples need to be generated for this class
        n_samples_to_generate = target_sample_size - class_count
        if n_samples_to_generate <= 0:
            continue  # Skip if the class already meets the target

        # Extract data for the current class
        X_class = X[y == class_label]

        # Define nearest neighbors for interpolation
        neigh = NearestNeighbors(n_neighbors=k_neighbors)
        neigh.fit(X_class)

        for i in range(n_samples_to_generate):
            # Randomly select a minority sample and find its nearest neighbor
            index = np.random.randint(0, len(X_class))
            x_sample = X_class[index]
            neighbors = neigh.kneighbors([x_sample], return_distance=False)
            neighbor_index = np.random.choice(neighbors[0])
            x_neighbor = X_class[neighbor_index]

            # Weighted interpolation
            lambda_ = np.random.random()
            x_synthetic = x_sample + lambda_ * (x_neighbor - x_sample) * masv_weights
            X_synthetic.append(x_synthetic)
            y_synthetic.append(class_label)

    if len(X_synthetic) == 0:
        print("No synthetic samples were generated.")
        return X, y

    # Combine original and synthetic data
    X_resampled = np.vstack([X, np.array(X_synthetic)])
    y_resampled = np.hstack([y, np.array(y_synthetic)])
    
    duration = time.time() - start_time
    print(f"\n⏱️ MASV-weighted SMOTE process time: {duration:.2f} seconds")
    
    return X_resampled, y_resampled


# Display class distribution before balancing
print("Class distribution before balancing (training set):")
unique, counts = np.unique(y_train, return_counts=True)
class_distribution_before = dict(zip(unique, counts))
for label, count in class_distribution_before.items():
    print(f"Class {label}: {count} samples")

# Execute multiclass SMOTE
X_train_balanced, y_train_balanced = weighted_smote_multiclass(
    X_train_selected.values,
    y_train,
    masv_weights_selected,
    k_neighbors=1,
    target_sample_size=10000
)

# Display class distribution after balancing
print("\nClass distribution after balancing:")
unique_balanced, counts_balanced = np.unique(y_train_balanced, return_counts=True)
class_distribution_after = dict(zip(unique_balanced, counts_balanced))
for label, count in class_distribution_after.items():
    print(f"Class {label}: {count} samples")


Distribusi data sebelum balancing (training set):
Kelas 0: 795246 sampel
Kelas 1: 692 sampel
Kelas 2: 4797 sampel
Kelas 3: 132776 sampel
Kelas 4: 13 sampel
Kelas 5: 55495 sampel
Kelas 6: 745 sampel

⏱️ Waktu proses MASV-weighted SMOTE: 8.80 detik

Distribusi data setelah balancing:
Kelas 0: 795246 sampel
Kelas 1: 10000 sampel
Kelas 2: 10000 sampel
Kelas 3: 132776 sampel
Kelas 4: 10000 sampel
Kelas 5: 55495 sampel
Kelas 6: 10000 sampel


In [None]:
# Step 5: Modeling and Evaluation
def evaluate_model(X_train, y_train, X_test, y_test, num_classes):
    """
    Train and evaluate an XGBoost model for multiclass classification with global (micro-averaged) PR-AUC.
    """
    # Create the XGBoost model
    model = XGBClassifier(
        objective='multi:softprob',   # For multiclass classification
        num_class=num_classes,        # Number of classes
        eval_metric='mlogloss',       # Evaluation metric: log-loss
        use_label_encoder=False,      # Avoid XGBoost warning
        random_state=42
    )
    
    # Train the model
    start_train_time = time.time()
    model.fit(X_train, y_train)
    end_train_time = time.time()
    train_duration = end_train_time - start_train_time

    # Make predictions on the test set
    start_predict_time = time.time()
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    end_predict_time = time.time()
    predict_duration = end_predict_time - start_predict_time
    
    # Compute general performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)

    # One-hot encode y_test for PR-AUC computation
    y_test_one_hot = np.zeros((y_test.size, num_classes))
    y_test_one_hot[np.arange(y_test.size), y_test] = 1

    # Precision-Recall curve for micro-averaged PR-AUC
    precision_micro, recall_micro, _ = precision_recall_curve(
        y_test_one_hot.ravel(), y_pred_proba.ravel()
    )
    pr_auc_micro = auc(recall_micro, precision_micro)

    # Compute PR-AUC per class for reference
    pr_auc_per_class = []
    for i in range(num_classes):
        precision_class, recall_class, _ = precision_recall_curve(y_test_one_hot[:, i], y_pred_proba[:, i])
        pr_auc_per_class.append(auc(recall_class, precision_class))
    
    # Print evaluation results
    print("\nModel Performance Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision (weighted): {precision:.4f}")
    print(f"Recall (weighted): {recall:.4f}")
    print(f"F1-score (weighted): {f1:.4f}")
    print("Confusion Matrix:\n", conf_matrix)
    print(f"\nPR-AUC (Micro-Averaged): {pr_auc_micro:.4f}")
    print("\nPR-AUC per Class:")
    for i, auc_val in enumerate(pr_auc_per_class):
        print(f"  Class {i}: {auc_val:.4f}")
    print()

    # Print execution time
    print("\nExecution Time:")
    print(f"  Training Time: {train_duration:.2f} seconds")
    print(f"  Prediction Time: {predict_duration:.2f} seconds")
    print()
    

# Evaluate the model before and after balancing
print("\nModel Evaluation Before Balancing:")
evaluate_model(X_train_selected, y_train, X_test_selected, y_test, num_classes=len(np.unique(y_train)))

print("\nModel Evaluation After Balancing:")
evaluate_model(X_train_balanced, y_train_balanced, X_test_selected, y_test, num_classes=len(np.unique(y_train_balanced)))


Evaluasi Model Sebelum Balancing:


  _warn_prf(average, modifier, msg_start, len(result))



Evaluasi Kinerja Model:
Akurasi: 0.9992
Precision (weighted): 0.9991
Recall (weighted): 0.9992
F1-score (weighted): 0.9991
Confusion Matrix:
 [[340180     16      0     62      0    162      0]
 [    63    221      0      0      0      0      0]
 [     1      0   2117      0      0      0      0]
 [    27      0      0  57069      0      3      0]
 [     5      0      0      0      0      0      0]
 [     3      0      0      3      0  23907      1]
 [     8      0      1      1      0      0    335]]

PR-AUC (Micro-Averaged): 1.0000

PR-AUC per kelas:
  Kelas 0: 1.0000
  Kelas 1: 0.9517
  Kelas 2: 1.0000
  Kelas 3: 1.0000
  Kelas 4: 0.2136
  Kelas 5: 0.9990
  Kelas 6: 0.9942


Waktu Eksekusi:
  Waktu Pelatihan: 19.44 detik
  Waktu Prediksi: 1.72 detik


Evaluasi Model Setelah Balancing:

Evaluasi Kinerja Model:
Akurasi: 0.9991
Precision (weighted): 0.9992
Recall (weighted): 0.9991
F1-score (weighted): 0.9991
Confusion Matrix:
 [[340078    117      0     63      0    162      0]
 [   

In [None]:
def evaluate_model_per_class(X_train, y_train, X_test, y_test, class_labels):
    """
    Evaluate a multiclass XGBoost model performance per class.
    """
    # Train the XGBoost model
    model = XGBClassifier(
        objective="multi:softprob",   # Multiclass probability output
        num_class=len(class_labels),  # Number of classes
        use_label_encoder=False,      # Avoid deprecation warning
        eval_metric="mlogloss",       # Default evaluation metric for multiclass
        random_state=42,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)  # Probabilities for PR-AUC calculation

    # Overall Accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"\nOverall Accuracy: {overall_accuracy:.4f}")

    print("\nModel Performance Evaluation Per Class:")

    # Iterate through each class
    for cls in class_labels:
        # Binarize labels for the current class
        y_test_binary = (y_test == cls).astype(int)
        y_pred_binary = (y_pred == cls).astype(int)

        # Accuracy per class
        accuracy_class = accuracy_score(y_test_binary, y_pred_binary)

        # Precision, Recall, and F1-score
        precision = precision_score(y_test_binary, y_pred_binary, zero_division=0)
        recall = recall_score(y_test_binary, y_pred_binary, zero_division=0)
        f1 = f1_score(y_test_binary, y_pred_binary, zero_division=0)

        # Confusion Matrix for the current class
        conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
        tn, fp, fn, tp = conf_matrix.ravel()

        # False Positive Rate (FPR) and False Negative Rate (FNR)
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

        # Precision-Recall Curve and PR-AUC
        precision_curve, recall_curve, _ = precision_recall_curve(y_test_binary, y_pred_proba[:, cls])
        pr_auc = auc(recall_curve, precision_curve)

        # Print metrics for the current class
        print(f"\nClass {cls}:")
        print(f"  Accuracy: {accuracy_class:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1-score: {f1:.4f}")
        print(f"  PR-AUC: {pr_auc:.4f}")
        print(f"  FNR (False Negative Rate): {fnr:.4f}")
        print(f"  FPR (False Positive Rate): {fpr:.4f}")
        print(f"  Confusion Matrix:\n{conf_matrix}")


# Example usage
class_labels = sorted(np.unique(y_test))  # Ensure consistent order of class labels

print("\nModel Evaluation Before Balancing (Per Class):")
evaluate_model_per_class(X_train_selected, y_train, X_test_selected, y_test, class_labels)

print("\nModel Evaluation After Balancing (Per Class):")
evaluate_model_per_class(X_train_balanced, y_train_balanced, X_test_selected, y_test, class_labels)



Evaluasi Model Sebelum Balancing Per Kelas:

Overall Accuracy: 0.9992

Evaluasi Kinerja Model Per Kelas:

Class 0:
  Accuracy: 0.9992
  Precision: 0.9997
  Recall: 0.9993
  F1-score: 0.9995
  PR-AUC: 1.0000
  FNR (False Negative Rate): 0.0007
  FPR (False Positive Rate): 0.0012
  Confusion Matrix:
[[ 83665    100]
 [   236 340184]]

Class 1:
  Accuracy: 0.9998
  Precision: 0.9109
  Recall: 0.7923
  F1-score: 0.8475
  PR-AUC: 0.9509
  FNR (False Negative Rate): 0.2077
  FPR (False Positive Rate): 0.0001
  Confusion Matrix:
[[423879     22]
 [    59    225]]

Class 2:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 0.9991
  F1-score: 0.9995
  PR-AUC: 1.0000
  FNR (False Negative Rate): 0.0009
  FPR (False Positive Rate): 0.0000
  Confusion Matrix:
[[422067      0]
 [     2   2116]]

Class 3:
  Accuracy: 0.9998
  Precision: 0.9990
  Recall: 0.9995
  F1-score: 0.9993
  PR-AUC: 1.0000
  FNR (False Negative Rate): 0.0005
  FPR (False Positive Rate): 0.0002
  Confusion Matrix:
[[367028     