In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
)
from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

In [2]:
# Load dataset
df = pd.read_csv("cleaned_data2_iotid23.csv")  # Replace with actual dataset
X = df.drop(columns=["Label"])
y = df["Label"]


In [3]:
# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)


In [4]:
import numpy as np
import pandas as pd

# Check for infinite values
print("Number of infinite values in X:", np.isinf(X).sum().sum())

# Replace infinite values with NaN (if any)
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Check for NaN values
print("Number of NaN values in X:", X.isna().sum().sum())

# Fill or drop NaN values
X.fillna(X.median(), inplace=True)  # Replace NaN with median values


Number of infinite values in X: 1830
Number of NaN values in X: 1830


In [5]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [6]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 5, n_samples = 5

In [None]:





# Autoencoder Model with Weighted Loss
input_dim = X_train.shape[1]
encoding_dim = 16

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation="relu")(input_layer)
decoded = Dense(input_dim, activation="sigmoid")(encoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)


# Weighted Loss Function
def custom_loss(y_true, y_pred):
    weights = np.where(y_true == 1, 10, 1)  # Give more weight to minority class
    return keras.losses.mean_squared_error(y_true, y_pred) * weights


autoencoder.compile(optimizer="adam", loss=custom_loss)
autoencoder.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=256,
    shuffle=True,
    validation_data=(X_test, X_test),
)

# Feature extraction
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

# Calculate class weights for XGBoost
unique_classes, class_counts = np.unique(y_train, return_counts=True)
total_samples = len(y_train)
class_weights = {
    cls: total_samples / (len(unique_classes) * count)
    for cls, count in zip(unique_classes, class_counts)
}

# XGBoost Model with Class Weights
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(unique_classes),
    eval_metric="mlogloss",
    scale_pos_weight=[
        class_weights[cls] for cls in unique_classes
    ],  # Apply class weights
)
xgb_model.fit(X_train_encoded, y_train)

# Predictions
y_pred = xgb_model.predict(X_test_encoded)
y_pred_prob = xgb_model.predict_proba(X_test_encoded)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
plt.figure(figsize=(10, 7))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# ROC Curve
plt.figure(figsize=(8, 6))
for i in range(len(unique_classes)):
    fpr, tpr, _ = roc_curve(y_test == i, y_pred_prob[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {i} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

# Precision-Recall Curve
plt.figure(figsize=(8, 6))
for i in range(len(unique_classes)):
    precision, recall, _ = precision_recall_curve(y_test == i, y_pred_prob[:, i])
    plt.plot(recall, precision, label=f"Class {i}")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()

# Save Models
encoder.save("autoencoder_encoder.h5")
xgb_model.save_model("xgboost_model.json")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")

print("Models saved successfully.")
