In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_curve, auc

# Silence Python warnings so they do not clutter the cell outputs.
# NOTE(review): no category, module or message is given, so this is a blanket
# filter that hides every warning raised after this cell runs, including any
# emitted by the libraries during training. Consider narrowing it to the
# specific warning categories that are known to be noise.
import warnings

warnings.filterwarnings("ignore")

In [None]:
# 🔹 Step 1: Load Data
# Read the whole CSV into one frame, then separate the "Label" column (the
# prediction target) from all remaining columns (the model inputs).
df = pd.read_csv("IotFinalDataset.csv")
y = df["Label"]  # Target
X = df.drop("Label", axis=1)  # Features: every column except the target

In [None]:
# 🔹 Step 2: Encode Class Labels
# Learn the set of distinct labels first, then map each label to its integer
# code. The fitted encoder is kept so codes can be translated back to the
# original label values later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# 🔹 Step 3: Train-Test Split
# Hold out 20% of the rows for evaluation. Stratifying on the encoded labels
# keeps the class mix of both partitions close to that of the full dataset,
# and the fixed seed makes the split repeatable.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_kwargs)

In [None]:
# 🔹 Step 4: Scale Features
# The scaling statistics are learned from the training split only; the same
# fitted scaler is then applied to both splits so that nothing from the test
# split influences the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 🔹 Step 5: One-Hot Encode y_train and y_test
# The encoder expects a 2-D input, so each 1-D vector of integer class codes is
# viewed as a single column. The result is a dense indicator matrix with one
# column per class code seen in the training targets.
encoder = OneHotEncoder(sparse_output=False).fit(y_train[:, np.newaxis])
y_train_bin = encoder.transform(y_train[:, np.newaxis])
y_test_bin = encoder.transform(y_test[:, np.newaxis])

In [None]:
# 🔹 Step 6: Train Model with `CalibratedClassifierCV`
# Base learner: a linear classifier trained with SGD on the logistic loss,
# seeded so that training is repeatable.
sgd = SGDClassifier(random_state=42, loss="log_loss")

# Wrap the base learner so that its probability estimates are calibrated using
# 5-fold cross-validation on the training data.
sgd_calibrated = CalibratedClassifierCV(sgd, cv=5)

# Fit on the standardized training features and the integer-encoded labels.
sgd_calibrated.fit(X_train_scaled, y_train)

In [None]:
# 🔹 Step 7: Predict and Evaluate
# Predict class codes for the held-out split and report the fraction that
# match the true encoded labels.
y_pred = sgd_calibrated.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

In [None]:
# 🔹 Step 8: Predict Probabilities for Precision-Recall & ROC Curves
# Class-membership probabilities for every test row (one column per class).
y_prob = sgd_calibrated.predict_proba(X_test_scaled)

# Each class is evaluated one-vs-rest: column i of the one-hot test targets is
# the binary ground truth and column i of the probabilities is the score.
for i, class_name in enumerate(label_encoder.classes_):
    class_truth = y_test_bin[:, i]
    class_score = y_prob[:, i]

    precision, recall, _ = precision_recall_curve(class_truth, class_score)
    fpr, tpr, _ = roc_curve(class_truth, class_score)
    roc_auc = auc(fpr, tpr)  # Area under the ROC curve for this class

    # 📊 Precision-Recall Curve
    pr_fig, pr_ax = plt.subplots()
    pr_ax.plot(recall, precision, marker=".", label=f"Class {class_name}")
    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title(f"Precision-Recall Curve ({class_name})")
    pr_ax.legend()
    plt.show()

    # 📊 ROC Curve
    roc_fig, roc_ax = plt.subplots()
    roc_ax.plot(
        fpr, tpr, marker=".", label=f"Class {class_name} (AUC = {roc_auc:.2f})"
    )
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title(f"ROC Curve ({class_name})")
    roc_ax.legend()
    plt.show()

In [None]:
# 🔹 Step 9: Save Model, Scaler, and Label Encoder
# Everything needed to score new data later: the fitted scaler for the
# features, the calibrated model, and the encoder that maps predicted codes
# back to label values. Files are written in the order listed.
artifacts = {
    "sgd_model.pkl": sgd_calibrated,
    "scaler.pkl": scaler,
    "label_encoder.pkl": label_encoder,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("Model, scaler, and label encoder saved successfully!")

In [None]:
# 🔹 Step 10: Feature Importance Visualization
# CalibratedClassifierCV has no `base_estimator_` attribute. With cv=5 it fits
# one SGD model per fold and stores each (model, calibrator) pair in
# `calibrated_classifiers_`; the fitted SGD model of a pair is its `.estimator`
# (scikit-learn >= 1.2, which this notebook already requires for
# `OneHotEncoder(sparse_output=...)`).
#
# Importance of a feature = mean absolute coefficient, averaged first over the
# class rows of each fold's `coef_` and then over the folds. The models were
# trained on standardized features, so magnitudes are comparable across features.
fold_importance = [
    np.abs(calibrated.estimator.coef_).mean(axis=0)
    for calibrated in sgd_calibrated.calibrated_classifiers_
]
feature_importance = np.mean(fold_importance, axis=0)  # One value per feature
feature_names = X.columns  # Get feature names

# Sort by importance (largest first)
sorted_idx = np.argsort(feature_importance)[::-1]
sorted_features = feature_names[sorted_idx]
sorted_importance = feature_importance[sorted_idx]

# 📊 Plot Feature Importance
plt.figure(figsize=(12, 6))
plt.barh(
    sorted_features[:10], sorted_importance[:10], color="skyblue"
)  # Top 10 features
plt.xlabel("Feature Importance (Absolute Coefficients)")
plt.ylabel("Feature Names")
plt.title("Top 10 Important Features (SGD Classifier)")
plt.gca().invert_yaxis()  # barh draws bottom-up; flip so the top-ranked bar is first
plt.show()
