Import All Necessary Libraries and Dependencies

In [None]:
#Dependencies
!pip install kagglehub[pandas-datasets] pandas scikit-learn matplotlib numpy seaborn

In [None]:
#Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

Load Dataset

In [None]:
# Requires kagglehub with its pandas extra:
#   pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file to read from inside the Kaggle dataset
file_path = "IoT_Intrusion.csv"

# Positional arguments for the loader: adapter, dataset handle, file within the dataset.
# Further options (e.g. sql_query or pandas_kwargs) are described in the kagglehub
# documentation:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
loader_args = (
    KaggleDatasetAdapter.PANDAS,
    "subhajournal/iotintrusion",
    file_path,
)

# Load the file into a pandas DataFrame
df = kagglehub.load_dataset(*loader_args)

# Preview the first rows of the loaded data
df.head()

In [None]:
# Inspect the schema: print the column labels, then the (rows, columns) shape
column_labels = df.columns[:]
frame_shape = df.shape
print(column_labels)
print(frame_shape)

Data Preprocessing

In [None]:
# Column the model will try to predict
target_column = 'label'

# Feature columns to leave out of training (currently none)
excluded_columns = []

# Drop rows with a missing target value. The name is rebound instead of using
# inplace=True so the originally loaded frame object is not mutated.
df = df.dropna(subset=[target_column])

# Separate features (X) and target (y)
X = df.drop(columns=[target_column] + excluded_columns, errors='ignore')
y = df[target_column]

# Identify categorical feature columns for encoding. Besides object and category
# dtypes this also accepts pandas' dedicated string dtypes, which do not compare
# equal to 'object'.
categorical_cols = [
    col
    for col, dtype in X.dtypes.items()
    if pd.api.types.is_object_dtype(dtype)
    or pd.api.types.is_string_dtype(dtype)
    or isinstance(dtype, pd.CategoricalDtype)
]

# Apply label encoding to categorical features. Each column gets its own fitted
# encoder, kept in a dict so the mapping can be inspected or inverted later.
feature_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    feature_encoders[col] = encoder
    print(f"Encoded categorical column: {col}")

# Encode the target variable (e.g., 'DDoS', 'PortScan') to integer class ids.
# The encoder is always fitted, whatever the target dtype, because the evaluation
# cell reads le_y.classes_ unconditionally; le_y.inverse_transform can map
# predictions back to the original labels.
le_y = LabelEncoder()
y = le_y.fit_transform(y)
print("Target variable encoded.")

# Split data into training and testing sets
# test_size=0.2 holds out 20% of the rows for testing
# random_state makes the split reproducible
# stratify=y keeps the class proportions similar in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nShape of training features: {X_train.shape}")
print(f"Shape of testing features: {X_test.shape}")
print(f"Shape of training target: {y_train.shape}")
print(f"Shape of testing target: {y_test.shape}")

Model Training

In [None]:
# Random forest configuration: 100 trees, fixed seed for reproducibility,
# n_jobs=-1 to use all available processors.
forest_settings = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
model = RandomForestClassifier(**forest_settings)

print("Training Model...")
# Fit the forest on the training split
model.fit(X_train, y_train)
print("Model training complete!")

Model Evaluation

In [None]:
# Predict on the held-out test split
y_pred = model.predict(X_test)

# Class names as strings, plus the integer ids the label encoder assigned to them.
# Passing the ids explicitly as `labels` keeps the report rows and the confusion
# matrix axes aligned with class_names even if some class is absent from the
# test split and the predictions.
class_names = [str(name) for name in le_y.classes_]
class_ids = np.arange(len(class_names))

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report (Precision, Recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0))

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 8}, xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix (Raw Counts)')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Feature Importance
print("\n--- Feature Importances ---")
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)

# barh draws the first row at the bottom, so sort ascending to put the most
# important feature at the top of the chart.
fig, ax = plt.subplots()
feature_importances.nlargest(10).sort_values().plot(kind='barh', ax=ax)
ax.set_title('Top 10 Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()