<a href="https://colab.research.google.com/github/sushilsayshello/ddos_ml/blob/main/IEEE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Step 1: Set up and install required libraries
!pip install pandas numpy scikit-learn xgboost

# Step 2: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
import zipfile




In [5]:
# Load the dataset
data = pd.read_csv('RT_IOT2022.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,id.orig_p,id.resp_p,proto,service,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
0,0,38667,1883,tcp,mqtt,32.011598,9,5,3,3,...,0.0,29729180.0,29729180.0,29729180.0,29729180.0,0.0,64240,26847,502,MQTT_Publish
1,1,51143,1883,tcp,mqtt,31.883584,9,5,3,3,...,0.0,29855280.0,29855280.0,29855280.0,29855280.0,0.0,64240,26847,502,MQTT_Publish
2,2,44761,1883,tcp,mqtt,32.124053,9,5,3,3,...,0.0,29842150.0,29842150.0,29842150.0,29842150.0,0.0,64240,26847,502,MQTT_Publish
3,3,60893,1883,tcp,mqtt,31.961063,9,5,3,3,...,0.0,29913770.0,29913770.0,29913770.0,29913770.0,0.0,64240,26847,502,MQTT_Publish
4,4,51087,1883,tcp,mqtt,31.902362,9,5,3,3,...,0.0,29814700.0,29814700.0,29814700.0,29814700.0,0.0,64240,26847,502,MQTT_Publish


In [15]:
# Select features and target (dropping irrelevant columns)
X = data.drop(['Attack_type', 'Unnamed: 0'], axis=1)
y = data['Attack_type']

In [16]:
# Convert categorical features to numerical using one-hot encoding
X = pd.get_dummies(X, columns=['proto', 'service'], drop_first=True)

In [17]:
# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [18]:
# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [19]:
# Step 5: Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
# Step 6: Define and Train Machine Learning Models

# XGBoost Model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

In [21]:
# K-Nearest Neighbors (KNN) Model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

In [22]:
# Stochastic Gradient Descent (SGD) Model
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)

In [23]:
# Naïve Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [24]:
# Step 7: Evaluate Models Using Metrics
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1

In [25]:
# Evaluate each model
xgb_metrics = evaluate_model(xgb_model, X_test, y_test)
knn_metrics = evaluate_model(knn_model, X_test, y_test)
sgd_metrics = evaluate_model(sgd_model, X_test, y_test)
nb_metrics = evaluate_model(nb_model, X_test, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# Step 8: Display Results in a Table
# Organize the metrics in a DataFrame for easy comparison
results = pd.DataFrame({
    'Model': ['XGBoost', 'KNN', 'SGD', 'Naive Bayes'],
    'Accuracy': [xgb_metrics[0], knn_metrics[0], sgd_metrics[0], nb_metrics[0]],
    'Precision': [xgb_metrics[1], knn_metrics[1], sgd_metrics[1], nb_metrics[1]],
    'Recall': [xgb_metrics[2], knn_metrics[2], sgd_metrics[2], nb_metrics[2]],
    'F1 Score': [xgb_metrics[3], knn_metrics[3], sgd_metrics[3], nb_metrics[3]]
})

In [27]:
print("Model Performance Metrics:")
print(results)

Model Performance Metrics:
         Model  Accuracy  Precision    Recall  F1 Score
0      XGBoost  0.998186   0.998184  0.998186  0.998180
1          KNN  0.995587   0.995586  0.995587  0.995546
2          SGD  0.982564   0.983273  0.982564  0.982291
3  Naive Bayes  0.910900   0.966957  0.910900  0.921368
