In [None]:
import logging
import os

import joblib  # Import joblib to save the model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import  DecisionTreeClassifier
from tqdm import tqdm

def _numeric_field(entry, key, default=0):
    """Return ``entry[key]``, or ``default`` when the key is absent or holds None."""
    value = entry.get(key, default)
    return default if value is None else value


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


# Function to preprocess the JSON data
def preprocess_json_data(collection):
    """Build a scaled feature matrix and integer labels from a MongoDB collection.

    Each document yielded by ``collection.find()`` becomes one row of five
    derived features (in the order given by the returned ``feature_names``)
    and one integer label: ``'BENIGN'`` -> 0, ``'UDP_FLOOD'`` -> 1,
    ``'TCP_SYN_FLOOD'`` -> 2, anything else (including a missing label) -> -1.

    The two ratio features are recorded as NaN when their denominator is zero
    instead of raising ``ZeroDivisionError``; the constant imputer then turns
    those NaNs into 0. Fields stored as ``None`` are treated like absent
    fields and receive the same defaults.

    Note that the imputer and scaler are fitted on every record read here,
    i.e. before any train/test split the caller performs.

    Args:
        collection: Object exposing ``count_documents(filter)`` and ``find()``
            whose documents support ``.get(key, default)``.

    Returns:
        Tuple ``(imputer, scaler, X_scaled, y, feature_names)``: the fitted
        ``SimpleImputer``, the fitted ``StandardScaler``, the imputed and
        standardized feature array, the label array, and the list of feature
        names.

    Raises:
        ValueError: If the collection yields no documents.
    """
    feature_names = [
        'tcp_syn_flag_ratio',
        'udp_port_entropy',
        'avg_pkt_size',
        'flow_density',
        'ip_entropy'
    ]
    label_codes = {'BENIGN': 0, 'UDP_FLOOD': 1, 'TCP_SYN_FLOOD': 2}

    total_records = collection.count_documents({})

    logging.info(f"Processing {total_records} records from MongoDB...")

    rows = []
    labels = []
    for i, entry in enumerate(tqdm(collection.find(), total=total_records, desc="Processing Records")):
        # Missing packet counts default to 1 each (as before); an explicit
        # zero total now yields NaN rather than aborting the whole pass.
        tcp_syn_flag_ratio = _safe_ratio(
            _numeric_field(entry, 'tcp_syn_fwd_count') + _numeric_field(entry, 'tcp_syn_bwd_count'),
            _numeric_field(entry, 'fwd_packet_count', 1) + _numeric_field(entry, 'bwd_packet_count', 1),
        )
        udp_port_entropy = (
            _numeric_field(entry, 'unique_udp_source_ports') * _numeric_field(entry, 'unique_udp_dest_ports')
        )
        avg_pkt_size = (
            _numeric_field(entry, 'avg_fwd_pkt_size') + _numeric_field(entry, 'avg_bwd_pkt_size')
        ) / 2
        flow_density = _safe_ratio(
            _numeric_field(entry, 'flow_packets_per_sec'),
            _numeric_field(entry, 'flow_bytes_per_sec', 1),
        )
        ip_entropy = _numeric_field(entry, 'source_ip_entropy') + _numeric_field(entry, 'dest_port_entropy')

        rows.append([tcp_syn_flag_ratio, udp_port_entropy, avg_pkt_size, flow_density, ip_entropy])

        label = entry.get('label')
        # Non-string labels cannot match any known class; keep them at -1.
        labels.append(label_codes.get(label, -1) if isinstance(label, str) else -1)

        # Print progress every 1000 samples
        if i % 1000 == 0 and i > 0:
            logging.info(f"Processed {i} records...")

    if not rows:
        raise ValueError("No records found in the collection; cannot fit the imputer and scaler.")

    X = np.array(rows, dtype=float)
    y = np.array(labels)

    unknown_count = int((y == -1).sum())
    if unknown_count:
        # These rows are still returned (label -1), so make them visible.
        logging.warning(f"{unknown_count} records have an unrecognized label and were encoded as -1.")

    # NaN (undefined ratios or NaN stored in the source data) becomes 0.
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    X = imputer.fit_transform(X)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    logging.info("Preprocessing complete.")
    return imputer, scaler, X_scaled, y, feature_names

# Function to predict a new sample
def predict_sample(model, scaler, imputer, sample_data):
    """Classify one feature vector using the given imputer, scaler and model.

    Args:
        model: Estimator exposing ``predict``.
        scaler: Transformer exposing ``transform``; applied after the imputer.
        imputer: Transformer exposing ``transform``; applied first.
        sample_data: Feature values of a single sample. It is wrapped in a
            list so the transformers receive a one-row batch.

    Returns:
        Whatever ``model.predict`` returns for that one-row batch.
    """
    single_row_batch = [sample_data]  # shape (1, n_features)
    prepared = scaler.transform(imputer.transform(single_row_batch))
    return model.predict(prepared)


In [None]:
# Configure logging before the try block so the except handler can always use it.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Bound up front so the finally clause can tell whether a connection was
# actually opened in THIS run (and never touches a stale or undefined name).
client = None

try:
    logger.info("Connecting to MongoDB...")

    # MongoDB connection URI.
    # The URI embeds credentials, so it is read from the environment instead
    # of being committed with the notebook. The password that was previously
    # hard-coded here should be treated as compromised and rotated.
    uri = os.environ.get("MONGODB_URI")
    if not uri:
        raise RuntimeError(
            "Set the MONGODB_URI environment variable to the MongoDB connection string."
        )

    # Specify the database and collection
    db_name = "ddos_detection"  # Replace with your database name
    collection_name = "traffic_features"  # Replace with your collection name

    # Connect to the MongoDB cluster
    client = MongoClient(uri)

    db = client[db_name]
    collection = db[collection_name]

    logger.info("Data Preprocessing...")

    # Preprocess data and get feature names
    imputer, scaler, X_scaled, y, feature_names = preprocess_json_data(collection)

    # Save the scaler to a .pkl file
    scaler_filename = 'scaler.pkl'
    joblib.dump(scaler, scaler_filename)
    print(f"Standard Scaler saved to {scaler_filename}")

    # Save the imputer to a .pkl file
    imputer_filename = 'imputer.pkl'
    joblib.dump(imputer, imputer_filename)
    print(f"Imputer saved to {imputer_filename}")

    # Train-test split (80% train, 20% test)
    # NOTE(review): preprocess_json_data fits the imputer and scaler on all
    # records before this split, so the scaling statistics include the test rows.
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train the Decision Tree model
    logger.info("Training the Decision Tree model...")

    tree_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)

    # A single fit call; the former one-iteration progress bar carried no information.
    tree_model.fit(X_train, y_train)

    logger.info("Training complete!")

    # Make predictions
    y_pred = tree_model.predict(X_test)

    # Evaluation: Print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Feature importances
    feature_importances = tree_model.feature_importances_

    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Print the feature importance
    print(feature_importance_df)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    # The model trained above is a decision tree, not a random forest.
    plt.title('Feature Importance in Decision Tree Model')
    plt.show()

    # Save the trained model to a .pkl file
    model_filename = 'decision_tree.pkl'
    joblib.dump(tree_model, model_filename)
    print(f"Model saved to {model_filename}")

except Exception:
    # Keep the cell best-effort (no re-raise) but record the full traceback.
    logger.exception("An error occurred")
finally:
    # Close the client connection (only if one was opened)
    if client is not None:
        client.close()

In [None]:

try:
    # Load pickles
    # joblib.load unpickles arbitrary Python objects: only load artifact
    # files that were produced by a trusted run of the training cell.
    model = joblib.load("decision_tree.pkl")
    imputer = joblib.load("imputer.pkl")
    scaler = joblib.load("scaler.pkl")

    # Example new sample
    # NOTE(review): predict_sample applies the imputer and the scaler itself,
    # so it presumably expects raw feature values. These negative values look
    # like already-standardized features and would then be scaled twice.
    # Confirm and replace with a raw sample if so.
    new_sample = [-0.78267391, -0.14192909, -0.11761717, -0.41677815,  2.21815411]

    # Get prediction for the new sample
    prediction = predict_sample(model, scaler, imputer, new_sample)

    # Inverse of the label encoding used in preprocess_json_data; any code
    # outside this table (e.g. -1) is reported as UNKNOWN.
    label_names = {0: 'BENIGN', 1: 'UDP_FLOOD', 2: 'TCP_SYN_FLOOD'}

    if len(prediction) > 0:
        prediction = prediction[0]
        label_name = label_names.get(int(prediction), 'UNKNOWN')
        print("Prediction for new sample:", prediction, f"({label_name})")
    else:
        print("Prediction for new sample:", prediction)

except Exception:
    # Keep the cell best-effort (no re-raise) but record the full traceback.
    logging.exception("An error occurred")