In [13]:
import pandas as pd
import glob
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
#import smote
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier

In [14]:
# Dataset locations. Each collaborator keeps their own pair of paths here and
# leaves exactly one pair active. Both folders must end with a path separator
# because "*.csv" is appended directly to them when the files are listed.

# Grayson's machine (disabled)
#train_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\nineteen-class\\data\\train\\"
#test_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\nineteen-class\\data\\test\\"

# Will's machine (active; comment out if not in use)
train_folder = (
    "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\"
    "will-grayson-ML\\nineteen-class\\data\\train\\"
)
test_folder = (
    "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\"
    "will-grayson-ML\\nineteen-class\\data\\test\\"
)

# Function to load and concatenate CSV files from a list of file paths
def load_and_concat(files, sample_fraction=None):
    """Read every CSV in ``files`` and stack the rows into one DataFrame.

    Parameters
    ----------
    files : iterable of path-like
        CSV files to read. Rows appear in the result in this order.
    sample_fraction : float, optional
        When given, each file is independently down-sampled to this fraction
        of its rows (``DataFrame.sample(frac=..., random_state=42)``) before
        concatenation. ``None`` (the default) keeps every row. ``0`` is
        honoured as "keep no rows" rather than being treated as "disabled".

    Returns
    -------
    pandas.DataFrame
        The concatenated rows with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``files`` is empty, e.g. because a glob pattern matched nothing.
    """
    files = list(files)
    if not files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # name the likely cause (wrong folder / pattern) instead.
        raise ValueError(
            "No CSV files to load: the file list is empty. "
            "Check the data folder path and glob pattern."
        )

    frames = []
    for file in files:
        df = pd.read_csv(file)
        # Compare against None so an explicit fraction of 0 is not mistaken
        # for "no sampling requested".
        if sample_fraction is not None:
            # Fixed seed keeps the per-file sample reproducible between runs.
            df = df.sample(frac=sample_fraction, random_state=42)
        frames.append(df)
    return pd.concat(frames, ignore_index=True)

# List all CSV files in the train and test folders.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort the
# lists: the row order of the concatenated frames, and therefore the seeded
# sampling/model fit downstream, is then the same on every machine.
train_files = sorted(glob.glob(train_folder + "*.csv"))
test_files = sorted(glob.glob(test_folder + "*.csv"))

# Load and concatenate training and testing data, keeping a 30% random sample
# of every file. One constant drives both splits so they cannot drift apart.
SAMPLE_FRACTION = 0.3
train_df = load_and_concat(train_files, sample_fraction=SAMPLE_FRACTION)
test_df = load_and_concat(test_files, sample_fraction=SAMPLE_FRACTION)

In [15]:
# The 19 traffic classes, each stored as its own indicator column in the CSVs.
# They are collapsed into a single multi-class label further down.
# Assumes the indicators are one-hot encoded (one class set per row).
target_columns = [
    'spoofing',
    'benign',
    'MQTT_DDoS_Connect_Flood',
    'MQTT_DoS_Connect_Flood',
    'MQTT_DDoS_Publish_Flood',
    'MQTT_DoS_Publish_Flood',
    'MQTT_Malformed_Data',
    'Recon_OS_Scan',
    'Recon_Ping_Sweep',
    'Recon_Port_Scan',
    'Recon_VulScan',
    'DoS_ICMP',
    'DoS_SYN',
    'DoS_TCP',
    'DoS_UDP',
    'DDoS_ICMP',
    'DDoS_SYN',
    'DDoS_TCP',
    'DDoS_UDP',
]

# Every remaining column of the training frame is a model input.
feature_columns = [
    name
    for name in train_df.columns
    if name not in target_columns
]

# Split both frames into inputs (X) and one-hot labels (y); the test frame is
# indexed with the column list derived from the training frame.
X_train, y_train = train_df.loc[:, feature_columns], train_df.loc[:, target_columns]
X_test, y_test = test_df.loc[:, feature_columns], test_df.loc[:, target_columns]


In [16]:
# Turn each one-hot label row into the name of its largest column, then map
# those names to integer class ids. The encoder learns its classes from the
# training labels only and is merely applied to the test labels.
label_encoder = LabelEncoder()
y_train_labels = label_encoder.fit_transform(y_train.idxmax(axis="columns"))
y_test_labels = label_encoder.transform(y_test.idxmax(axis="columns"))

# Min-max scale the features: the ranges are learned from the training
# features, then the same fitted scaler is applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Disabled experiment: SMOTE oversampling of the scaled training data ---
# Sampling strategy (a dict keyed by encoded class id):
#sampling_strategy = {0: 230339, 1: 80196, 2: 200000, 3: 200000, 4: 200666, 5: 1006603, 6: 150000, 7: 160000, 8: 363009, 9: 52881, 10: 214952, 11: 462480, 12: 514724, 13: 540498, 14: 704503, 15: 974359, 16: 987063, 17: 1887175, 18: 1998026}

# Resampling call:
#smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
#X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train_labels)


In [17]:
# One-vs-rest wrapper around a random forest with 100 trees of depth <= 10,
# a fixed seed, and all available cores.
model = OneVsRestClassifier(
    RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1,
    )
)

# Fit on the scaled training features and their encoded labels.
model.fit(X_train_scaled, y_train_labels)

# Predict encoded class ids for the scaled test features.
y_pred_labels = model.predict(X_test_scaled)

# Side-by-side table of true vs. predicted class names, decoded back from the
# integer ids.
results = pd.DataFrame({
    column: label_encoder.inverse_transform(encoded)
    for column, encoded in (
        ('actual', y_test_labels),
        ('predicted', y_pred_labels),
    )
})

In [18]:
# Evaluate the model.
# Pin the label set to every class the encoder knows. Without `labels`, both
# metrics infer the classes from the values present in y_test/y_pred; if a
# class is absent from the sampled test data, the confusion matrix shrinks and
# classification_report rejects `target_names` for having the wrong length.
class_ids = list(range(len(label_encoder.classes_)))

# Rows are true classes, columns are predicted classes, both in encoder order.
print(confusion_matrix(y_test_labels, y_pred_labels, labels=class_ids))
print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

[[34959     0     0    11     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0 17218     0    13     0     2     0     1     0     0     0     0
      0     0     0     0     0     6     0]
 [    0     0 18259     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     1     0]
 [    7     0     6 36187     0     0     0     5     0     0     0     1
      0     0     0     0     0     0     1]
 [   11     0     0     0  9822     0     0     1     0     0     0     0
      0     0     0     0     0     9     0]
 [    0     9     0     0     0  9851     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     8     0     0     0  8202     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0    15     5     0     0 13735     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     

In [19]:

# Show every test row whose true class is 'MQTT_DDoS_Publish_Flood', next to
# the class the model predicted for it.
is_ddos_publish = results['actual'].eq('MQTT_DDoS_Publish_Flood')
print(results.loc[is_ddos_publish])

                       actual                predicted
8127  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
8128  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
8129  MQTT_DDoS_Publish_Flood  MQTT_DDoS_Publish_Flood
8130  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
8131  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
...                       ...                      ...
8964  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
8965  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
8966  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
8967  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood
8968  MQTT_DDoS_Publish_Flood   MQTT_DoS_Publish_Flood

[842 rows x 2 columns]
