In [8]:
import pandas as pd
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [11]:
# Grayson's path
train_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\six-class\\data\\train\\"
test_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\six-class\\data\\test\\"

# Will's path
#train_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\train\\"
#test_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\test\\"

# List all CSV files in the train folder
train_files = glob.glob(train_folder + "*.csv")
test_files = glob.glob(test_folder + "*.csv")

# Function to load and concatenate CSV files from a list of file paths
def load_and_concat(files, sample_fraction=None):
    df_list = []
    for file in files:
        df = pd.read_csv(file)
        if sample_fraction:
            df = df.sample(frac=sample_fraction, random_state=42)  # Random sampling
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

# Load and concatenate training and testing data
train_files = glob.glob(train_folder + "*.csv")
test_files = glob.glob(test_folder + "*.csv")
train_df = load_and_concat(train_files, sample_fraction=0.1)  # Use 10% of the data
test_df = load_and_concat(test_files, sample_fraction=0.1)

In [12]:
# Ensure the columns match by selecting the same feature columns in both train and test sets
feature_columns = [col for col in train_df.columns if col not in ['spoofing', 'benign', 'MQTT', 'recon', 'DDoS', 'DoS']]

# Create a new target column for the 6-class classification
def determine_class(row):
    if row['spoofing'] == 1:
        return 'spoofing'
    elif row['benign'] == 1:
        return 'benign'
    elif row['MQTT'] == 1:
        return 'MQTT'
    elif row['recon'] == 1:
        return 'recon'
    elif row['DDoS'] == 1:
        return 'DDoS'
    elif row['DoS'] == 1:
        return 'DoS'
    else:
        return 'unknown'

train_df['Attack_Type'] = train_df.apply(determine_class, axis=1)
test_df['Attack_Type'] = test_df.apply(determine_class, axis=1)

# Separate features and labels
X_train = train_df[feature_columns]
y_train = train_df['Attack_Type']
X_test = test_df[feature_columns]
y_test = test_df['Attack_Type']


In [13]:
# Encode the labels to numerical values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Create a scaler object
scaler = MinMaxScaler()

# Fit the scaler to the training features and transform both training and testing features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled features to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_columns)

In [15]:
# Initialize the AdaBoost model
model = AdaBoostClassifier(n_estimators=100, algorithm='SAMME')

# Train the model
model.fit(X_train_scaled_df, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled_df)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[110844    861      0      0      6      0]
 [   695  42134      0      0      2      0]
 [    74      1     15     77      8      0]
 [    29      0      0   3562     10    160]
 [  1206      0      0    100   1430     31]
 [    19      0      0     27      0    128]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99    111711
           1       0.98      0.98      0.98     42831
           2       1.00      0.09      0.16       175
           3       0.95      0.95      0.95      3761
           4       0.98      0.52      0.68      2767
           5       0.40      0.74      0.52       174

    accuracy                           0.98    161419
   macro avg       0.88      0.71      0.71    161419
weighted avg       0.98      0.98      0.98    161419

