In [1]:
import glob

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Data folders on Grayson's machine (currently active)
train_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\six-class\\data\\train\\"
test_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\six-class\\data\\test\\"

# Data folders on Will's machine (uncomment these two lines to switch)
#train_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\train\\"
#test_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\test\\"

# Gather the path of every CSV file sitting directly inside each folder
train_files, test_files = (
    glob.glob(folder + "*.csv") for folder in (train_folder, test_folder)
)

def load_and_concat(files, sample_fraction=None):
    """Read every CSV in ``files`` and stack the rows into one DataFrame.

    Parameters
    ----------
    files : iterable of path-like
        CSV files to read with ``pd.read_csv``.
    sample_fraction : float, optional
        When truthy, each file is down-sampled independently to this fraction
        of its rows (fixed ``random_state=42``) before concatenation.
        ``None`` (the default) or ``0`` keeps every row.

    Returns
    -------
    pandas.DataFrame
        All (optionally sampled) rows, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``files`` is empty, e.g. because the glob pattern matched nothing.
    """
    files = list(files)
    if not files:
        # pd.concat([]) would raise the much less helpful
        # "No objects to concatenate"; fail early with an actionable message.
        raise ValueError(
            "No CSV files to load; check that the data folder path is correct."
        )

    frames = []
    for file in files:
        frame = pd.read_csv(file)
        if sample_fraction:
            # Same seed for every file keeps the sampling reproducible.
            frame = frame.sample(frac=sample_fraction, random_state=42)
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# Load and concatenate training and testing data.
# Reuses the train_files / test_files lists built by the glob calls earlier in
# this cell instead of globbing the same folders a second time.
SAMPLE_FRACTION = 0.1  # keep 10% of the rows of every file
train_df = load_and_concat(train_files, sample_fraction=SAMPLE_FRACTION)
test_df = load_and_concat(test_files, sample_fraction=SAMPLE_FRACTION)

In [3]:
# Feature columns: every train column that is neither one of the six one-hot
# label columns nor the derived 'Attack_Type' target, and that also exists in
# the test frame, so both sets can be indexed with the same list.
# 'Attack_Type' is excluded so this cell stays safe to re-run: once the
# string-valued target has been added to train_df further down, it would
# otherwise be picked up as a feature.
non_feature_columns = {'spoofing', 'benign', 'MQTT', 'recon', 'DDoS', 'DoS', 'Attack_Type'}
feature_columns = [
    col for col in train_df.columns
    if col not in non_feature_columns and col in test_df.columns
]

# Collapse the six one-hot indicator columns into a single class name
def determine_class(row):
    """Return the name of the first indicator column whose value equals 1.

    Columns are checked in the order spoofing, benign, MQTT, recon, DDoS, DoS.
    'unknown' is returned when none of them equals 1.
    """
    for label in ('spoofing', 'benign', 'MQTT', 'recon', 'DDoS', 'DoS'):
        if row[label] == 1:
            return label
    return 'unknown'

# Derive the 'Attack_Type' target on both frames, one row at a time
for frame in (train_df, test_df):
    frame['Attack_Type'] = frame.apply(determine_class, axis=1)

# Split each frame into its feature matrix and its label vector
X_train, y_train = train_df[feature_columns], train_df['Attack_Type']
X_test, y_test = test_df[feature_columns], test_df['Attack_Type']

In [4]:
# Encode the string labels as integers.
# The labels are read straight from the 'Attack_Type' columns rather than from
# y_train / y_test, because this cell rebinds those two names to the encoded
# arrays: re-running it would otherwise fit the encoder on already-encoded
# integers, and inverse_transform would stop returning class names.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['Attack_Type'])
y_test = label_encoder.transform(test_df['Attack_Type'])

# Min-max scale the features. The scaler is fitted on the training features
# only and the same fitted transform is then applied to the test features.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled values back into DataFrames carrying the feature names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_columns)

In [5]:
# Initialize the Logistic Regression model.
# multi_class is deliberately not passed: with the lbfgs solver and more than
# two classes scikit-learn (>= 0.22) already fits a multinomial model, and the
# explicit argument is deprecated since scikit-learn 1.5.
model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Train the model
model.fit(X_train_scaled_df, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled_df)

# Decode the numerical predictions back to original labels
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Evaluate the model.
# With no explicit `labels`, the confusion-matrix rows/columns follow the
# sorted order of the class names, the same order the report lists them in.
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred_labels))
print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred_labels))



Confusion Matrix:
[[109884   1612      0    106    107      2]
 [ 36847   5819      0     51    114      0]
 [    39     16     20     99      0      1]
 [    47     93      0   3466     12    143]
 [   969     96      0    144   1520     38]
 [    17     16      0     56      3     82]]

Classification Report:
              precision    recall  f1-score   support

        DDoS       0.74      0.98      0.85    111711
         DoS       0.76      0.14      0.23     42831
        MQTT       1.00      0.11      0.21       175
      benign       0.88      0.92      0.90      3761
       recon       0.87      0.55      0.67      2767
    spoofing       0.31      0.47      0.37       174

    accuracy                           0.75    161419
   macro avg       0.76      0.53      0.54    161419
weighted avg       0.75      0.75      0.68    161419



In [None]:
# Cross-validation
# Scaling is done inside a pipeline so that each fold fits its own MinMaxScaler
# on that fold's training part only. Cross-validating on the pre-scaled
# X_train_scaled_df would let the validation rows influence the scaling ranges.
# cross_validate works on clones, so the already-fitted `model` is not changed.
cv_pipeline = make_pipeline(MinMaxScaler(), model)
stratified_k_fold = StratifiedKFold(n_splits=3)
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_results = cross_validate(cv_pipeline, X_train, y_train, cv=stratified_k_fold, scoring=scoring, n_jobs=-1)

# Print cross-validation scores (one array of per-fold scores per metric)
print("Cross-validation accuracy scores:", cv_results['test_accuracy'])
print("Mean cross-validation accuracy:", cv_results['test_accuracy'].mean())
print("Cross-validation precision scores:", cv_results['test_precision_macro'])
print("Mean cross-validation precision:", cv_results['test_precision_macro'].mean())
print("Cross-validation recall scores:", cv_results['test_recall_macro'])
print("Mean cross-validation recall:", cv_results['test_recall_macro'].mean())
print("Cross-validation F1 scores:", cv_results['test_f1_macro'])
print("Mean cross-validation F1:", cv_results['test_f1_macro'].mean())