In [1]:
import pandas as pd
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
# Machine-specific dataset locations; exactly one pair of folders is active.
# Active pair: Grayson's checkout.
train_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\nineteen-class\\data\\train\\"
test_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\nineteen-class\\data\\test\\"

# Alternative pair: Will's checkout (uncomment both lines to switch).
#train_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\train\\"
#test_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\test\\"

# Collect the path of every CSV file directly inside each folder. Both folder
# strings end with a backslash, so the wildcard is appended without a separator.
train_files, test_files = (
    glob.glob(f"{folder}*.csv") for folder in (train_folder, test_folder)
)

# Function to load and concatenate CSV files from a list of file paths
def load_and_concat(files, sample_fraction=None):
    """Read every CSV in ``files`` and stack the rows into one DataFrame.

    Parameters
    ----------
    files : iterable of str or path-like
        Paths of the CSV files to read. Rows appear in the result in the
        same order as the files are given.
    sample_fraction : float, optional
        When not ``None``, each file is independently down-sampled to this
        fraction of its rows (fixed ``random_state=42``) before the frames
        are concatenated. ``None`` (the default) keeps every row.

    Returns
    -------
    pandas.DataFrame
        The (optionally sampled) rows of all files with a fresh
        ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``files`` is empty, e.g. because a glob pattern matched nothing.
    """
    files = list(files)
    if not files:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail early with a message that points at the likely cause.
        raise ValueError(
            "load_and_concat received no files; check that the data folder "
            "exists and contains CSV files."
        )

    frames = []
    for path in files:
        frame = pd.read_csv(path)
        # Compare against None (not truthiness) so an explicit fraction of 0
        # is honoured instead of silently returning the full file.
        if sample_fraction is not None:
            frame = frame.sample(frac=sample_fraction, random_state=42)
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# Load and concatenate training and testing data.
# train_files / test_files were already collected earlier in this cell, so the
# folders are not scanned a second time. glob.glob() returns paths in a
# filesystem-dependent order; sorting pins the row order of the concatenated
# frames so the loaded data is laid out the same way on every machine.
train_files = sorted(train_files)
test_files = sorted(test_files)
train_df = load_and_concat(train_files, sample_fraction=0.1)  # Use 10% of the data
test_df = load_and_concat(test_files, sample_fraction=0.1)

In [3]:
# Label columns, defined once so the feature filter and the train/test label
# frames cannot drift apart. The order is significant: the training cell turns
# these columns into integer class ids with argmax, so a column's position in
# this list is its class id.
label_columns = [
    'spoofing', 'benign', 'MQTT_DDoS_Connect_Flood', 'MQTT_DoS_Connect_Flood',
    'MQTT_DDoS_Publish_Flood', 'MQTT_DoS_Publish_Flood', 'MQTT_Malformed_Data',
    'Recon_OS_Scan', 'Recon_Ping_Sweep', 'Recon_Port_Scan', 'Recon_VulScan',
    'DoS_ICMP', 'DoS_SYN', 'DoS_TCP', 'DoS_UDP',
    'DDoS_ICMP', 'DDoS_SYN', 'DDoS_TCP', 'DDoS_UDP'
]

# Every column of the training frame that is not a label is a model feature;
# the original column order is preserved.
feature_columns = [col for col in train_df.columns if col not in label_columns]

# Separate features and labels
X_train = train_df[feature_columns]
y_train = train_df[label_columns]

X_test = test_df[feature_columns]
y_test = test_df[label_columns]

# Create a scaler object
scaler = MinMaxScaler()

# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features so no test statistics enter the scaling.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000)

# Convert one-hot encoded labels to integer class ids for y_train: a row's id
# is the position of its largest label column, so ids follow y_train's column
# order.
y_train_labels = y_train.values.argmax(axis=1)

# Train the model
model.fit(X_train_scaled, y_train_labels)

# Make predictions
y_pred_labels = model.predict(X_test_scaled)

# Convert one-hot encoded labels to integer class ids for y_test
y_test_labels = y_test.values.argmax(axis=1)

# Evaluate the model against the full, fixed set of class ids. Without an
# explicit `labels` list scikit-learn infers the classes from the data, so a
# class missing from both the sampled test set and the predictions would
# silently shift every later row/column of the matrix.
class_ids = list(range(y_test.shape[1]))
class_names = [str(name) for name in y_test.columns]

print(confusion_matrix(y_test_labels, y_pred_labels, labels=class_ids))
print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=class_ids,
    target_names=class_names,  # show class names instead of bare integers
    zero_division=0,  # report 0 for classes with no predicted/true samples instead of warning
))


[[   84    73     0     0     0     3     0     0     0     0     7     0
      1     0     6     0     0     0     0]
 [  139  3362     1     0     1    38     3     0     1    20     0     0
      1     0   195     0     0     0     0]
 [    0     0  4184     3     0     0     0     0     0     2     0     0
      2     0     0     0     1     0     0]
 [    0     0    44   268     0     0     0     0     0     0     0     0
      1     0     0     0     0     0     0]
 [    0    37     1     0   113   690     1     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0    19     0     0     0   831     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0    96    33     0     1     9    27     0     0     0     0     0
      4     2     3     0     0     0     0]
 [    0    41     4     5     1     1     0    10     0   155     1     0
      4     0     0     0   161     0     0]
 [    0    10     0     0     0     0     0     