In [15]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm

In [16]:
data_path = os.path.join('..', 'data', 'MachineLearningCVE')

In [17]:
cols = set()
dfs = []

for file in os.listdir(data_path):
    path = os.path.join(data_path, file)
    traffic_df = pd.read_csv(path)
    dfs.append(traffic_df)
    cols.update(traffic_df.columns)
full_traffic_data = pd.concat(dfs, ignore_index = True)
full_traffic_data.columns = full_traffic_data.columns.str.strip()
print(cols)

{' Down/Up Ratio', ' Flow Duration', ' Fwd Packet Length Max', ' Fwd IAT Max', ' CWE Flag Count', 'Flow Bytes/s', ' Fwd Packet Length Std', ' ACK Flag Count', 'Fwd IAT Total', ' Bwd IAT Min', ' Bwd URG Flags', ' Packet Length Mean', ' ECE Flag Count', ' Subflow Bwd Packets', ' Bwd Packets/s', ' Bwd Avg Bytes/Bulk', ' Max Packet Length', ' Bwd Packet Length Std', 'FIN Flag Count', ' Fwd Avg Bulk Rate', 'Total Length of Fwd Packets', ' SYN Flag Count', ' Avg Bwd Segment Size', 'Init_Win_bytes_forward', ' Fwd Header Length', 'Active Mean', ' URG Flag Count', ' Total Fwd Packets', 'Fwd Packets/s', ' Fwd URG Flags', ' Bwd Packet Length Min', ' Active Min', ' Flow IAT Std', ' Bwd IAT Max', ' Min Packet Length', ' Bwd IAT Std', 'Fwd Avg Bytes/Bulk', ' Subflow Fwd Bytes', ' Flow IAT Max', ' Init_Win_bytes_backward', 'Fwd PSH Flags', ' Flow IAT Mean', ' Bwd Avg Packets/Bulk', ' Subflow Bwd Bytes', ' Idle Min', 'Bwd IAT Total', ' Active Max', ' Fwd IAT Min', ' Bwd Packet Length Mean', 'Subflow F

In [18]:
print(full_traffic_data.head)

<bound method NDFrame.head of          Destination Port  Flow Duration  Total Fwd Packets  \
0                   54865              3                  2   
1                   55054            109                  1   
2                   55055             52                  1   
3                   46236             34                  1   
4                   54863              3                  2   
...                   ...            ...                ...   
2830738                53          32215                  4   
2830739                53            324                  2   
2830740             58030             82                  2   
2830741                53        1048635                  6   
2830742                53          94939                  4   

         Total Backward Packets  Total Length of Fwd Packets  \
0                             0                           12   
1                             1                            6   
2                    

In [19]:
print(full_traffic_data.columns)

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [25]:
import numpy as np
subset_data = full_traffic_data.sample(frac=1/30, random_state=42)
print("Subset shape:", subset_data.shape)

# Define your selected features and target variable.
selected_features = [
    'Flow Duration',                 # Overall duration of the flow
    'Total Fwd Packets',             # Number of forward packets
    'Total Backward Packets',        # Number of backward packets
    'Total Length of Fwd Packets',   # Total bytes in forward direction
    'Total Length of Bwd Packets',   # Total bytes in backward direction
    'Fwd Packet Length Mean',        # Average forward packet size
    'Bwd Packet Length Mean',        # Average backward packet size
    'Flow Bytes/s',                  # Bytes transmitted per second
    'Flow Packets/s',                # Packets transmitted per second
    'Flow IAT Mean',                 # Mean inter-arrival time of packets in the flow
    'Flow IAT Std',                  # Std of inter-arrival times in the flow
    'Fwd IAT Mean',                  # Mean inter-arrival time for forward packets
    'Fwd IAT Std',                   # Std for forward inter-arrival times
    'Bwd IAT Mean',                  # Mean inter-arrival time for backward packets
    'Bwd IAT Std',                   # Std for backward inter-arrival times
    'Down/Up Ratio',                 # Ratio of downlink to uplink traffic
    'Average Packet Size',           # Overall average packet size in the flow
    'Destination Port',              # Port number indicating service type
    'SYN Flag Count',                # Number of SYN flags
    'ACK Flag Count',                # Number of ACK flags
    'RST Flag Count',                # Number of RST flags
    'Fwd PSH Flags',                 # Forward PSH flag count
    'Bwd PSH Flags',                 # Backward PSH flag count
    'Subflow Fwd Packets',           # Packets in forward subflows
    'Subflow Bwd Packets',           # Packets in backward subflows
    'Active Mean',                   # Average duration of active periods
    'Active Std',                    # Variation in active periods
    'Idle Mean',                     # Average idle time
    'Idle Std'                       # Variation in idle time
]

X = subset_data[selected_features]
y = subset_data['Label']

X.replace([np.inf, -np.inf], np.nan, inplace=True)
X = X.dropna()
y = y[X.index]  


# Split the subset into training and testing sets (70% train, 30% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set.
y_pred = model.predict(X_test)

# Evaluate model performance.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Subset shape: (94358, 79)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace([np.inf, -np.inf], np.nan, inplace=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8438306751069774

Classification Report:
                           precision    recall  f1-score   support

                  BENIGN       0.88      0.94      0.91     22686
                     Bot       0.00      0.00      0.00        18
                    DDoS       0.37      0.42      0.39      1246
           DoS GoldenEye       0.83      0.28      0.42       102
                DoS Hulk       0.84      0.82      0.83      2398
        DoS Slowhttptest       0.00      0.00      0.00        61
           DoS slowloris       0.21      0.25      0.23        61
             FTP-Patator       0.00      0.00      0.00        87
              Heartbleed       0.00      0.00      0.00         0
                PortScan       0.00      0.00      0.00      1543
             SSH-Patator       0.00      0.00      0.00        60
Web Attack � Brute Force       0.00      0.00      0.00        10
        Web Attack � XSS       0.00      0.00      0.00         5

                accu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
X_train_const = sm.add_constant(X_train)

# Fit the logistic regression model with statsmodels
logit_model = sm.Logit(y_train, X_train_const)
result = logit_model.fit()

# Display the summary, which includes coefficient estimates, standard errors, z-values, and p-values.
print(result.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
# subset_data = full_traffic_data.sample(frac=1/25, random_state=42)
# print("Subset shape:", subset_data.shape)

# # Define your selected features and target variable.
# selected_features = [
#     'Flow Duration',
#     'Total Fwd Packets',
#     'Total Backward Packets',
#     'Average Packet Size',
#     'Down/Up Ratio',
#     'Fwd IAT Mean',
#     'Bwd IAT Mean',
#     'Fwd Packet Length Mean',
#     'Bwd Packet Length Mean',
#     'Destination Port'
# ]

# X = subset_data[selected_features]
# y = subset_data['Label']

# # Split the subset into training and testing sets (70% train, 30% test).
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Identify classes in y_train that have at least 2 samples.
# class_counts = y_train.value_counts()
# classes_to_keep = class_counts[class_counts >= 2].index
# print("Classes retained for oversampling:", classes_to_keep.tolist())

# # Filter out classes with fewer than 2 samples from the training set.
# X_train_filtered = X_train[y_train.isin(classes_to_keep)]
# y_train_filtered = y_train[y_train.isin(classes_to_keep)]

# # Use SMOTE with k_neighbors set to 1 (now safe because every class has at least 2 samples).
# smote = SMOTE(random_state=42, k_neighbors=1)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_filtered, y_train_filtered)

# # Initialize and train the logistic regression model.
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train_resampled, y_train_resampled)

# # Make predictions on the test set.
# y_pred = model.predict(X_test)

# # Evaluate model performance.
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))


Subset shape: (113230, 79)
Classes retained for oversampling: ['BENIGN', 'DoS Hulk', 'PortScan', 'DDoS', 'DoS GoldenEye', 'FTP-Patator', 'DoS slowloris', 'SSH-Patator', 'DoS Slowhttptest', 'Bot', 'Web Attack � Brute Force', 'Web Attack � XSS']


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.05928935205628661

Classification Report:
                           precision    recall  f1-score   support

                  BENIGN       0.81      0.03      0.06     27240
                     Bot       0.00      0.65      0.00        26
                    DDoS       0.01      0.00      0.01      1559
           DoS GoldenEye       0.06      0.34      0.10       116
                DoS Hulk       0.20      0.38      0.26      2752
        DoS Slowhttptest       0.00      0.01      0.00        79
           DoS slowloris       0.04      0.42      0.08        83
             FTP-Patator       0.00      0.00      0.00       115
              Heartbleed       0.00      0.00      0.00         1
                PortScan       0.00      0.00      0.00      1912
             SSH-Patator       0.00      0.44      0.01        64
Web Attack � Brute Force       0.00      0.00      0.00        15
        Web Attack � XSS       0.00      0.00      0.00         7

                acc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
