In [48]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [49]:
data_path = os.path.join('..', 'data', 'MachineLearningCVE')

In [50]:
cols = set()
dfs = []

for file in os.listdir(data_path):
    path = os.path.join(data_path, file)
    traffic_df = pd.read_csv(path)
    dfs.append(traffic_df)
    cols.update(traffic_df.columns)
full_traffic_data = pd.concat(dfs, ignore_index = True)
full_traffic_data.columns = full_traffic_data.columns.str.strip()
print(cols)

{' Fwd Packet Length Min', ' PSH Flag Count', ' Subflow Bwd Packets', ' Fwd Avg Bulk Rate', ' Min Packet Length', ' Fwd IAT Std', 'Fwd Avg Bytes/Bulk', ' Fwd Packet Length Max', ' Fwd IAT Max', ' Active Max', 'FIN Flag Count', ' Total Fwd Packets', ' ACK Flag Count', ' Bwd Avg Packets/Bulk', ' min_seg_size_forward', ' Bwd Header Length', ' Flow IAT Max', ' SYN Flag Count', ' URG Flag Count', ' Fwd Header Length', ' Subflow Fwd Bytes', 'Bwd Packet Length Max', ' Average Packet Size', ' Bwd Packet Length Std', ' Idle Max', ' Bwd Packet Length Min', 'Fwd PSH Flags', ' Active Min', ' Bwd Packets/s', ' Fwd URG Flags', ' Bwd PSH Flags', ' Total Backward Packets', ' Total Length of Bwd Packets', 'Init_Win_bytes_forward', ' Down/Up Ratio', ' Fwd IAT Min', ' Bwd IAT Std', ' Fwd Packet Length Mean', ' Bwd IAT Mean', 'Fwd IAT Total', ' Idle Std', 'Total Length of Fwd Packets', ' Fwd Header Length.1', ' Flow IAT Mean', 'Active Mean', ' Active Std', ' Packet Length Std', ' Bwd URG Flags', ' Fwd Pac

In [51]:
print(full_traffic_data.head)

<bound method NDFrame.head of          Destination Port  Flow Duration  Total Fwd Packets  \
0                   54865              3                  2   
1                   55054            109                  1   
2                   55055             52                  1   
3                   46236             34                  1   
4                   54863              3                  2   
...                   ...            ...                ...   
2830738                53          32215                  4   
2830739                53            324                  2   
2830740             58030             82                  2   
2830741                53        1048635                  6   
2830742                53          94939                  4   

         Total Backward Packets  Total Length of Fwd Packets  \
0                             0                           12   
1                             1                            6   
2                    

In [52]:
print(full_traffic_data.columns)

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [53]:
# selected_features = [
#     'Flow Duration',             # Overall time of the flow
#     'Total Fwd Packets',         # Total packets in the forward direction
#     'Total Backward Packets',    # Total packets in the backward direction
#     'Average Packet Size',       # Mean packet size
#     'Down/Up Ratio',             # Ratio between down and up flows
#     'Fwd IAT Mean',              # Mean Inter-Arrival Time for forward packets
#     'Bwd IAT Mean',              # Mean Inter-Arrival Time for backward packets
#     'Fwd Packet Length Mean',    # Mean forward packet length
#     'Bwd Packet Length Mean',    # Mean backward packet length
#     'Destination Port'           # Destination port can sometimes be indicative
# ]

# # The dependent variable (target) is assumed to be 'Label'
# X = full_traffic_data[selected_features]
# y = full_traffic_data['Label']


# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Split the data into training and testing sets (e.g., 70% train, 30% test)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Initialize the logistic regression model. Increase max_iter if necessary.
# model = LogisticRegression(max_iter=1000)

# # Train the model on the training data.
# model.fit(X_train, y_train)

# # Predict the labels for the test set.
# y_pred = model.predict(X_test)

# # Evaluate model performance.
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)
# print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [54]:
subset_data = full_traffic_data.sample(frac=1/15, random_state=42)
print("Subset shape:", subset_data.shape)

# Define your selected features and target variable.
selected_features = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Average Packet Size',
    'Down/Up Ratio',
    'Fwd IAT Mean',
    'Bwd IAT Mean',
    'Fwd Packet Length Mean',
    'Bwd Packet Length Mean',
    'Destination Port'
]

X = subset_data[selected_features]
y = subset_data['Label']

# Split the subset into training and testing sets (70% train, 30% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set.
y_pred = model.predict(X_test)

# Evaluate model performance.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Subset shape: (188716, 79)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7964673673054844


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
                             precision    recall  f1-score   support

                    BENIGN       0.85      0.95      0.90     45455
                       Bot       0.00      0.00      0.00        38
                      DDoS       0.00      0.00      0.00      2580
             DoS GoldenEye       0.00      0.00      0.00       208
                  DoS Hulk       0.33      0.41      0.37      4578
          DoS Slowhttptest       0.00      0.00      0.00       104
             DoS slowloris       0.00      0.00      0.00       110
               FTP-Patator       0.00      0.00      0.00       177
                  PortScan       0.00      0.00      0.00      3219
               SSH-Patator       0.00      0.00      0.00       110
  Web Attack � Brute Force       0.00      0.00      0.00        28
Web Attack � Sql Injection       0.00      0.00      0.00         1
          Web Attack � XSS       0.00      0.00      0.00         7

                  acc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# subset_data = full_traffic_data.sample(frac=1/25, random_state=42)
# print("Subset shape:", subset_data.shape)

# # Define your selected features and target variable.
# selected_features = [
#     'Flow Duration',
#     'Total Fwd Packets',
#     'Total Backward Packets',
#     'Average Packet Size',
#     'Down/Up Ratio',
#     'Fwd IAT Mean',
#     'Bwd IAT Mean',
#     'Fwd Packet Length Mean',
#     'Bwd Packet Length Mean',
#     'Destination Port'
# ]

# X = subset_data[selected_features]
# y = subset_data['Label']

# # Split the subset into training and testing sets (70% train, 30% test).
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Identify classes in y_train that have at least 2 samples.
# class_counts = y_train.value_counts()
# classes_to_keep = class_counts[class_counts >= 2].index
# print("Classes retained for oversampling:", classes_to_keep.tolist())

# # Filter out classes with fewer than 2 samples from the training set.
# X_train_filtered = X_train[y_train.isin(classes_to_keep)]
# y_train_filtered = y_train[y_train.isin(classes_to_keep)]

# # Use SMOTE with k_neighbors set to 1 (now safe because every class has at least 2 samples).
# smote = SMOTE(random_state=42, k_neighbors=1)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_filtered, y_train_filtered)

# # Initialize and train the logistic regression model.
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train_resampled, y_train_resampled)

# # Make predictions on the test set.
# y_pred = model.predict(X_test)

# # Evaluate model performance.
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))


Subset shape: (113230, 79)
Classes retained for oversampling: ['BENIGN', 'DoS Hulk', 'PortScan', 'DDoS', 'DoS GoldenEye', 'FTP-Patator', 'DoS slowloris', 'SSH-Patator', 'DoS Slowhttptest', 'Bot', 'Web Attack � Brute Force', 'Web Attack � XSS']


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.05928935205628661

Classification Report:
                           precision    recall  f1-score   support

                  BENIGN       0.81      0.03      0.06     27240
                     Bot       0.00      0.65      0.00        26
                    DDoS       0.01      0.00      0.01      1559
           DoS GoldenEye       0.06      0.34      0.10       116
                DoS Hulk       0.20      0.38      0.26      2752
        DoS Slowhttptest       0.00      0.01      0.00        79
           DoS slowloris       0.04      0.42      0.08        83
             FTP-Patator       0.00      0.00      0.00       115
              Heartbleed       0.00      0.00      0.00         1
                PortScan       0.00      0.00      0.00      1912
             SSH-Patator       0.00      0.44      0.01        64
Web Attack � Brute Force       0.00      0.00      0.00        15
        Web Attack � XSS       0.00      0.00      0.00         7

                acc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
