In [1]:
import pandas as pd
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Grayson's path
train_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\two-class-labeled-data\\train\\"
test_folder = "C:\\Users\\grays\\Will-Grayson GitHub Repo\\will-grayson-ML\\two-class-labeled-data\\test\\"

# Will's path
#train_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\train\\"
#test_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\test\\"

# List all CSV files in the train folder
train_files = glob.glob(train_folder + "*.csv")
test_files = glob.glob(test_folder + "*.csv")

# Function to load and concatenate CSV files from a list of file paths
def load_and_concat(files, sample_fraction=None):
    df_list = []
    for file in files:
        df = pd.read_csv(file)
        if sample_fraction:
            df = df.sample(frac=sample_fraction, random_state=42)  # Random sampling
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

# Load and concatenate training and testing data
train_files = glob.glob(train_folder + "*.csv")
test_files = glob.glob(test_folder + "*.csv")
train_df = load_and_concat(train_files, sample_fraction=0.1)  # Use 10% of the data
test_df = load_and_concat(test_files, sample_fraction=0.1)

In [4]:
feature_columns = [col for col in train_df.columns if col not in ['benign', 'attack']]

# Separate features and labels
X_train = train_df[feature_columns]
y_train = train_df['attack']
X_test = test_df[feature_columns]
y_test = test_df['attack']

# Create a scaler object
scaler = MinMaxScaler()

# Fit the scaler to the training features and transform both training and testing features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled features to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_columns)

In [5]:
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled_df, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled_df)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  3294    467]
 [   417 157241]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      3761
           1       1.00      1.00      1.00    157658

    accuracy                           0.99    161419
   macro avg       0.94      0.94      0.94    161419
weighted avg       0.99      0.99      0.99    161419



In [7]:
from sklearn.model_selection import StratifiedKFold, cross_validate

stratified_k_fold = StratifiedKFold(n_splits=5)
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_results = cross_validate(model, X_train_scaled_df, y_train, cv=stratified_k_fold, scoring=scoring, n_jobs=-1)


# Print cross-validation scores
print("Cross-validation accuracy scores:", cv_results['test_accuracy'])
print("Mean cross-validation accuracy:", cv_results['test_accuracy'].mean())
print("Cross-validation precision scores:", cv_results['test_precision_macro'])
print("Mean cross-validation precision:", cv_results['test_precision_macro'].mean())
print("Cross-validation recall scores:", cv_results['test_recall_macro'])
print("Mean cross-validation recall:", cv_results['test_recall_macro'].mean())
print("Cross-validation F1 scores:", cv_results['test_f1_macro'])
print("Mean cross-validation F1:", cv_results['test_f1_macro'].mean())

Cross-validation accuracy scores: [0.87887611 0.99641104 0.99641802 0.99657862 0.99604796]
Mean cross-validation accuracy: 0.9728663496651935
Cross-validation precision scores: [0.58723155 0.98622447 0.99260293 0.98522441 0.97383299]
Mean cross-validation precision: 0.905023270444644
Cross-validation recall scores: [0.91392136 0.94416473 0.93838148 0.948427   0.94941546]
Mean cross-validation recall: 0.9388620041065895
Cross-validation F1 scores: [0.61538317 0.96418904 0.96382344 0.96605851 0.96128223]
Mean cross-validation F1: 0.8941472780377595
