In [1]:
# Step 1: Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

# Step 2: Load the dataset  # Replace with your CSV file path
data = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\Train_data.csv")
data2 = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\Test_data.csv")

# Step 3: Preprocess the data
# Drop non-numeric columns except 'class'
data_numeric = data.drop(columns=['protocol_type', 'service', 'flag'])
data_numeric_2 = data2.drop(columns=['protocol_type', 'service', 'flag'])

# Encode categorical columns if necessary, basically putting a number tag on text data in the cells
label_encoders = {}
for column in ['protocol_type', 'service', 'flag']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    data2[column] = le.fit_transform(data2[column])
    label_encoders[column] = le

# Concatenating the removed columns (ones who had text) and replacing them with numbers
data_preprocessed = pd.concat([data_numeric.drop(columns=['class']), data[['protocol_type', 'service', 'flag']]], axis=1)
data_preprocessed_2 = pd.concat([data_numeric_2, data2[['protocol_type', 'service', 'flag']]], axis=1)

# Extract the target variable
y = data['class'].map({'normal': 0, 'anomaly': 1})

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_preprocessed)
scaled_data2 = scaler.transform(data_preprocessed_2)  # Standardize the test data using the same scaler

# Step 4: Apply PCA for dimensionality reduction
pca = PCA(n_components=10)
principal_components = pca.fit_transform(scaled_data)
principal_components2 = pca.transform(scaled_data2)  # Apply PCA to the test data using the same PCA model

# Split the data into training and testing sets
X_train = principal_components
y_train = y
X_test = principal_components2

# Step 5: Train logistic regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Map numerical predictions back to class labels
y_pred_labels = pd.Series(y_pred).map({0: 'normal', 1: 'anomaly'})

# Add the predicted class labels to data2
data2['class'] = y_pred_labels

# Save the scaler, PCA, and logistic regression model to disk
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(log_reg, 'log_reg.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# Print out the first few rows of the updated data2 to see the results
print(data2.head())

# Save the updated data2 as a new CSV file
data2.to_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\logreg_test_data_result.csv", index=False)


   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0         0              1       45     1          0          0     0   
1         0              1       45     1          0          0     0   
2         2              1       19     9      12983          0     0   
3         0              0       13     9         20          0     0   
4         1              1       55     2          0         15     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  10   
1               0       0    0  ...                   1   
2               0       0    0  ...                  86   
3               0       0    0  ...                  57   
4               0       0    0  ...                  86   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.04                    0.06   
1                    0.00                    0.06   
2                    0.61                    0.

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib

# Step 2: Load the dataset  # Replace with your CSV file path
data = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\Train_data.csv")
data2 = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\Test_data.csv")

# Step 3: Preprocess the data
# Drop non-numeric columns except 'class'
data_numeric = data.drop(columns=['protocol_type', 'service', 'flag'])
data_numeric_2 = data2.drop(columns=['protocol_type', 'service', 'flag'])

# Encode categorical columns if necessary, basically putting a number tag on text data in the cells
label_encoders = {}
for column in ['protocol_type', 'service', 'flag']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    data2[column] = le.fit_transform(data2[column])
    label_encoders[column] = le

# Concatenate the removed columns (ones who had text) and replace them with numbers
data_preprocessed = pd.concat([data_numeric.drop(columns=['class']), data[['protocol_type', 'service', 'flag']]], axis=1)
data_preprocessed_2 = pd.concat([data_numeric_2, data2[['protocol_type', 'service', 'flag']]], axis=1)

# Extract the target variable
y = data['class'].map({'normal': 0, 'anomaly': 1})

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_preprocessed)
scaled_data2 = scaler.transform(data_preprocessed_2)

pca = PCA(n_components=10)
principal_components = pca.fit_transform(scaled_data)
principal_components2 = pca.transform(scaled_data2) 

X_train = principal_components
y_train = y
X_test = principal_components2

# Step 5: Train Decision Tree Classifier model
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predict on the test set
y_pred = decision_tree.predict(X_test)

# Map numerical predictions back to class labels
y_pred_labels = pd.Series(y_pred).map({0: 'normal', 1: 'anomaly'})

# Add the predicted class labels to data2
data2['class'] = y_pred_labels

# Save the scaler, PCA, and logistic regression model to disk
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(log_reg, 'log_reg.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# Print out the first few rows of the updated data2 to see the results
print(data2.head())

# Save the updated data2 as a new CSV file
data2.to_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\DecTree_test_data_result.csv", index=False)

   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0         0              1       45     1          0          0     0   
1         0              1       45     1          0          0     0   
2         2              1       19     9      12983          0     0   
3         0              0       13     9         20          0     0   
4         1              1       55     2          0         15     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  10   
1               0       0    0  ...                   1   
2               0       0    0  ...                  86   
3               0       0    0  ...                  57   
4               0       0    0  ...                  86   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.04                    0.06   
1                    0.00                    0.06   
2                    0.61                    0.

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib


# Step 2: Load the dataset  # Replace with your CSV file path
data = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\Train_data.csv")
data2 = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\Test_data.csv")

# Step 3: Preprocess the data
# Drop non-numeric columns except 'class'
data_numeric = data.drop(columns=['protocol_type', 'service', 'flag'])
data_numeric_2 = data2.drop(columns=['protocol_type', 'service', 'flag'])

# Encode categorical columns if necessary, basically putting a number tag on text data in the cells
label_encoders = {}
for column in ['protocol_type', 'service', 'flag']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    data2[column] = le.fit_transform(data2[column])
    label_encoders[column] = le

# Concatenate the removed columns (ones who had text) and replace them with numbers
data_preprocessed = pd.concat([data_numeric.drop(columns=['class']), data[['protocol_type', 'service', 'flag']]], axis=1)
data_preprocessed_2 = pd.concat([data_numeric_2, data2[['protocol_type', 'service', 'flag']]], axis=1)

# Extract the target variable
y = data['class'].map({'normal': 0, 'anomaly': 1})

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_preprocessed)
scaled_data2 = scaler.transform(data_preprocessed_2)

# Step 4: Apply PCA for dimensionality reduction
pca = PCA(n_components=10)
principal_components = pca.fit_transform(scaled_data)
principal_components2 = pca.transform(scaled_data2)

X_train = principal_components
y_train = y
X_test = principal_components2

# Step 5: Train K-Nearest Neighbors model
knn = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

y_pred_labels = pd.Series(y_pred).map({0: 'normal', 1: 'anomaly'})

# Add the predicted class labels to data2
data2['class'] = y_pred_labels

# Save the scaler, PCA, and logistic regression model to disk
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(log_reg, 'log_reg.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# Print out the first few rows of the updated data2 to see the results
print(data2.head())

# Save the updated data2 as a new CSV file
data2.to_csv(r"C:\Users\shubh\OneDrive\Desktop\SOC_cyberguard\KNeighbours_test_data_result.csv", index=False)

   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0         0              1       45     1          0          0     0   
1         0              1       45     1          0          0     0   
2         2              1       19     9      12983          0     0   
3         0              0       13     9         20          0     0   
4         1              1       55     2          0         15     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  10   
1               0       0    0  ...                   1   
2               0       0    0  ...                  86   
3               0       0    0  ...                  57   
4               0       0    0  ...                  86   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.04                    0.06   
1                    0.00                    0.06   
2                    0.61                    0.