In [54]:
import pandas as pd

# Reading Data

In [89]:
df = {
    "df_fw1": pd.read_csv("extracted_flow_features_fw1.csv"),
    "df_fw2": pd.read_csv("extracted_flow_features_fw2.csv"),
    "df_fw3": pd.read_csv("extracted_flow_features_fw3.csv"),
    "df_fw4": pd.read_csv("extracted_flow_features_fw4.csv")
}

# Features Analysis

In [90]:
for i in range(len(df.keys())):
    print("df_fw" + str(i))
    print(df[list(df.keys())[i]].select_dtypes(include="object").columns)

df_fw0
Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Protocol', 'label'], dtype='object')
df_fw1
Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Protocol', 'label'], dtype='object')
df_fw2
Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Protocol', 'label'], dtype='object')
df_fw3
Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Protocol', 'label'], dtype='object')


In [91]:
from sklearn.preprocessing import LabelEncoder

validColumn = {}
for i in range(len(df.keys())):
    # print("df_fw" + str(i))
    encoderTarget = LabelEncoder()
    testCorrDF = df[list(df.keys())[i]].copy()
    testCorrDF['label'] = encoderTarget.fit_transform(testCorrDF['label'])
    labelCorr = testCorrDF.select_dtypes(exclude="object").corr()['label']
    # print(labelCorr[labelCorr < 0.9])
    validColumn[f"{list(df.keys())[i]}"] = list(labelCorr[labelCorr < 0.9].index)

commonFeatures = set(validColumn["df_fw1"]).intersection(validColumn['df_fw2'], validColumn['df_fw3'], validColumn['df_fw4'])
print(f"Common Features: {commonFeatures}")

Common Features: {'Total Length', 'Total Packets', 'Flow IAT Max', 'RST Flag Count', 'Duration', 'Flow IAT Total', 'Packet Length Min', 'Flow IAT Min', 'Active Std', 'Flow IAT Std', 'Active Mean', 'Flow IAT Mean', 'Packet Length Max'}


In [92]:
commonFeatures = list(commonFeatures)
commonFeatures

['Total Length',
 'Total Packets',
 'Flow IAT Max',
 'RST Flag Count',
 'Duration',
 'Flow IAT Total',
 'Packet Length Min',
 'Flow IAT Min',
 'Active Std',
 'Flow IAT Std',
 'Active Mean',
 'Flow IAT Mean',
 'Packet Length Max']

# Features Engineering

In [88]:
# Within the same dataset, then divide into train and test
from research.Internal.CustomEncoder import CustomLabelEncoder
from sklearn.model_selection import train_test_split

input_data = df.drop(columns= ['label'])
output_data = df['label']

X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2, shuffle= True, random_state=42)


columns_toEncode = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Protocol', 'label']

for col in columns_toEncode:
    encoder = CustomLabelEncoder()
    if col in input_data.columns:
        X_train[col] = encoder.fit_transform(X_train[col])
        X_test[col] = encoder.transform(X_test[col])
    else:
        y_train = encoder.fit_transform(y_train)
        y_test = encoder.transform(y_test)

AttributeError: 'dict' object has no attribute 'drop'

In [93]:
# With two different dataset
from research.Internal.CustomEncoder import CustomLabelEncoder

# X_train = df["df_fw1"].drop(columns=['label'])
# X_test = df["df_fw3"].drop(columns=['label'])

X_train = df["df_fw1"].drop(columns=['label'])[commonFeatures]
X_test = df["df_fw3"].drop(columns=['label'])[commonFeatures]

y_train = df["df_fw1"]['label']
y_test = df["df_fw3"]['label']

# Labling Data

# columns_toEncode = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Protocol', 'label']
# XColumn = [col for col in columns_toEncode if col != 'label']
# yColumn = [col for col in columns_toEncode if col not in XColumn]

# for col in columns_toEncode:
#     encoder = CustomLabelEncoder()
#     if col in XColumn:
#         X_train[col] = encoder.fit_transform(X_train[col])
#         X_test[col] = encoder.transform(X_test[col])
#     if col in yColumn:
#         y_train = encoder.fit_transform(y_train)
#         y_test = encoder.transform(y_test)

encoder = CustomLabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [94]:
for i, value in enumerate(encoder.encoder.classes_):
    print(f"{i} : {value}")

0 : ATTACKER
1 : BENIGN


# Model

## Logistic Regression

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:", conf_matrix)
print("Classification Report:", report)

Confusion Matrix: [[33708     0]
 [    0  1659]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33708
           1       1.00      1.00      1.00      1659

    accuracy                           1.00     35367
   macro avg       1.00      1.00      1.00     35367
weighted avg       1.00      1.00      1.00     35367



## Random Forest

In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:", conf_matrix)
print("Classification Report:", report)

Confusion Matrix: [[   23 33685]
 [    0  1659]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      0.00      0.00     33708
           1       0.05      1.00      0.09      1659

    accuracy                           0.05     35367
   macro avg       0.52      0.50      0.05     35367
weighted avg       0.96      0.05      0.01     35367



## SVM

In [98]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

# Initialize the SVM model
svm_model = SVC(kernel='rbf', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:", conf_matrix)
print("Classification Report:", report)

Confusion Matrix: [[33708     0]
 [   51  1608]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33708
           1       1.00      0.97      0.98      1659

    accuracy                           1.00     35367
   macro avg       1.00      0.98      0.99     35367
weighted avg       1.00      1.00      1.00     35367

