In [2]:
import pandas as pd
from datetime import datetime
df = pd.read_csv('research_traffic.csv')
df.shape

(140000, 12)

In [3]:
print(df.dtypes)
print(df['Label'].value_counts()[1])
print(df['Label'].value_counts()[0])
df.head()

Count_of_Source_IP        int64
Port_Count                int64
Pair_Count_Ratio        float64
Packet_Count_Diff         int64
Lookup_Count_Diff         int64
Protocol                  int64
Average_Packet_Count    float64
Average_Byte_Count      float64
Packet_Std_Dev          float64
Byte_Std_Dev            float64
Duration_per_Flow       float64
Label                     int64
dtype: object
70000
70000


Unnamed: 0,Count_of_Source_IP,Port_Count,Pair_Count_Ratio,Packet_Count_Diff,Lookup_Count_Diff,Protocol,Average_Packet_Count,Average_Byte_Count,Packet_Std_Dev,Byte_Std_Dev,Duration_per_Flow,Label
0,2,2,1.0,436913,1,6,145637.6667,3567915000.0,24719.5,5339086000.0,1.258,0
1,2,2,1.0,436912,1,6,145637.3333,3567915000.0,24721.0,5339086000.0,1.254,0
2,2,2,1.0,1006277,1,6,335425.6667,8291520000.0,61289.5,12408120000.0,2.587333,0
3,2,2,0.0,1,1006280,6,335426.0,8291520000.0,61288.0,12408120000.0,2.592,0
4,2,2,1.0,1605562,6,6,535187.3333,13215030000.0,97006.0,19775970000.0,3.926,0


In [4]:
# obj_cols=df_dt.dtypes[df_dt.dtypes == "object"].index.values.tolist()
# print(obj_cols)

# from sklearn.preprocessing import LabelEncoder
# #Encode labels of multiple columns at once

# df_dt[obj_cols] = df_dt[obj_cols].astype(str)
# df_dt[obj_cols] = df_dt[obj_cols].apply(LabelEncoder().fit_transform)
# #
# # Print head
# #
# print(df_dt.dtypes)
# df.head()

In [5]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score, make_scorer
from sklearn.metrics import f1_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, cross_validate

dt = DecisionTreeClassifier(criterion='entropy', random_state=42)

y = df.Label
X = df.drop(['Label'],axis=1)

# Normalize the dataset using MinMaxScaler
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X)
X = pd.DataFrame(X_normalized, columns=X.columns)
print("Dataset normalized using MinMaxScaler.")

# # Scale the dataset using StandardScaler
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# X = pd.DataFrame(X_scaled, columns=X.columns)

X_train_cv, X_unseen_test, y_train_cv, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

Dataset normalized using MinMaxScaler.


In [6]:
# Initialize lists to store metrics
train_accuracies = []
test_accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []
start = datetime.now()

for train_index, test_index in kf.split(X_train_cv):
    X_train, X_test = X_train_cv.iloc[train_index], X_train_cv.iloc[test_index]
    y_train, y_test = y_train_cv.iloc[train_index], y_train_cv.iloc[test_index]
    
    # Train the dt classifier
    dt.fit(X_train, y_train)
    
    # Make predictions on the test fold
    y_pred_test = dt.predict(X_test)
    y_pred_train = dt.predict(X_train)
    
    # Calculate metrics for the test fold
    train_accuracies.append(accuracy_score(y_train, y_pred_train))
    test_accuracies.append(accuracy_score(y_test, y_pred_test))
    precisions.append(precision_score(y_test, y_pred_test))
    recalls.append(recall_score(y_test, y_pred_test))
    f1_scores.append(f1_score(y_test, y_pred_test))
    confusion_matrices.append(confusion_matrix(y_test, y_pred_test))

# Test the final model on unseen data
y_unseen_pred = dt.predict(X_unseen_test)
unseen_accuracy = accuracy_score(y_unseen_test, y_unseen_pred)
unseen_precision = precision_score(y_unseen_test, y_unseen_pred)
unseen_recall = recall_score(y_unseen_test, y_unseen_pred)
unseen_f1 = f1_score(y_unseen_test, y_unseen_pred)
unseen_conf_matrix = confusion_matrix(y_unseen_test, y_unseen_pred)
end = datetime.now()

In [7]:
# Calculate the average confusion matrix
import numpy as np

# Initialize an array to store the sum of confusion matrices
confusion_matrix_sum = np.zeros((2, 2))  # Adjust the size if multi-class classification

# During the cross-validation loop, confusion matrices were already calculated
# Here, we'll sum up those matrices (assumes `confusion_matrices` contains all fold matrices)
for cm in confusion_matrices:
    confusion_matrix_sum += cm

# Compute the average confusion matrix
average_conf_matrix = confusion_matrix_sum / 10

# Display the average confusion matrix
print("Average Confusion Matrix:")
print(average_conf_matrix)

Average Confusion Matrix:
[[5491.6  122.3]
 [ 122.3 5463.8]]


In [8]:
# Print the metrics for each fold
for i in range(10):
    print(f"Fold {i+1}:")
    print(f"Training Accuracy: {train_accuracies[i]}")
    print(f"Testing Accuracy: {test_accuracies[i]}")
    print(f"Precision: {precisions[i]}")
    print(f"Recall: {recalls[i]}")
    print(f"F1-Score: {f1_scores[i]}")
    print(f"Confusion Matrix:\n{confusion_matrices[i]}\n")

print(f"Average Training Accuracy: {sum(train_accuracies) / 10}")
print(f"Average Testing Accuracy: {sum(test_accuracies) / 10}")
print(f"Average Precision: {sum(precisions) / 10}")
print(f"Average recall: {sum(recalls) / 10}")
print(f"Average F1 Score: {sum(f1_scores) / 10}")

# Print the performance on the unseen data
print("Performance on Unseen Data:")
print(f"Accuracy: {unseen_accuracy}")
print(f"Precision: {unseen_precision}")
print(f"Recall: {unseen_recall}")
print(f"F1-Score: {unseen_f1}")
print(f"Confusion Matrix:\n{unseen_conf_matrix}")
print("excution time: ", end-start)

Fold 1:
Training Accuracy: 0.9999603174603174
Testing Accuracy: 0.9775892857142857
Precision: 0.9778413152251608
Recall: 0.9773173781032327
F1-Score: 0.9775792764627066
Confusion Matrix:
[[5477  124]
 [ 127 5472]]

Fold 2:
Training Accuracy: 0.9999702380952381
Testing Accuracy: 0.9783035714285714
Precision: 0.9784676116992643
Recall: 0.9779411764705882
F1-Score: 0.9782043232576912
Confusion Matrix:
[[5504  120]
 [ 123 5453]]

Fold 3:
Training Accuracy: 0.9999603174603174
Testing Accuracy: 0.9783928571428572
Precision: 0.9795918367346939
Recall: 0.9775101823977334
F1-Score: 0.9785499024995569
Confusion Matrix:
[[5438  115]
 [ 127 5520]]

Fold 4:
Training Accuracy: 0.9999404761904762
Testing Accuracy: 0.9789285714285715
Precision: 0.9793478260869565
Recall: 0.9779305354558611
F1-Score: 0.9786386676321506
Confusion Matrix:
[[5558  114]
 [ 122 5406]]

Fold 5:
Training Accuracy: 0.9999404761904762
Testing Accuracy: 0.9802678571428571
Precision: 0.9796682718031032
Recall: 0.9808928571428571


In [9]:
# Calculate Weighted Feature Importance (WFI)
feature_importances = dt.feature_importances_
wfi = feature_importances / feature_importances.sum()
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': wfi
}).sort_values(by='Importance', ascending=False)

print("\nWeighted Feature Importance (WFI):")
print(feature_importance_df)


Weighted Feature Importance (WFI):
                 Feature  Importance
0     Count_of_Source_IP    0.863198
10     Duration_per_Flow    0.060893
4      Lookup_Count_Diff    0.017235
9           Byte_Std_Dev    0.012269
6   Average_Packet_Count    0.010279
7     Average_Byte_Count    0.009389
1             Port_Count    0.008984
3      Packet_Count_Diff    0.008755
2       Pair_Count_Ratio    0.003760
8         Packet_Std_Dev    0.003704
5               Protocol    0.001535


In [10]:
# import pickle
# with open('dt_binary.pkl', 'wb') as file:
#     pickle.dump(dt, file)
