In [1]:
import pandas as pd
from datetime import datetime
df = pd.read_csv('research_traffic_multiple_labels.csv')
df.shape

(60000, 12)

In [2]:

print(df.dtypes)
print(df['Label'].value_counts()[1])
print(df['Label'].value_counts()[0])
df.head()

Count of Source IP        int64
Port Count                int64
Pair Count Ratio        float64
Packet Count Diff         int64
Lookup Count Diff         int64
Protocol                  int64
Average Packet Count    float64
Average Byte Count      float64
Packet Std Dev          float64
Byte Std Dev            float64
Duration per Flow       float64
Label                     int64
dtype: object
10000
10000


Unnamed: 0,Count of Source IP,Port Count,Pair Count Ratio,Packet Count Diff,Lookup Count Diff,Protocol,Average Packet Count,Average Byte Count,Packet Std Dev,Byte Std Dev,Duration per Flow,Label
0,545,545,0.0,2,43,6,0.027523,4.788991,0.163601,28.466647,0.66802,3
1,545,545,0.0,10,0,6,0.009174,1.59633,0.095342,16.589551,0.419081,3
2,52,52,0.0,3,0,6,0.0,0.0,0.0,0.0,0.018792,3
3,545,545,0.0,13,21,6,0.023853,4.150459,0.355016,61.772825,2.946281,3
4,545,545,0.0,2,131,6,0.027523,4.788991,0.163601,28.466647,2.674461,3


In [3]:
# # obj_cols=df.dtypes[df.dtypes == "object"].index.values.tolist()
# # print(obj_cols)

# from sklearn.preprocessing import LabelEncoder
# #Encode labels of multiple columns at once

# df[obj_cols] = df[obj_cols].astype(str)
# df[obj_cols] = df[obj_cols].apply(LabelEncoder().fit_transform)
# #
# # Print head
# #
# print(df.dtypes)
# df.head()

In [4]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score, make_scorer
from sklearn.metrics import f1_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, cross_validate

rnd = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')

y = df.Label
X = df.drop(['Label'],axis=1)

# Normalize the dataset using MinMaxScaler
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X)
X = pd.DataFrame(X_normalized, columns=X.columns)
print("Dataset normalized using MinMaxScaler.")

# # Scale the dataset using StandardScaler
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# X = pd.DataFrame(X_scaled, columns=X.columns)

X_train_cv, X_unseen_test, y_train_cv, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

Dataset normalized using MinMaxScaler.


In [5]:
# Initialize lists to store metrics
train_accuracies = []
test_accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []
start = datetime.now()

for train_index, test_index in kf.split(X_train_cv):
    X_train, X_test = X_train_cv.iloc[train_index], X_train_cv.iloc[test_index]
    y_train, y_test = y_train_cv.iloc[train_index], y_train_cv.iloc[test_index]
    
    # Train the RF classifier
    rnd.fit(X_train, y_train)
    
    # Make predictions on the test fold
    y_pred_test = rnd.predict(X_test)
    y_pred_train = rnd.predict(X_train)
    
    # Calculate metrics for the test fold
    train_accuracies.append(accuracy_score(y_train, y_pred_train))
    test_accuracies.append(accuracy_score(y_test, y_pred_test))
    precisions.append(precision_score(y_test, y_pred_test, average='weighted'))
    recalls.append(recall_score(y_test, y_pred_test, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred_test, average='weighted'))
    confusion_matrices.append(confusion_matrix(y_test, y_pred_test))

# Test the final model on unseen data
y_unseen_pred = rnd.predict(X_unseen_test)
unseen_accuracy = accuracy_score(y_unseen_test, y_unseen_pred)
unseen_precision = precision_score(y_unseen_test, y_unseen_pred, average='weighted')
unseen_recall = recall_score(y_unseen_test, y_unseen_pred, average='weighted')
unseen_f1 = f1_score(y_unseen_test, y_unseen_pred, average='weighted')
unseen_conf_matrix = confusion_matrix(y_unseen_test, y_unseen_pred)
end = datetime.now()
print("excution time: ", end-start)

excution time:  0:01:05.902878


In [6]:
# Calculate the average confusion matrix for multi-class classification
import numpy as np

# Determine the number of classes from the confusion matrices
num_classes = confusion_matrices[0].shape[0]

# Initialize an array to store the sum of confusion matrices
confusion_matrix_sum = np.zeros((num_classes, num_classes))

# Sum up the confusion matrices from all folds
for cm in confusion_matrices:
    confusion_matrix_sum += cm

# Compute the average confusion matrix
average_conf_matrix = confusion_matrix_sum / len(confusion_matrices)

# Display the average confusion matrix
print("Average Confusion Matrix:")
max_width = max(len("{:.2f}".format(value)) for row in average_conf_matrix for value in row)

# Print each row with formatted values
for row in average_conf_matrix:
    print(" ".join(f"{value:>{max_width}.2f}" for value in row))

Average Confusion Matrix:
612.30  34.40   0.20  39.60   0.30 112.30
  6.40 772.50   0.20   0.00   0.20  15.10
  1.50   0.40 797.50   0.00   0.00   0.10
  0.10   0.00   0.00 799.30   0.00   0.20
  1.60   0.00   0.00   0.00 801.90   0.00
 52.70   2.30   0.30   0.00   0.00 748.60


In [7]:
# Print the metrics for each fold
for i in range(10):
    print(f"Fold {i+1}:")
    print(f"Training Accuracy: {train_accuracies[i]}")
    print(f"Testing Accuracy: {test_accuracies[i]}")
    print(f"Precision: {precisions[i]}")
    print(f"Recall: {recalls[i]}")
    print(f"F1-Score: {f1_scores[i]}")
    print(f"Confusion Matrix:\n{confusion_matrices[i]}\n")

print(f"Average Training Accuracy: {sum(train_accuracies) / 10}")
print(f"Average Testing Accuracy: {sum(test_accuracies) / 10}")
print(f"Average Precision: {sum(precisions) / 10}")
print(f"Average recall: {sum(recalls) / 10}")
print(f"Average F1 Score: {sum(f1_scores) / 10}")

# Print the performance on the unseen data
print("Performance on Unseen Data:")
print(f"Accuracy: {unseen_accuracy}")
print(f"Precision: {unseen_precision}")
print(f"Recall: {unseen_recall}")
print(f"F1-Score: {unseen_f1}")
print(f"Confusion Matrix:\n{unseen_conf_matrix}")
print(sum(train_accuracies) / 10)
print(sum(test_accuracies) / 10)

Fold 1:
Training Accuracy: 0.9789351851851852
Testing Accuracy: 0.9454166666666667
Precision: 0.945137177409953
Recall: 0.9454166666666667
F1-Score: 0.9441209237334859
Confusion Matrix:
[[579  42   0  31   0 109]
 [  7 802   0   0   0  11]
 [  1   0 772   0   0   0]
 [  0   0   0 794   0   0]
 [  3   0   0   0 804   0]
 [ 57   1   0   0   0 787]]

Fold 2:
Training Accuracy: 0.9767361111111111
Testing Accuracy: 0.9395833333333333
Precision: 0.9411221351635686
Recall: 0.9395833333333333
F1-Score: 0.938330199799719
Confusion Matrix:
[[624  30   0  45   1 129]
 [  8 746   1   0   0  22]
 [  0   1 811   0   0   0]
 [  0   0   0 814   0   0]
 [  4   0   0   0 781   0]
 [ 46   3   0   0   0 734]]

Fold 3:
Training Accuracy: 0.9778009259259259
Testing Accuracy: 0.944375
Precision: 0.9453363434092145
Recall: 0.944375
F1-Score: 0.9432084037429619
Confusion Matrix:
[[602  31   0  37   2 118]
 [  6 822   0   0   0  19]
 [  2   1 771   0   0   0]
 [  1   0   0 823   0   1]
 [  1   0   0   0 768   0

In [8]:
# Calculate Weighted Feature Importance (WFI)
feature_importances = rnd.feature_importances_
wfi = feature_importances / feature_importances.sum()
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': wfi
}).sort_values(by='Importance', ascending=False)

print("\nWeighted Feature Importance (WFI):")
print(feature_importance_df)


Weighted Feature Importance (WFI):
                 Feature  Importance
0     Count of Source IP    0.275738
5               Protocol    0.259403
1             Port Count    0.173852
6   Average Packet Count    0.082110
7     Average Byte Count    0.059864
10     Duration per Flow    0.042722
3      Packet Count Diff    0.040246
4      Lookup Count Diff    0.029854
8         Packet Std Dev    0.022699
2       Pair Count Ratio    0.007773
9           Byte Std Dev    0.005739


In [10]:
# import pickle
# with open('rf_multi.pkl', 'wb') as file:
#     pickle.dump(rnd, file)