In [1]:
import pandas as pd

# Read the traffic CSV into a DataFrame; the trailing expression displays
# its (rows, columns) shape as the cell output.
DATA_FILE = 'research_traffic.csv'
df = pd.read_csv(DATA_FILE)
df.shape

(140000, 12)

In [2]:
# Tally the Label column once and reuse the result for both lookups.
label_counts = df['Label'].value_counts()

print(df.dtypes)
# The index of `label_counts` holds the label values themselves, so .loc
# looks each class up by label (1 first, then 0).
print(label_counts.loc[1])
print(label_counts.loc[0])
df.head()

Count_of_Source_IP        int64
Port_Count                int64
Pair_Count_Ratio        float64
Packet_Count_Diff         int64
Lookup_Count_Diff         int64
Protocol                  int64
Average_Packet_Count    float64
Average_Byte_Count      float64
Packet_Std_Dev          float64
Byte_Std_Dev            float64
Duration_per_Flow       float64
Label                     int64
dtype: object
70000
70000


Unnamed: 0,Count_of_Source_IP,Port_Count,Pair_Count_Ratio,Packet_Count_Diff,Lookup_Count_Diff,Protocol,Average_Packet_Count,Average_Byte_Count,Packet_Std_Dev,Byte_Std_Dev,Duration_per_Flow,Label
0,2,2,1.0,436913,1,6,145637.6667,3567915000.0,24719.5,5339086000.0,1.258,0
1,2,2,1.0,436912,1,6,145637.3333,3567915000.0,24721.0,5339086000.0,1.254,0
2,2,2,1.0,1006277,1,6,335425.6667,8291520000.0,61289.5,12408120000.0,2.587333,0
3,2,2,0.0,1,1006280,6,335426.0,8291520000.0,61288.0,12408120000.0,2.592,0
4,2,2,1.0,1605562,6,6,535187.3333,13215030000.0,97006.0,19775970000.0,3.926,0


In [3]:
# obj_cols=df.dtypes[df.dtypes == "object"].index.values.tolist()
# print(obj_cols)

# from sklearn.preprocessing import LabelEncoder
# #Encode labels of multiple columns at once

# df[obj_cols] = df[obj_cols].astype(str)
# df[obj_cols] = df[obj_cols].apply(LabelEncoder().fit_transform)
# #
# # Print head
# #
# print(df.dtypes)
# df.head()

In [4]:
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Separate the target column from the feature columns.
y = df.Label
X_raw = df.drop(['Label'], axis=1)

LR = LogisticRegression(solver='liblinear')

# Hold out 20% of the rows BEFORE fitting any preprocessing. Fitting the
# scaler on the full dataset would let min/max statistics of the unseen rows
# leak into the training features and bias the hold-out evaluation.
X_train_raw, X_unseen_raw, y_train_cv, y_unseen_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Normalize the dataset using MinMaxScaler, learning the per-column min/max
# from the training partition only.
normalizer = MinMaxScaler()
normalizer.fit(X_train_raw)


def _normalize(frame):
    """Scale `frame` with the training-fitted normalizer, keeping its columns and index."""
    return pd.DataFrame(
        normalizer.transform(frame), columns=frame.columns, index=frame.index
    )


X_train_cv = _normalize(X_train_raw)
# Held-out values may fall slightly outside [0, 1] because the scaler has
# never seen these rows; that is expected.
X_unseen_test = _normalize(X_unseen_raw)
# Keep `X` as the normalized full feature frame, as before, but scaled with
# the training-only statistics.
X = _normalize(X_raw)
print("Dataset normalized using MinMaxScaler.")

kf = KFold(n_splits=10, random_state=42, shuffle=True)

Dataset normalized using MinMaxScaler.


In [5]:
# Initialize lists to store the per-fold metrics
train_accuracies = []
test_accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []
start = datetime.now()

for train_index, test_index in kf.split(X_train_cv):
    # kf.split yields positional indices, hence .iloc
    X_train, X_test = X_train_cv.iloc[train_index], X_train_cv.iloc[test_index]
    y_train, y_test = y_train_cv.iloc[train_index], y_train_cv.iloc[test_index]

    # Train the logistic regression classifier on this fold's training split
    LR.fit(X_train, y_train)

    # Make predictions on the validation fold and on the fold's training split
    y_pred_test = LR.predict(X_test)
    y_pred_train = LR.predict(X_train)

    # Calculate metrics for this fold
    train_accuracies.append(accuracy_score(y_train, y_pred_train))
    test_accuracies.append(accuracy_score(y_test, y_pred_test))
    precisions.append(precision_score(y_test, y_pred_test))
    recalls.append(recall_score(y_test, y_pred_test))
    f1_scores.append(f1_score(y_test, y_pred_test))
    confusion_matrices.append(confusion_matrix(y_test, y_pred_test))

# Refit on the full training partition before the hold-out evaluation.
# Without this, `LR` is whatever the last CV fold left behind, i.e. a model
# trained on only 9 of the 10 folds of the training partition.
LR.fit(X_train_cv, y_train_cv)

# Test the final model on unseen data
y_unseen_pred = LR.predict(X_unseen_test)
unseen_accuracy = accuracy_score(y_unseen_test, y_unseen_pred)
unseen_precision = precision_score(y_unseen_test, y_unseen_pred)
unseen_recall = recall_score(y_unseen_test, y_unseen_pred)
unseen_f1 = f1_score(y_unseen_test, y_unseen_pred)
unseen_conf_matrix = confusion_matrix(y_unseen_test, y_unseen_pred)
# The elapsed time covers cross-validation, the final refit and the hold-out scoring.
end = datetime.now()

In [6]:
# Calculate the average confusion matrix
import numpy as np

# `confusion_matrices` holds one matrix per cross-validation fold, collected
# during the cross-validation loop. Sum them element-wise; the shape follows
# the fold matrices themselves, so nothing is hard-coded to the binary
# (2, 2) case.
confusion_matrix_sum = np.sum(confusion_matrices, axis=0, dtype=float)

# Compute the average confusion matrix. The divisor is the number of matrices
# actually collected, so it stays correct if the fold count is changed.
fold_count = len(confusion_matrices)
average_conf_matrix = confusion_matrix_sum / fold_count

# Display the average confusion matrix
print("Average Confusion Matrix:")
print(average_conf_matrix)

Average Confusion Matrix:
[[5477.6  136.3]
 [ 131.3 5454.8]]


In [7]:
# Number of folds actually evaluated. Derived from the collected metrics so
# the report stays correct if the KFold `n_splits` setting is changed.
n_folds = len(test_accuracies)

# Print the metrics for each fold
for i in range(n_folds):
    print(f"Fold {i+1}:")
    print(f"Training Accuracy: {train_accuracies[i]}")
    print(f"Testing Accuracy: {test_accuracies[i]}")
    print(f"Precision: {precisions[i]}")
    print(f"Recall: {recalls[i]}")
    print(f"F1-Score: {f1_scores[i]}")
    print(f"Confusion Matrix:\n{confusion_matrices[i]}\n")

# Averages across folds
print(f"Average Training Accuracy: {sum(train_accuracies) / n_folds}")
print(f"Average Testing Accuracy: {sum(test_accuracies) / n_folds}")
print(f"Average Precision: {sum(precisions) / n_folds}")
print(f"Average recall: {sum(recalls) / n_folds}")
print(f"Average F1 Score: {sum(f1_scores) / n_folds}")

# Print the performance on the unseen data
print("Performance on Unseen Data:")
print(f"Accuracy: {unseen_accuracy}")
print(f"Precision: {unseen_precision}")
print(f"Recall: {unseen_recall}")
print(f"F1-Score: {unseen_f1}")
print(f"Confusion Matrix:\n{unseen_conf_matrix}")
# `end - start` is the timedelta measured around the training/evaluation cell.
print("excution time: ", end-start)

Fold 1:
Training Accuracy: 0.9758829365079366
Testing Accuracy: 0.9782142857142857
Precision: 0.9771876670825165
Recall: 0.9792820146454724
F1-Score: 0.9782337198929527
Confusion Matrix:
[[5473  128]
 [ 116 5483]]

Fold 2:
Training Accuracy: 0.9763392857142857
Testing Accuracy: 0.9750892857142858
Precision: 0.9743865305391367
Recall: 0.975609756097561
F1-Score: 0.9749977596558831
Confusion Matrix:
[[5481  143]
 [ 136 5440]]

Fold 3:
Training Accuracy: 0.9760218253968254
Testing Accuracy: 0.9766964285714286
Precision: 0.9752912107306742
Recall: 0.978572693465557
F1-Score: 0.9769291964996022
Confusion Matrix:
[[5413  140]
 [ 121 5526]]

Fold 4:
Training Accuracy: 0.9759523809523809
Testing Accuracy: 0.976875
Precision: 0.9764876107795262
Recall: 0.9766642547033285
F1-Score: 0.9765759247535498
Confusion Matrix:
[[5542  130]
 [ 129 5399]]

Fold 5:
Training Accuracy: 0.9758730158730159
Testing Accuracy: 0.9777678571428572
Precision: 0.9787081767758097
Recall: 0.9767857142857143
F1-Score: 0.

In [8]:
# import pickle
# with open('lr_binary.pkl', 'wb') as file:
#     pickle.dump(LR, file)