In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load your dataset
df = pd.read_csv('Darknet.csv')

In [2]:
# Encode the labels to binary format
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'].replace(['Non-Tor', 'NonVPN'], 'Normal').replace(['Tor', 'VPN'], 'Malicious'))

In [3]:
import numpy as np
# Select relevant features
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
X = df[numeric_features].drop(['Src Port', 'Dst Port', 'Protocol'], axis=1)
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.mean(), inplace=True)

In [4]:
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, df['Label'], test_size=0.3, random_state=42, stratify=df['Label'])

In [5]:
# Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [6]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping])



Epoch 1/100


2023-12-15 07:37:45.652049: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


<keras.callbacks.History at 0x29cbb5dc0>

In [7]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

   1/1327 [..............................] - ETA: 20s - loss: 5.5597e-21 - accuracy: 1.0000

Test accuracy: 0.9999764561653137


In [8]:
#test with holdout data
df = pd.read_csv('holdout_set_mixed.csv')
df['Label'] = le.transform(df['Label'].replace(['Non-Tor', 'NonVPN'], 'Normal').replace(['Tor', 'VPN'], 'Malicious'))
X = df[numeric_features].drop(['Src Port', 'Dst Port', 'Protocol'], axis=1)
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.mean(), inplace=True)
X_scaled = scaler.transform(X)
y = df['Label']
loss, accuracy = model.evaluate(X_scaled, y)
print(f'Holdout accuracy: {accuracy}')

Holdout accuracy: 1.0


In [9]:
# Optionally save the model
model.save('darknet_model.h5')