# Import and Load Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Google Drive download links for the two ';'-separated CSV files.
file_path = 'https://drive.google.com/uc?id=1w69PKv1mSGzy70yYBQ2W0nBwtWq-bKzh'
file_path_pred = 'https://drive.google.com/uc?id=1pbpfNmlJjn3f7iG9IZAnu9U0yv-y1Aa_'

# Labelled dataset (contains the 'targetnya' column).
data_with_target = pd.read_csv(file_path, sep=";")

# Unlabelled dataset to predict on; its 'id' column is discarded right away.
data_to_predict = pd.read_csv(file_path_pred, sep=";").drop(columns=['id'])

# Exploratory Data Analysis

## Null Values in Dataset

In [None]:
# Bare expression: the notebook renders the labelled DataFrame as the cell output.
data_with_target

In [None]:
# Number of missing values per column, largest count first.
null_counts = data_with_target.isnull().sum().sort_values(ascending=False)
labels = null_counts.index.to_list()
values = null_counts.to_list()

# One bar per column, annotated with its count.
plt.figure(figsize=(15, 8))
container = plt.bar(labels, values)
plt.bar_label(container)
plt.title("Count Null Values in Dataset")
plt.ylabel("Number of Occurences")
plt.xticks(np.arange(len(labels)), labels, rotation='vertical')
plt.show()

### Solution
Handle the missing values before using the dataset.

## Heat Map / Correlation Map

In [None]:
# Work on a copy so the label encoding below never touches the loaded data.
# The cell previously concatenated `valid_data` and `invalid_data`, two names
# that are never defined in this notebook, and therefore raised NameError.
data_corr = data_with_target.copy()

# Label-encode every column so non-numeric columns can take part in the
# correlation as well.
# NOTE(review): at this point the columns still hold NaN and the '*' / 99999
# placeholders; confirm LabelEncoder accepts them on the installed
# scikit-learn version.
for column in data_corr.columns:
    le = LabelEncoder()
    data_corr[column] = le.fit_transform(data_corr[column])

# Pairwise correlation matrix, rounded to keep the annotations readable.
data_corr = data_corr.corr().round(3)

# Annotated heat map of the correlation matrix.
plt.figure(figsize=(35, 25))
plt.title("Heat Map Correlation", fontsize=25, pad=20)
sns.heatmap(data_corr, annot=True, cmap="RdYlGn")
plt.show()

In [None]:
# Correlation of every feature with the target column, highest first.
# The target's own row is removed and undefined correlations count as 0.
tmp = (
    data_corr.drop(["targetnya"])["targetnya"]
    .fillna(0)
    .sort_values(ascending=False)
)

# Split into non-negative and negative values so each group gets its own colour.
labels = tmp.index.to_list()
values_pos = tmp[tmp >= 0].to_list()
values_neg = tmp[tmp < 0].to_list()
n_pos = len(values_pos)

plt.figure(figsize=(25, 10))
container = plt.bar(labels[:n_pos], values_pos, color="green")
container2 = plt.bar(labels[n_pos:], values_neg, color="red")
for bars in (container, container2):
    plt.bar_label(bars)
plt.xticks(np.arange(len(labels)), labels, rotation='vertical')
plt.ylabel('Correlation to Target Column')
plt.title("Data Column Corellation to Target Column (targetnya)")
plt.show()

### Solution
There are enough columns with a sufficiently strong correlation to the target column to make a Feature Selection scenario worthwhile.

## Target Column

In [None]:
# Occurrences of every class in the target column.
class_counts = data_with_target['targetnya'].value_counts()
labels = class_counts.index.to_list()
values = class_counts.to_list()

plt.figure(figsize=(15, 8))

# Left panel: absolute number of rows per class.
plt.subplot(1, 2, 1)
container = plt.bar(labels, values)
plt.bar_label(container)
plt.xticks(np.arange(len(labels)), labels, rotation=90)
plt.ylabel("Number of Occurences")
plt.title("Number of Data")

# Right panel: share of every class as a pie chart.
plt.subplot(1, 2, 2)
plt.pie(values, labels=labels, autopct='%.1f%%', center=(0, 0))
plt.title("Proportion of Data", pad=40)

plt.suptitle("Distribution of Target Data")
plt.show()

### Solution
Because the target classes are imbalanced, we can either oversample or undersample the dataset.

# Base

In [None]:
# Reload both CSV files so this scenario starts from unmodified data.
file_path = 'https://drive.google.com/uc?id=1w69PKv1mSGzy70yYBQ2W0nBwtWq-bKzh'
file_path_pred = 'https://drive.google.com/uc?id=1pbpfNmlJjn3f7iG9IZAnu9U0yv-y1Aa_'

# Labelled dataset (contains the 'targetnya' column).
data_with_target = pd.read_csv(file_path, sep=";")

# Unlabelled dataset to predict on, without its 'id' column.
data_to_predict = pd.read_csv(file_path_pred, sep=";").drop(columns=['id'])

In [None]:
# Data preprocessing for the base scenario.
# Drop exact duplicate rows, then turn every placeholder spelling into NaN.
data_with_target.drop_duplicates(inplace=True)
for placeholder in ('*', "99999", 99999, 99999.00):
    data_with_target.replace(placeholder, np.nan, inplace=True)

# Fill each NaN with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(data_with_target)
data_with_target = pd.DataFrame(imputed_values, columns=data_with_target.columns)

# Integer-encode the categorical columns; each encoder is fitted on the
# labelled data and reused for the data to predict.
categorical_columns = ['protocol_type', 'service', 'flag']
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, le in label_encoders.items():
    data_with_target[column] = le.fit_transform(data_with_target[column])
    data_to_predict[column] = le.transform(data_to_predict[column])

In [None]:
# Separate the target from the features, then hold out 20% of the rows
# for testing (fixed seed so the split is reproducible).
y = data_with_target['targetnya']
X = data_with_target.drop(columns=['targetnya'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Decision Tree

In [None]:
# Fit a decision tree on the training split (fit() returns the estimator itself).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_model

In [None]:
# Predictions of the decision tree for the held-out rows.
y_pred = dt_model.predict(X_test)

# Support-weighted precision, recall and F1, computed in that order.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Per-class breakdown.
print(classification_report(y_test, y_pred))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest on the training split (fit() returns the estimator itself).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_model

In [None]:
# Predict on the testing set with the random forest fitted above;
# the result overwrites the previous model's y_pred.
y_pred = rf_model.predict(X_test)

In [None]:
# Support-weighted precision, recall and F1 of the random forest predictions.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Print the per-class precision / recall / F1 table for the current y_pred.
print(classification_report(y_test, y_pred))

## Neural Network

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# # Standardize numerical features
# scaler = StandardScaler()
# data_with_target[data_with_target.columns[:-1]] = scaler.fit_transform(data_with_target[data_with_target.columns[:-1]])
# data_to_predict[data_to_predict.columns] = scaler.transform(data_to_predict[data_to_predict.columns])

In [None]:
# Convert the data into the types Keras expects.
# Targets: string labels -> integer class indices (encoder fitted on the
# training labels only and reused for the test labels).
label_encoder = LabelEncoder()
y_train_tf = label_encoder.fit_transform(y_train)
y_test_tf = label_encoder.transform(y_test)

# Features: float arrays. `np.float` was only an alias of the builtin float;
# it is deprecated since NumPy 1.20 and absent from newer releases, so the
# explicit np.float64 dtype is used instead (same resulting dtype).
X_train_tf = np.asarray(X_train).astype(np.float64)
X_test_tf = np.asarray(X_test).astype(np.float64)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_train_tf = np.asarray(X_train).astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_test_tf = np.asarray(X_test).astype(np.float)


In [None]:
# Layer sizes: one input per feature column, one softmax unit per distinct
# value of the target column.
n_features = X_train_tf.shape[1]
n_classes = len(data_with_target["targetnya"].unique())

# Small fully connected network: 64 -> 16 -> n_classes.
model = keras.Sequential([
    layers.Input(shape=(n_features,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(n_classes, activation='softmax'),
])

# Integer class targets, hence the sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs; the test split is passed as validation data.
model.fit(
    X_train_tf,
    y_train_tf,
    validation_data=(X_test_tf, y_test_tf),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78b4ec8dcd00>

In [None]:
# Class probabilities for every row of the testing set.
y_prob = model.predict(X_test_tf)

# Index of the most probable class per row (vectorised instead of a Python
# loop), mapped back to the original string labels.
y_pred = label_encoder.inverse_transform(np.argmax(y_prob, axis=1))

# Decode the true labels into their own name. The cell used to overwrite
# `y_test_tf` with the decoded strings, so running it a second time failed
# because the encoded targets were gone.
y_true = label_encoder.inverse_transform(y_test_tf)

# Support-weighted precision, recall and F1.
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Per-class breakdown.
print(classification_report(y_true, y_pred))

## Ensemble Learning (Adaboost+RF)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier

In [None]:
# Weak learner boosted by AdaBoost: a depth-3 decision tree.
base_estimator_1 = DecisionTreeClassifier(max_depth=3)

# The two members of the ensemble.
# The weak learner is passed through `estimator`; the older `base_estimator`
# keyword is deprecated in scikit-learn 1.2 and removed in later releases.
# (The unused `base_estimator_2` forest that was created here has been dropped.)
ada_boost_classifier = AdaBoostClassifier(estimator=base_estimator_1, random_state=42)
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Soft voting: the members' predicted class probabilities are averaged.
ensemble_classifier = VotingClassifier(
    estimators=[
        ('AdaBoost', ada_boost_classifier),
        ('RandomForest', random_forest_classifier),
    ],
    voting='soft',
)

# Train the ensemble model.
ensemble_classifier.fit(X_train, y_train)

# Predict on the testing set.
y_pred = ensemble_classifier.predict(X_test)

# Support-weighted precision, recall and F1.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Per-class breakdown.
print(classification_report(y_test, y_pred))

## 4.1 Untreated + SMOTE

In [None]:
# Reload both CSV files so this scenario starts from unmodified data.
file_path = 'https://drive.google.com/uc?id=1w69PKv1mSGzy70yYBQ2W0nBwtWq-bKzh'
file_path_pred = 'https://drive.google.com/uc?id=1pbpfNmlJjn3f7iG9IZAnu9U0yv-y1Aa_'

# Labelled dataset (contains the 'targetnya' column).
data_with_target = pd.read_csv(file_path, sep=";")

# Unlabelled dataset to predict on, without its 'id' column.
data_to_predict = pd.read_csv(file_path_pred, sep=";").drop(columns=['id'])

In [None]:
def _faulty_mask(values):
    """Return a boolean mask of the entries that are a placeholder ("99999", 99999, "*") or null."""
    return (
        (values == "99999")
        | (values == 99999)
        | (values == 99999.00)
        | (values == "*")
        | values.isnull()
    )


# `train` collects every row with at least one faulty value: start with the
# rows flagged by 'duration', then append the rows flagged by each remaining
# column, removing repeated rows after every append.
train = data_with_target[_faulty_mask(data_with_target['duration'])]

for col in data_with_target.columns[1:]:
    tmp1 = data_with_target[_faulty_mask(data_with_target[col])]
    train = pd.concat([train, tmp1])
    train.drop_duplicates(inplace=True)
train

In [None]:
# `test` holds the rows without faulty values: left-join the full data with
# `train` and keep only the rows that found no match there.
test = data_with_target.merge(train.drop_duplicates(), how='left', indicator=True)
test = test[test['_merge'] == 'left_only'].drop(columns=['_merge'])
test

In [None]:
# Sanity check: faulty rows + clean rows + duplicated rows should add up to
# the number of rows in the loaded data.
n_duplicates = data_with_target.duplicated().sum()
print(train.shape, test.shape)
print(n_duplicates)
print()
print(train.shape[0] + test.shape[0] + n_duplicates)
print(data_with_target.shape)

(95239, 42) (15800, 42)
1407

112446
(112446, 42)


In [None]:
# Data preprocessing for the "untreated" scenario.
# Placeholders are not converted to NaN here; '*' is only rewritten to the
# string "99999" so that both placeholder spellings share one value.
train.drop_duplicates(inplace=True)
train.replace('*', "99999", inplace=True)

test.drop_duplicates(inplace=True)
# Fixed copy-paste slip: this replace used to be applied to `train` a second
# time, so `test` never received the same treatment.
test.replace('*', "99999", inplace=True)

# Fill NaN values with the most frequent value of each column. The imputer is
# fitted on the full labelled data and then applied to both subsets.
imputer = SimpleImputer(strategy='most_frequent')
data_with_target = pd.DataFrame(imputer.fit_transform(data_with_target), columns=data_with_target.columns)
train = pd.DataFrame(imputer.transform(train), columns=train.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

# Integer-encode the categorical columns. Each encoder is fitted on the full
# labelled data and reused for the subsets and for the data to predict.
# NOTE(review): `data_with_target` still contains '*' while `train` now holds
# "99999" instead; confirm the encoders have seen every value they transform.
categorical_columns = ['protocol_type', 'service', 'flag']

label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data_with_target[column] = le.fit_transform(data_with_target[column])
    train[column] = le.transform(train[column])
    test[column] = le.transform(test[column])

    data_to_predict[column] = le.transform(data_to_predict[column])
    label_encoders[column] = le

In [None]:
# Split both subsets into features (X) and target (y).
X_train, y_train = train.drop(columns=['targetnya']), train['targetnya']
X_test, y_test = test.drop(columns=['targetnya']), test['targetnya']

In [None]:
# Class distribution before oversampling.
print(Counter(train['targetnya']), len(train))
print(Counter(test['targetnya']), len(test))

# Oversample the TRAINING data only. SMOTE generates synthetic minority-class
# rows; this cell used to resample the test split as well, which means the
# model was scored on synthetic rows instead of real observations.
ov = SMOTE(random_state=42)
X_train, y_train = ov.fit_resample(X_train, y_train)

# Class distribution afterwards (the test split is intentionally unchanged).
print(Counter(y_train), len(y_train))
print(Counter(y_test), len(y_test))

# Feature Selection correlation Threshold >= 0.1 or <= -0.1

In [None]:
# Reload both CSV files so this scenario starts from unmodified data.
file_path = 'https://drive.google.com/uc?id=1w69PKv1mSGzy70yYBQ2W0nBwtWq-bKzh'
file_path_pred = 'https://drive.google.com/uc?id=1pbpfNmlJjn3f7iG9IZAnu9U0yv-y1Aa_'

# Labelled dataset (contains the 'targetnya' column).
data_with_target = pd.read_csv(file_path, sep=";")

# Unlabelled dataset to predict on, without its 'id' column.
data_to_predict = pd.read_csv(file_path_pred, sep=";").drop(columns=['id'])

In [None]:
def _faulty_mask(values):
    """Return a boolean mask of the entries that are a placeholder ("99999", 99999, "*") or null."""
    return (
        (values == "99999")
        | (values == 99999)
        | (values == 99999.00)
        | (values == "*")
        | values.isnull()
    )


# `train` collects every row with at least one faulty value: start with the
# rows flagged by 'duration', then append the rows flagged by each remaining
# column, removing repeated rows after every append.
train = data_with_target[_faulty_mask(data_with_target['duration'])]

for col in data_with_target.columns[1:]:
    tmp1 = data_with_target[_faulty_mask(data_with_target[col])]
    train = pd.concat([train, tmp1])
    train.drop_duplicates(inplace=True)
train

In [None]:
# `test` holds the rows without faulty values: left-join the full data with
# `train` and keep only the rows that found no match there.
test = data_with_target.merge(train.drop_duplicates(), how='left', indicator=True)
test = test[test['_merge'] == 'left_only'].drop(columns=['_merge'])
test

In [None]:
# Sanity check: faulty rows + clean rows + duplicated rows should add up to
# the number of rows in the loaded data.
n_duplicates = data_with_target.duplicated().sum()
print(train.shape, test.shape)
print(n_duplicates)
print()
print(train.shape[0] + test.shape[0] + n_duplicates)
print(data_with_target.shape)

(95239, 42) (15800, 42)
1407

112446
(112446, 42)


In [None]:
# Data preprocessing for the feature-selection scenario.
# For both subsets: drop duplicate rows, then turn every placeholder
# spelling into NaN (train is processed first, then test).
for frame in (train, test):
    frame.drop_duplicates(inplace=True)
    for placeholder in ('*', "99999", 99999, 99999.00):
        frame.replace(placeholder, np.nan, inplace=True)

# Fill NaN values with the most frequent value of each column. The imputer is
# fitted on the full labelled data and then applied to both subsets.
imputer = SimpleImputer(strategy='most_frequent')
all_columns = data_with_target.columns
data_with_target = pd.DataFrame(imputer.fit_transform(data_with_target), columns=all_columns)
train = pd.DataFrame(imputer.transform(train), columns=train.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

# Integer-encode the categorical columns. Each encoder is fitted on the full
# labelled data and reused for the subsets and for the data to predict.
categorical_columns = ['protocol_type', 'service', 'flag']

label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data_with_target[column] = le.fit_transform(data_with_target[column])
    for frame in (train, test, data_to_predict):
        frame[column] = le.transform(frame[column])
    label_encoders[column] = le

In [None]:
# Stack both preprocessed subsets into one new frame for the correlation analysis.
data_corr = pd.concat([train, test])

# Label-encode every column so that all of them are numeric.
for column in data_corr.columns:
    le = LabelEncoder()
    encoded = le.fit_transform(data_corr[column])
    data_corr[column] = encoded

# Pairwise correlation matrix, rounded to three decimals.
data_corr = data_corr.corr().round(3)

# Annotated heat map of the correlation matrix.
plt.figure(figsize=(35, 25))
plt.title("Heat Map Correlation after Preprocessing", fontsize=25, pad=20)
sns.heatmap(data_corr, annot=True, cmap="RdYlGn")
plt.show()

In [None]:
# Correlation of every feature with the target column, highest first.
# The target's own row is removed and undefined correlations count as 0.
tmp = (
    data_corr.drop(["targetnya"])["targetnya"]
    .fillna(0)
    .sort_values(ascending=False)
)

# Split into non-negative and negative values so each group gets its own colour.
labels = tmp.index.to_list()
values_pos = tmp[tmp >= 0].to_list()
values_neg = tmp[tmp < 0].to_list()
n_pos = len(values_pos)

plt.figure(figsize=(25, 10))
container = plt.bar(labels[:n_pos], values_pos, color="green")
container2 = plt.bar(labels[n_pos:], values_neg, color="red")
for bars in (container, container2):
    plt.bar_label(bars)
plt.xticks(np.arange(len(labels)), labels, rotation='vertical')
plt.ylabel('Correlation to Target Column')
plt.title("Data Column Corellation to Target Column (targetnya)")
plt.show()

In [None]:
# Correlation of every feature with the target column (the target's own row
# is removed), as a one-column DataFrame.
tmp = data_corr.drop(["targetnya"])
tmp = tmp[['targetnya']]

# Sort in descending order and keep the result. The bare
# `tmp['targetnya'].sort_values(...)` call that stood here returned a new
# Series that was thrown away, so nothing was actually sorted.
tmp = tmp.sort_values(by='targetnya', ascending=False)

# Keep only the columns whose correlation reaches the threshold in either direction.
threshold = 0.1
tmp = tmp[(tmp['targetnya'] >= threshold) | (tmp['targetnya'] <= -(threshold))]
print(f"length of valid columns = {len(tmp)}")
valid_columns = tmp.index
tmp

In [None]:
# Selected feature columns followed by the target column.
tmp_col = [*valid_columns, 'targetnya']

# Restrict the correlation matrix to those rows and columns and plot it.
data_corr_val = data_corr.loc[tmp_col, tmp_col]
plt.figure(figsize=(15, 10))
plt.title("Heat Map Correlation with only Valid Columns")
sns.heatmap(data_corr_val, annot=True, cmap="RdYlGn")
plt.show()

In [None]:
# Correlations of the selected columns with the target, highest first.
# Note: this rebinds `tmp` from the one-column DataFrame to a Series.
tmp = tmp['targetnya'].sort_values(ascending=False)

# Split into non-negative and negative values so each group gets its own colour.
labels = tmp.index.to_list()
values_pos = tmp[tmp >= 0].to_list()
values_neg = tmp[tmp < 0].to_list()
n_pos = len(values_pos)

plt.figure(figsize=(15, 8))
container = plt.bar(labels[:n_pos], values_pos, color="green")
container2 = plt.bar(labels[n_pos:], values_neg, color="red")
for bars in (container, container2):
    plt.bar_label(bars)
plt.xticks(np.arange(len(labels)), labels, rotation='vertical')
plt.ylabel('Correlation to Target Column')
plt.title("Data Column Corellation to Target Column (targetnya)")
plt.show()

In [None]:
# Target and features for both subsets; the features are restricted to the
# columns that passed the correlation threshold.
y_train = train['targetnya']
X_train = train.drop(columns=['targetnya'])[valid_columns]

y_test = test['targetnya']
X_test = test.drop(columns=['targetnya'])[valid_columns]

In [None]:
# Class distribution before oversampling.
print(Counter(train['targetnya']), len(train))
print(Counter(test['targetnya']), len(test))

# Oversample the TRAINING data only. SMOTE generates synthetic minority-class
# rows; this cell used to resample the test split as well, which means the
# model was scored on synthetic rows instead of real observations.
ov = SMOTE(random_state=42)
X_train, y_train = ov.fit_resample(X_train, y_train)

# Class distribution afterwards (the test split is intentionally unchanged).
print(Counter(y_train), len(y_train))
print(Counter(y_test), len(y_test))

Counter({'normal': 47985, 'neptune': 36678, 'satan': 2819, 'ipsweep': 2781, 'portsweep': 1878, 'Denial of Service Attack': 1330, 'nmap': 955, 'smurf': 813}) 95239
Counter({'normal': 12184, 'smurf': 1565, 'portsweep': 773, 'Denial of Service Attack': 416, 'nmap': 367, 'ipsweep': 255, 'neptune': 154, 'satan': 86}) 15800
Counter({'normal': 47985, 'neptune': 47985, 'smurf': 47985, 'Denial of Service Attack': 47985, 'satan': 47985, 'portsweep': 47985, 'ipsweep': 47985, 'nmap': 47985}) 383880
Counter({'nmap': 12184, 'normal': 12184, 'smurf': 12184, 'portsweep': 12184, 'ipsweep': 12184, 'Denial of Service Attack': 12184, 'neptune': 12184, 'satan': 12184}) 97472


# All Scenarios Summary

## Loading Data

In [None]:
# Reload both CSV files so the final pipeline starts from unmodified data.
file_path = 'https://drive.google.com/uc?id=1w69PKv1mSGzy70yYBQ2W0nBwtWq-bKzh'
file_path_pred = 'https://drive.google.com/uc?id=1pbpfNmlJjn3f7iG9IZAnu9U0yv-y1Aa_'

# Labelled dataset (contains the 'targetnya' column).
data_with_target = pd.read_csv(file_path, sep=";")

# Unlabelled dataset to predict on, without its 'id' column.
data_to_predict = pd.read_csv(file_path_pred, sep=";").drop(columns=['id'])

## Preprocess Data

In [None]:
def _faulty_mask(values):
    """Return a boolean mask of the entries that are a placeholder ("99999", 99999, "*") or null."""
    return (
        (values == "99999")
        | (values == 99999)
        | (values == 99999.00)
        | (values == "*")
        | values.isnull()
    )


# `train` collects every row with at least one faulty value: start with the
# rows flagged by 'duration', then append the rows flagged by each remaining
# column, removing repeated rows after every append.
train = data_with_target[_faulty_mask(data_with_target['duration'])]

for col in data_with_target.columns[1:]:
    tmp1 = data_with_target[_faulty_mask(data_with_target[col])]
    train = pd.concat([train, tmp1])
    train.drop_duplicates(inplace=True)
train

In [None]:
# `test` holds the rows without faulty values: left-join the full data with
# `train` and keep only the rows that found no match there.
test = data_with_target.merge(train.drop_duplicates(), how='left', indicator=True)
test = test[test['_merge'] == 'left_only'].drop(columns=['_merge'])
test

In [None]:
# Sanity check: faulty rows + clean rows + duplicated rows should add up to
# the number of rows in the loaded data.
n_duplicates = data_with_target.duplicated().sum()
print(train.shape, test.shape)
print(n_duplicates)
print()
print(train.shape[0] + test.shape[0] + n_duplicates)
print(data_with_target.shape)

(95239, 42) (15800, 42)
1407

112446
(112446, 42)


In [None]:
# Data preprocessing for the final pipeline.
# For both subsets: drop duplicate rows, then turn every placeholder
# spelling into NaN (train is processed first, then test).
for frame in (train, test):
    frame.drop_duplicates(inplace=True)
    for placeholder in ('*', "99999", 99999, 99999.00):
        frame.replace(placeholder, np.nan, inplace=True)

# Fill NaN values with the most frequent value of each column. The imputer is
# fitted on the full labelled data and then applied to both subsets.
imputer = SimpleImputer(strategy='most_frequent')
all_columns = data_with_target.columns
data_with_target = pd.DataFrame(imputer.fit_transform(data_with_target), columns=all_columns)
train = pd.DataFrame(imputer.transform(train), columns=train.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

# Integer-encode the categorical columns. Each encoder is fitted on the full
# labelled data, reused for the subsets and the data to predict, and kept in
# `label_encoders` for decoding later.
categorical_columns = ['protocol_type', 'service', 'flag']

label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data_with_target[column] = le.fit_transform(data_with_target[column])
    for frame in (train, test, data_to_predict):
        frame[column] = le.transform(frame[column])
    label_encoders[column] = le

In [None]:
# Split both subsets into features (X) and target (y).
X_train, y_train = train.drop(columns=['targetnya']), train['targetnya']
X_test, y_test = test.drop(columns=['targetnya']), test['targetnya']

In [None]:
# Class distribution before oversampling.
print(Counter(train['targetnya']), len(train))
print(Counter(test['targetnya']), len(test))

# Oversample the TRAINING data only. SMOTE generates synthetic minority-class
# rows; this cell used to resample the test split as well, which means the
# model was scored on synthetic rows instead of real observations.
ov = SMOTE(random_state=42)
X_train, y_train = ov.fit_resample(X_train, y_train)

# Class distribution afterwards (the test split is intentionally unchanged).
print(Counter(y_train), len(y_train))
print(Counter(y_test), len(y_test))

## Modelling

In [None]:
# Fit the final random forest on the training data (fit() returns the estimator itself).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_model

In [None]:
# Predictions of the final random forest for the test data.
y_pred = rf_model.predict(X_test)

# Support-weighted precision, recall and F1, computed in that order.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Per-class breakdown.
print(classification_report(y_test, y_pred))

In [None]:
# Draw the confusion matrix of the current predictions and enlarge its
# figure to 12 x 8 inches.
cm = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
fig = cm.ax_.get_figure()
fig.set_size_inches(12, 8)

## Model Feature Importance

In [None]:
# Importance scores reported by the fitted random forest, one per feature column.
feature_importances = rf_model.feature_importances_

# Pair every score with its column name and order by importance, highest first.
importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the most important
# feature is drawn at the top.
plt.figure(figsize=(12, 8))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.title('Feature Importance Analysis')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.gca().invert_yaxis()
plt.show()

# Print the ten highest-ranked features.
top_features = importance_df.head(10)
print("Top Influential Features:")
print(top_features)

# Predict Data Test

In [None]:
# Bare expression: the notebook renders the (already label-encoded) data to predict.
data_to_predict

In [None]:
# Predict the target for the unlabelled data with the final random forest.
predictions = rf_model.predict(data_to_predict)

# Decode the categorical columns back to their original values.
for column in categorical_columns:
    encoder = label_encoders[column]
    data_to_predict[column] = encoder.inverse_transform(data_to_predict[column])

# Attach the predictions as a new column.
data_to_predict['predicted_targetnya'] = predictions

# Add a fresh 'id' column numbered 0 .. num_rows - 1.
num_rows = len(data_to_predict)
data_to_predict['id'] = range(num_rows)

# Put 'id' first, keeping every other column in its current order.
other_columns = [name for name in data_to_predict.columns if name != 'id']
data_to_predict = data_to_predict[['id'] + other_columns]

# Write the result to CSV without the DataFrame index.
data_to_predict.to_csv('data_with_predictions_rf_final.csv', index=False)