In [2]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
import os
import glob

In [3]:
# Read the CSV that the following cells work on into a DataFrame
dataset_path = './data/dataset6.csv'
data = pd.read_csv(dataset_path)



In [4]:
folder_path = "./data"
# glob returns paths in arbitrary, OS-dependent order; sort so the reference
# file and the order of any mismatch reports are the same on every run.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Fail with a clear message instead of an IndexError on csv_files[0]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Read the first CSV file to get the reference columns
first_df = pd.read_csv(csv_files[0], nrows=0)  # Read only headers to save memory
reference_columns = list(first_df.columns)

# Check if all CSV files have the same columns (names and order).
# Every mismatching file is reported, not just the first one found.
all_same_columns = True
for file in csv_files[1:]:
    current_columns = list(pd.read_csv(file, nrows=0).columns)  # Read only headers
    if current_columns != reference_columns:
        print(f"File {file} has different columns: {current_columns}")
        all_same_columns = False

print(all_same_columns)

True


In [20]:
# Peek at the first 70 raw values of the combined three-field column
combined_col = 'tunnel_parents   label   detailed-label'
data[combined_col].head(70)

0                    (empty)   Benign   -
1                    (empty)   Benign   -
2                    (empty)   Benign   -
3                    (empty)   Benign   -
4                    (empty)   Benign   -
                     ...                 
65                   (empty)   Benign   -
66                   (empty)   Benign   -
67                   (empty)   Benign   -
68    (empty)   Malicious   C&C-HeartBeat
69                   (empty)   Benign   -
Name: tunnel_parents   label   detailed-label, Length: 70, dtype: object

In [21]:
# Print the first five raw strings as a plain list so the whitespace separators are visible
first_values = data['tunnel_parents   label   detailed-label'].head().to_list()
print(first_values)

['(empty)   Benign   -', '(empty)   Benign   -', '(empty)   Benign   -', '(empty)   Benign   -', '(empty)   Benign   -']


In [None]:
combined_col = 'tunnel_parents   label   detailed-label'
new_cols = ['tunnel_parents', 'label', 'detailed_label']

# Only split while the combined column still exists, so re-running this cell
# is a no-op instead of a KeyError.
if combined_col in data.columns:
    # strip() avoids an empty first token when a value has leading whitespace;
    # n=2 caps the split at three parts, so any further whitespace stays inside
    # the last field instead of producing extra columns.
    split_columns = (
        data[combined_col]
        .str.strip()
        .str.split(r'\s+', n=2, expand=True)
        .reindex(columns=range(3))  # always exactly three columns, missing parts become NaN
    )

    # Rename the new columns
    split_columns.columns = new_cols

    # Replace the combined column with the three separate columns
    data = pd.concat([data.drop(combined_col, axis=1), split_columns], axis=1)

print(data[new_cols].head(10))

  tunnel_parents   label detailed_label
0        (empty)  Benign              -
1        (empty)  Benign              -
2        (empty)  Benign              -
3        (empty)  Benign              -
4        (empty)  Benign              -
5        (empty)  Benign              -
6        (empty)  Benign              -
7        (empty)  Benign              -
8        (empty)  Benign              -
9        (empty)  Benign              -


### Data Cleaning
#### 1. Duplicate values

In [23]:
# Check for duplicate rows in the dataset.
# keep=False flags every member of a duplicate group, not only the repeats.
# NOTE(review): the frame includes 'Unnamed: 0' and 'uid' columns; if those are
# unique per row, no full-row duplicate can ever be found - confirm whether the
# check should use a subset of columns.
duplicate_rows = data[data.duplicated(keep=False)]
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")

# Preview only the first few duplicate rows rather than rendering the whole frame
duplicate_rows.head()

Number of duplicate rows: 0


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed_label


In [24]:
# Report the frame's dimensions as (rows, columns)
n_rows, n_cols = data.shape
(n_rows, n_cols)

(11454714, 21)

In [11]:
# Keep only the first occurrence of each fully identical row,
# then renumber the index from 0 so it has no gaps.
data = data.drop_duplicates(keep="first")
data = data.reset_index(drop=True)
print(f"Shape after removing duplicates: {data.shape}")

Shape after removing duplicates: (11454714, 21)


#### 2. Infinite values

In [25]:
# Count +/-inf entries in each numeric column and print only the columns that have any
numeric_cols = data.select_dtypes(include=np.number).columns
inf_count = np.isinf(data.loc[:, numeric_cols]).sum()
columns_with_inf = inf_count[inf_count > 0]
print(columns_with_inf)

Series([], dtype: int64)


#### 3. Missing values

In [26]:
# Tally the missing (NaN/None) entries in each column
missing_counts = data.isna().sum()
total_missing = missing_counts.sum()
print("Total missing values:", total_missing)

# Display only the affected columns, largest count first
has_missing = missing_counts > 0
missing_counts[has_missing].sort_values(ascending=False)

Total missing values: 0


Series([], dtype: int64)

### Data Preparation
#### 1. Feature selection/extraction

In [27]:
# Drop columns that have at most one distinct value; they carry no information.
num_unique = data.nunique()  # NaN is not counted as a value by default

# A column that is entirely NaN has nunique == 0. It fails the `> 1` keep
# filter below, so `<= 1` is used here to list it among the dropped columns
# instead of letting it disappear silently.
one_variable = num_unique[num_unique <= 1]
not_one_variable = num_unique[num_unique > 1].index

dropped_cols = one_variable.index
data = data[not_one_variable]

print('Dropped columns:')
dropped_cols

Dropped columns:


Index([], dtype='object')

#### 2. Data labeling

In [36]:
# Frequency of each distinct tunnel_parents value
tunnel_parent_counts = data['tunnel_parents'].value_counts()
tunnel_parent_counts

tunnel_parents
(empty)               11388988
COLnd035cNITygYHp3       46143
CARhxZ3hLNVO3xYFok       19583
Name: count, dtype: int64

In [None]:
# Class balance of the label column.
# The data is severely imbalanced: 99.34% is Malicious and only 0.66% is Benign
# --> this may bias a model toward predicting Malicious and ignoring the Benign class.
label_counts = data['label'].value_counts()
label_counts


label
Malicious    11378759
Benign          75955
Name: count, dtype: int64

In [38]:
# Frequency of each detailed_label value
detailed_label_counts = data['detailed_label'].value_counts()
detailed_label_counts

detailed_label
Okiru            11333397
-                   75955
DDoS                39584
C&C-HeartBeat        5778
Name: count, dtype: int64