In [1]:
import pandas as pd
import numpy as np

In [2]:
from hashlib import md5
from sklearn.preprocessing import LabelEncoder

In [3]:
# low_memory=False makes pandas parse the file in one pass, so every column
# gets a single inferred dtype instead of chunk-by-chunk guesses.
DATASET_PATH = "DNN-EdgeIIoT-dataset.csv"
df = pd.read_csv(DATASET_PATH, low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2219201 entries, 0 to 2219200
Data columns (total 63 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   frame.time                 object 
 1   ip.src_host                object 
 2   ip.dst_host                object 
 3   arp.dst.proto_ipv4         object 
 4   arp.opcode                 float64
 5   arp.hw.size                float64
 6   arp.src.proto_ipv4         object 
 7   icmp.checksum              float64
 8   icmp.seq_le                float64
 9   icmp.transmit_timestamp    float64
 10  icmp.unused                float64
 11  http.file_data             object 
 12  http.content_length        float64
 13  http.request.uri.query     object 
 14  http.request.method        object 
 15  http.referer               object 
 16  http.request.full_uri      object 
 17  http.request.version       object 
 18  http.response              float64
 19  http.tls_port              float64
 20  tc

**Drop unnecessary and constant columns**

In [5]:
# Columns this notebook treats as unnecessary for the cleaned dataset.
drop_columns = [
    "frame.time",
    "ip.src_host",
    "ip.dst_host",
    "arp.dst.proto_ipv4",
    "arp.src.proto_ipv4",
    "http.file_data",
    "http.request.uri.query",
    "http.request.full_uri",
    "tcp.options",
    "tcp.payload",
    "tcp.srcport",
    "mqtt.msg",
]
df.drop(columns=drop_columns, inplace=True)

In [6]:
# A column is "constant" when it holds exactly one distinct value
# (nunique() ignores NaN by default).
unique_counts = df.nunique()
constant_cols = unique_counts[unique_counts == 1].index
print(f"Constant columns: {list(constant_cols)}")

Constant columns: ['icmp.unused', 'http.tls_port', 'dns.qry.type', 'mqtt.msg_decoded_as']


In [7]:
# Drop exactly the columns flagged as constant by the detection cell above,
# instead of a hand-copied list that would silently go stale (or raise a
# KeyError) if the input data changes.
columns_to_drop = list(constant_cols)
df = df.drop(columns=columns_to_drop)

In [8]:
# Short aliases for the columns that are label-encoded further down.
column_aliases = {
    'http.request.method': 'http1',
    'http.referer': 'http2',
    'http.request.version': 'http3',
    'dns.qry.name.len': 'dns',
    'mqtt.conack.flags': 'mqtt1',
    'mqtt.protoname': 'mqtt2',
    'mqtt.topic': 'mqtt3',
}
df = df.rename(columns=column_aliases)

**Encoding categorical columns**

In [10]:
# One independent encoder per categorical column, so each fitted mapping
# remains available under its own name after encoding.
(le_http1, le_http2, le_http3,
 le_dns,
 le_mqtt1, le_mqtt2, le_mqtt3) = (LabelEncoder() for _ in range(7))

In [11]:
# Fit each encoder on its own column and store the integer codes in a new
# "<column>_encoded" column (same column order as the encoder definitions).
encoders_by_column = {
    'http1': le_http1,
    'http2': le_http2,
    'http3': le_http3,
    'dns': le_dns,
    'mqtt1': le_mqtt1,
    'mqtt2': le_mqtt2,
    'mqtt3': le_mqtt3,
}
for column_name, encoder in encoders_by_column.items():
    df[f'{column_name}_encoded'] = encoder.fit_transform(df[column_name])

In [12]:
# The raw categorical columns are redundant once their *_encoded versions exist.
encoded_sources = ['http1', 'http2', 'http3', 'dns', 'mqtt1', 'mqtt2', 'mqtt3']
df.drop(encoded_sources, axis=1, inplace=True)

**Remove NAs and duplicates**

In [14]:
# Per-column count of missing values
df.isna().sum()

arp.opcode                   0
arp.hw.size                  0
icmp.checksum                0
icmp.seq_le                  0
icmp.transmit_timestamp      0
http.content_length          0
http.response                0
tcp.ack                      0
tcp.ack_raw                  0
tcp.checksum                 0
tcp.connection.fin           0
tcp.connection.rst           0
tcp.connection.syn           0
tcp.connection.synack        0
tcp.dstport                  0
tcp.flags                    0
tcp.flags.ack                0
tcp.len                      0
tcp.seq                      0
udp.port                     0
udp.stream                   0
udp.time_delta               0
dns.qry.name                 0
dns.qry.qu                   0
dns.retransmission           0
dns.retransmit_request       0
dns.retransmit_request_in    0
mqtt.conflag.cleansess       0
mqtt.conflags                0
mqtt.hdrflags                0
mqtt.len                     0
mqtt.msgtype                 0
mqtt.pro

In [15]:
# Number of rows that are exact repeats of an earlier row
df.duplicated().sum()

273752

In [16]:
# Keep the first occurrence of each fully duplicated row and discard the rest.
df = df.drop_duplicates(keep='first')

**Drop identical columns**

In [18]:
def hash_column(series):
    """Return an MD5 hex digest that fingerprints the values of ``series``.

    Each row is hashed with ``pd.util.hash_pandas_object`` (index ignored),
    and the resulting array of row hashes is digested with MD5.
    """
    row_hashes = pd.util.hash_pandas_object(series, index=False)
    return md5(row_hashes.to_numpy().tobytes()).hexdigest()


def find_identical_columns_by_hash(df):
    """Group the columns of ``df`` whose contents produce the same fingerprint.

    Returns a list of lists of column names; only groups with more than one
    member are returned, in order of first appearance in ``df.columns``.
    """
    columns_by_digest = {}
    for name in df.columns:
        columns_by_digest.setdefault(hash_column(df[name]), []).append(name)

    return [members for members in columns_by_digest.values() if len(members) > 1]

# Run the identical-column detection on the de-duplicated DataFrame and
# show the groups that were found
identical_column_groups = find_identical_columns_by_hash(df)
print("Groups of identical columns:", identical_column_groups)

Groups of identical columns: [['mqtt.proto_len', 'mqtt.ver']]


In [19]:
# Reuse the groups computed by find_identical_columns_by_hash in the previous
# cell rather than overwriting them with a hand-copied list, which would go
# stale if the input data changes.
# Keep the first column of every group and collect the remaining ones.
columns_to_drop = [col for group in identical_column_groups for col in group[1:]]

# One drop call for all groups avoids building a new DataFrame per group.
df = df.drop(columns=columns_to_drop)

In [20]:
# Print the index, column list and dtypes of the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1945449 entries, 0 to 2219193
Data columns (total 46 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   arp.opcode                 float64
 1   arp.hw.size                float64
 2   icmp.checksum              float64
 3   icmp.seq_le                float64
 4   icmp.transmit_timestamp    float64
 5   http.content_length        float64
 6   http.response              float64
 7   tcp.ack                    float64
 8   tcp.ack_raw                float64
 9   tcp.checksum               float64
 10  tcp.connection.fin         float64
 11  tcp.connection.rst         float64
 12  tcp.connection.syn         float64
 13  tcp.connection.synack      float64
 14  tcp.dstport                float64
 15  tcp.flags                  float64
 16  tcp.flags.ack              float64
 17  tcp.len                    float64
 18  tcp.seq                    float64
 19  udp.port                   float64
 20  udp.str

In [21]:
# Number of rows per Attack_type value after cleaning
attack_type_counts = df['Attack_type'].value_counts()
print(attack_type_counts)

Attack_type
Normal                   1399624
DDoS_UDP                  121567
DDoS_ICMP                  67939
SQL_injection              50826
DDoS_TCP                   50062
Vulnerability_scanner      50026
Password                   49933
DDoS_HTTP                  48544
Uploading                  36957
Backdoor                   24026
Port_Scanning              19977
XSS                        15068
Ransomware                  9689
Fingerprinting               853
MITM                         358
Name: count, dtype: int64


**Data scaling (log1p transform of float columns)**

In [23]:
# Apply log(1 + x) to every float column.
# NOTE(review): log1p returns NaN for values below -1 and -inf at exactly -1;
# confirm these columns are non-negative. Re-running this cell applies the
# transform a second time.
float_cols = df.select_dtypes(include=['float']).columns
log_scaled = np.log1p(df[float_cols])
df[float_cols] = log_scaled

**Save data**

In [25]:
# Write the cleaned frame to disk without the index column
OUTPUT_PATH = 'Edge-IIoTset_clean.csv'
df.to_csv(OUTPUT_PATH, index=False)