In [7]:
import os
import urllib
import gzip
import shutil
import numpy as np
import pandas as pd

In [3]:
def ensure_kddcup(csv_path='./data/kddcup.csv',url='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'):
    """Download a gzip archive and decompress it to ``csv_path`` unless that file exists.

    Parameters
    ----------
    csv_path : str
        Destination of the decompressed file. If it already exists the
        function prints a message and returns without touching the network.
    url : str
        Location of the gzip-compressed file to download.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the download fails or the downloaded file is not valid gzip data
        (``urllib.error.URLError`` and ``gzip.BadGzipFile`` are both subclasses).

    Notes
    -----
    The archive is stored temporarily at ``csv_path + '.gz'`` and the
    decompressed bytes are staged at ``csv_path + '.part'``. Both are removed
    before returning, on success and on failure, so a failed run never leaves
    a truncated ``csv_path`` that a later call would mistake for a complete file.
    """
    # ``import urllib`` alone does not load the ``request`` submodule; import it
    # explicitly so the call below does not depend on another module having done so.
    import urllib.request

    if os.path.exists(csv_path):
        print(f"{csv_path} already exists.")
        return

    # Make sure the target directory exists. A bare file name has an empty
    # directory part, and os.makedirs('') raises FileNotFoundError.
    target_dir = os.path.dirname(csv_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    gz_path = csv_path + '.gz'
    part_path = csv_path + '.part'
    try:
        print(f"Downloading {url} → {gz_path} …")
        urllib.request.urlretrieve(url, gz_path)

        # Decompress as raw bytes: no dependence on the locale encoding and no
        # newline translation, so the output matches the archive content exactly.
        print(f"Decompressing {gz_path} → {csv_path} …")
        with gzip.open(gz_path, 'rb') as f_in, open(part_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Publish the finished file in one step.
        os.replace(part_path, csv_path)
    finally:
        # Clean up temporary files whether or not the steps above succeeded.
        for leftover in (gz_path, part_path):
            if os.path.exists(leftover):
                os.remove(leftover)
    print("Done!")


In [4]:
# Fetch the dataset using the default path and URL; the call returns early
# when ./data/kddcup.csv is already present.
ensure_kddcup()

Downloading http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz → ./data/kddcup.csv.gz …


Decompressing ./data/kddcup.csv.gz → ./data/kddcup.csv …
Done!


In [5]:
# Column names for the header-less CSV, in file order. The final column,
# 'type', holds the record label that later cells turn into a binary target.
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'type',
]
df = pd.read_csv(
    './data/kddcup.csv',
    header=None,
    names=columns,
)

In [6]:
# Binary label: 1 for records labelled "normal.", 0 for every other label.
# It is derived from a single comparison so the column never holds a mix of
# ints and strings; writing ints into a string column through masked
# assignment is rejected by pandas versions that infer a strict string dtype.
is_normal = df["type"] == "normal."

# One indicator column per distinct value of each categorical feature.
one_hot_protocol = pd.get_dummies(df["protocol_type"], dtype=int)
one_hot_service = pd.get_dummies(df["service"], dtype=int)
one_hot_flag = pd.get_dummies(df["flag"], dtype=int)

# Replace the categorical columns with their indicators, placed first so the
# column order is: protocol, service, flag indicators, remaining features, label.
df = df.drop(columns=["protocol_type", "service", "flag"])
df = pd.concat([one_hot_protocol, one_hot_service, one_hot_flag, df], axis=1)

df["type"] = is_normal.astype(int)
df = df.astype("float32")

In [8]:
def normalize_and_cast(data: pd.DataFrame, cols_to_norm: list, normal_condition: str = "type == 0"):
    """Min-max scale selected columns using statistics from a reference subset.

    The minimum and maximum of each column in ``cols_to_norm`` are computed
    only over the rows matching ``normal_condition``; every row of ``data`` is
    then mapped with ``(x - min) / (max - min)``. Rows outside the reference
    subset can therefore fall outside ``[0, 1]``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to scale. It is modified in place and also returned.
    cols_to_norm : list
        Names of the columns to scale; other columns are left untouched.
    normal_condition : str
        ``DataFrame.query`` expression selecting the reference rows.
        NOTE(review): in this notebook the label cell assigns 0 to every
        record whose label is not "normal.", so the default selects those
        records — confirm that this is the intended reference set.

    Returns
    -------
    tuple
        ``(data, min_vals, max_vals)`` where ``data`` is the same object that
        was passed in, with the scaled columns stored as float32, and
        ``min_vals`` / ``max_vals`` are Series indexed by column name.

    Raises
    ------
    ValueError
        If ``normal_condition`` matches no rows. The statistics would be NaN
        and every scaled column would silently become NaN.
    """
    # Select the reference rows before any column of ``data`` is overwritten.
    normal_data = data.query(normal_condition)
    if normal_data.empty:
        raise ValueError(
            f"normal_condition {normal_condition!r} matched no rows; "
            "cannot compute min/max for normalization"
        )

    # Compute min and max from the reference rows only.
    min_vals = normal_data[cols_to_norm].min()
    max_vals = normal_data[cols_to_norm].max()

    # Avoid division by zero for columns that are constant in the reference rows.
    scale = (max_vals - min_vals).replace(0, 1.0)

    # Normalize the entire dataset. The arithmetic can upcast when the
    # statistics have a wider dtype than float32 (e.g. integer or float64
    # input), so cast the result explicitly to honour the float32 contract.
    values = data[cols_to_norm].astype(np.float32)
    scaled = (values - min_vals) / scale
    data[cols_to_norm] = scaled.astype(np.float32)

    return data, min_vals, max_vals


In [9]:
# Columns passed to normalize_and_cast for min-max scaling; any column not
# named here is left as it is.
cols_to_norm = [
    # basic connection features
    "duration", "src_bytes", "dst_bytes", "wrong_fragment", "urgent",
    # content features
    "hot", "num_failed_logins", "num_compromised", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    # traffic features
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # destination-host features
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
]

df, min_vals, max_vals = normalize_and_cast(data=df, cols_to_norm=cols_to_norm)

In [10]:
# Sanity check: print the dtype of every column of the preprocessed frame.
print(df.dtypes)

icmp                        float32
tcp                         float32
udp                         float32
IRC                         float32
X11                         float32
                             ...   
dst_host_serror_rate        float32
dst_host_srv_serror_rate    float32
dst_host_rerror_rate        float32
dst_host_srv_rerror_rate    float32
type                        float32
Length: 119, dtype: object


In [11]:
# Persist the preprocessed frame as a float32 matrix under the key 'kdd'.
np.savez(
    './data/kddcup.npz',
    kdd=df.to_numpy(dtype=np.float32),
)