In [7]:
import gzip
import os
import shutil
import urllib
import urllib.request

import jax.numpy as jnp
import pandas as pd

In [10]:
def ensure_kddcup(csv_path='./data/kddcup.csv',url='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz'):
    """Download and decompress a gzip archive to ``csv_path`` unless it already exists.

    Parameters
    ----------
    csv_path : str
        Destination of the decompressed file. May be a bare filename, in which
        case the file is written to the current working directory.
    url : str
        Location of the gzip-compressed source. Anything accepted by
        ``urllib.request.urlretrieve`` works.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the download fails or the archive cannot be decompressed
        (``urllib.error.URLError`` and ``gzip.BadGzipFile`` are subclasses).
        In that case no file is left at ``csv_path``.
    """
    if os.path.exists(csv_path):
        print(f"{csv_path} already exists.")
        return

    # Make sure the target directory exists. dirname() is '' for a bare
    # filename, and os.makedirs('') raises FileNotFoundError, so skip it then.
    target_dir = os.path.dirname(csv_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    gz_path = csv_path + '.gz'
    # Decompress into a scratch file first: a run that dies half-way must not
    # leave a truncated file at csv_path, or the existence check above would
    # treat it as a finished download on the next call.
    part_path = csv_path + '.part'
    try:
        print(f"Downloading {url} → {gz_path} …")
        urllib.request.urlretrieve(url, gz_path)

        # Decompress it to CSV. Binary mode copies the bytes verbatim, with no
        # locale-dependent decoding or newline translation.
        print(f"Decompressing {gz_path} → {csv_path} …")
        with gzip.open(gz_path, 'rb') as f_in, open(part_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(part_path, csv_path)
    finally:
        # Clean up the archive and any unfinished scratch file, on success
        # and on failure alike.
        for leftover in (gz_path, part_path):
            if os.path.exists(leftover):
                os.remove(leftover)
    print("Done!")


In [11]:
# Fetch and unpack the dataset with the default path and URL; the call returns
# early (printing a notice) when ./data/kddcup.csv is already present.
ensure_kddcup()

./data/kddcup.csv already exists.


In [18]:
# Names for the 42 fields of each record, in file order. The CSV is read with
# header=None, so these names are supplied explicitly.
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'type',
]
df = pd.read_csv('./data/kddcup.csv', names=columns, header=None)

In [19]:
# Binary label: 1 for "normal." records, 0 for every other value of `type`.
# It is derived from a single comparison rather than by writing ints into the
# string column in place: that approach depended on the order of the two
# assignments, is rejected when the column uses pandas' dedicated string dtype,
# and silently zeroed every label if the cell was accidentally run twice.
is_normal = (df["type"] == "normal.").astype(int)

# One indicator column per distinct value of each categorical field.
categorical_cols = ["protocol_type", "service", "flag"]
one_hot_protocol = pd.get_dummies(df["protocol_type"], dtype=int)
one_hot_service = pd.get_dummies(df["service"], dtype=int)
one_hot_flag = pd.get_dummies(df["flag"], dtype=int)

# Replace the categorical columns with their indicators (placed first, in the
# order protocol, service, flag) and swap in the numeric label. `df` is only
# rebound here, after every lookup above has succeeded, so a failed run leaves
# the loaded frame untouched.
df = pd.concat(
    [
        one_hot_protocol,
        one_hot_service,
        one_hot_flag,
        df.drop(columns=categorical_cols).assign(type=is_normal),
    ],
    axis=1,
)
df = df.astype(float)

In [20]:
# Columns rescaled below; every other column is left as it is.
cols_to_norm = ["duration", "src_bytes", "dst_bytes", "wrong_fragment", "urgent",
            "hot", "num_failed_logins", "num_compromised", "num_root",
            "num_file_creations", "num_shells", "num_access_files", "count", "srv_count",
            "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
            "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
            "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
            "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate" ]

# Min/max statistics are taken from the rows labelled type == 0 only, then
# applied to every row, so rows with type == 1 may fall outside [0, 1].
reference_rows = df.loc[df["type"] == 0, cols_to_norm]
min_cols = reference_rows.min()
max_cols = reference_rows.max()

# A column that is constant over the reference rows has a zero range; dividing
# by it would put NaN/inf into the data. Use a divisor of 1 for such columns so
# they are only shifted by their minimum.
col_range = (max_cols - min_cols).replace(0, 1)

df.loc[:, cols_to_norm] = (df[cols_to_norm] - min_cols) / col_range

In [21]:
# Sanity check before export: list the dtype of every column of the processed frame.
print(df.dtypes)

icmp                        float64
tcp                         float64
udp                         float64
IRC                         float64
X11                         float64
                             ...   
dst_host_serror_rate        float64
dst_host_srv_serror_rate    float64
dst_host_rerror_rate        float64
dst_host_srv_rerror_rate    float64
type                        float64
Length: 123, dtype: object


In [22]:
# Export the processed frame as a float32 matrix, stored under the key "kdd"
# in an .npz archive.
jnp.savez(
    './data/kddcup.npz',
    kdd=df.to_numpy(dtype="float32"),
)