In [1]:
import os

import pandas as pd

In [5]:
# Raw NSL-KDD input files; the paths are relative to the working directory.
train_path, test_path, field_path = (
    "../data/raw/nsl_kdd/KDDTrain+.txt",
    "../data/raw/nsl_kdd/KDDTest+.txt",
    "../data/raw/nsl_kdd/Field Names.csv",
)

In [None]:
# Read the header-less field-name file; its first column holds the column
# names used for both splits (the feature names followed by attack_type and
# difficulty, 43 names in total for this file).
field_names = pd.read_csv(field_path, header=None)
cols = field_names.iloc[:, 0].tolist()

print("Total columns:", len(cols))
print("Last 5 column names:", cols[-5:])

Total columns: 43
Last 5 column names: ['dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type', 'difficulty']


In [10]:
# Read both splits with the same settings. The files are parsed as header-less:
# the column names come from `cols`, and header=None is what pandas already
# assumes whenever `names` is passed, so it is only spelled out for the reader.
train_df, test_df = (
    pd.read_csv(csv_path, names=cols, header=None)
    for csv_path in (train_path, test_path)
)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

Train shape: (125973, 43)
Test shape : (22544, 43)


In [11]:
# Sanity checks: total number of missing cells in each split, followed by the
# ten most frequent attack_type values in the training split.
print("Missing values (train):", train_df.isna().sum().sum())
print("Missing values (test) :", test_df.isna().sum().sum())

print("\nAttack_type sample values (train):")
# value_counts() sorts by descending frequency, so the first ten rows are the top ten.
print(train_df["attack_type"].value_counts().iloc[:10])

Missing values (train): 0
Missing values (test) : 0

Attack_type sample values (train):
attack_type
normal         67343
neptune        41214
satan           3633
ipsweep         3599
portsweep       2931
smurf           2646
nmap            1493
back             956
teardrop         892
warezclient      890
Name: count, dtype: int64


In [13]:
# Binary intrusion target: 0 when attack_type is exactly "normal", 1 for any
# other attack_type value.
train_df["label"] = train_df["attack_type"].ne("normal").astype(int)
test_df["label"] = test_df["attack_type"].ne("normal").astype(int)

print("\nBinary label distribution (train):")
print(train_df["label"].value_counts())

print("\nBinary label distribution (test):")
print(test_df["label"].value_counts())

# Last expression of the cell: rich display of the first rows, including the new label.
train_df.head()


Binary label distribution (train):
label
0    67343
1    58630
Name: count, dtype: int64

Binary label distribution (test):
label
1    12833
0     9711
Name: count, dtype: int64


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,difficulty,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,0
1,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,0
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,1
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,0
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,0


In [14]:
# Persist the labeled splits as CSV files (row index omitted).
# `os` is imported in the notebook's import cell rather than here, so all
# dependencies are visible at the top of the notebook.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)  # no error if the folder already exists

# Each file name is written once; both the destination path and the
# confirmation message are derived from it so they cannot drift apart.
labeled_outputs = {
    "nsl_kdd_train_raw_labeled.csv": train_df,
    "nsl_kdd_test_raw_labeled.csv": test_df,
}
for file_name, labeled_df in labeled_outputs.items():
    labeled_df.to_csv(f"{processed_dir}/{file_name}", index=False)
    # The message keeps the original wording, i.e. the path without the leading "../".
    print(f"Saved: data/processed/{file_name}")

Saved: data/processed/nsl_kdd_train_raw_labeled.csv
Saved: data/processed/nsl_kdd_test_raw_labeled.csv
