Imports and loading of the raw, labeled data

In [1]:
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Read the labeled train and test CSVs in one pass; the generator keeps the
# loop variable out of the notebook namespace.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/nsl_kdd_train_raw_labeled.csv",
        "../data/processed/nsl_kdd_test_raw_labeled.csv",
    )
)

# Quick sanity check: (rows, columns) of each split.
print(train_df.shape, test_df.shape)


(125973, 44) (22544, 44)


Identify categorical columns

In [3]:
# Columns that get one-hot encoded in the cells below.
cat_cols = [
    "protocol_type",
    "service",
    "flag",
]


Fit encoder on TRAIN only

In [4]:
# Learn the category vocabulary from the training split only, then apply the
# same fitted encoder to both splits so they share one column layout.
encoder = OneHotEncoder(
    handle_unknown="ignore",  # categories unseen during fit encode as all zeros
    sparse_output=False,      # dense array output; this keyword needs scikit-learn >= 1.2
).fit(train_df[cat_cols])

encoded_train, encoded_test = (
    encoder.transform(split[cat_cols]) for split in (train_df, test_df)
)

# One shape per line: train first, then test.
print(encoded_train.shape, encoded_test.shape, sep="\n")


(125973, 84)
(22544, 84)


Convert encoded arrays to DataFrames

In [5]:
# Column labels generated by the fitted encoder for the categorical inputs.
encoded_cols = encoder.get_feature_names_out(cat_cols)

# Wrap each encoded array in a DataFrame carrying those labels.
encoded_train_df, encoded_test_df = (
    pd.DataFrame(data=matrix, columns=encoded_cols)
    for matrix in (encoded_train, encoded_test)
)


Drop original categorical columns

In [6]:
# Remove the original categorical columns from both splits; their one-hot
# replacements are attached in the next cell.
train_df, test_df = (
    frame.drop(columns=cat_cols) for frame in (train_df, test_df)
)

Concatenate encoded columns back

In [7]:
def _attach_encoded(base_df, encoded_df):
    """Return ``base_df`` with the columns of ``encoded_df`` appended on the right.

    Both frames get a fresh 0..n-1 index so rows are paired by position.
    Columns of ``base_df`` whose names already occur in ``encoded_df`` are
    dropped first. Because this cell rebinds ``train_df``/``test_df`` to its
    own output, that makes a re-run replace the encoded columns instead of
    silently appending a duplicate copy of them.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Frame to extend.
    encoded_df : pandas.DataFrame
        Encoded columns to append; must have the same number of rows.

    Returns
    -------
    pandas.DataFrame
        A new frame: remaining ``base_df`` columns followed by ``encoded_df``.

    Raises
    ------
    ValueError
        If the two frames differ in row count (a positional concat would
        otherwise pad the shorter one with NaN rows).
    """
    if len(base_df) != len(encoded_df):
        raise ValueError(
            f"Row count mismatch: base has {len(base_df)} rows, "
            f"encoded has {len(encoded_df)} rows"
        )
    # errors="ignore": on a first run none of the encoded names are present.
    base = base_df.drop(columns=encoded_df.columns, errors="ignore")
    return pd.concat(
        [base.reset_index(drop=True), encoded_df.reset_index(drop=True)],
        axis=1,
    )


train_df = _attach_encoded(train_df, encoded_train_df)
test_df = _attach_encoded(test_df, encoded_test_df)

print("Train shape after encoding:", train_df.shape)
print("Test shape after encoding :", test_df.shape)


Train shape after encoding: (125973, 125)
Test shape after encoding : (22544, 125)


Save encoded datasets

In [8]:
# Write each encoded split to its own CSV; index=False keeps the row index
# out of the files.
for frame, out_path in (
    (train_df, "../data/processed/nsl_kdd_train_encoded.csv"),
    (test_df, "../data/processed/nsl_kdd_test_encoded.csv"),
):
    frame.to_csv(out_path, index=False)

print("Encoded datasets saved successfully.")


Encoded datasets saved successfully.
