In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Feature engineering: add per-second rate columns to both frames, in place.
for df in [train_df, test_df]:
    # Zero durations become NaN first, so the divisions below yield NaN
    # rather than +/-inf for those rows.
    nonzero_dur = df['dur'].replace(0, np.nan)
    df['packets_per_sec'] = df['spkts'].div(nonzero_dur)
    df['bytes_per_sec'] = df['sbytes'].div(nonzero_dur)
    # Turn any infinite value still present anywhere in the frame into NaN.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Columns removed from the feature matrices: the target columns plus what
# appear to be record identifiers (id / IP addresses) -- verify against the
# dataset schema.
label_cols = ['label', 'attack_cat', 'id', 'srcip', 'dstip']

# Targets come straight from the 'label' column of each frame.
y_train, y_test = train_df['label'], test_df['label']

# Features are everything else; errors='ignore' skips any listed column
# that is absent from a frame instead of raising.
X_train, X_test = (
    frame.drop(columns=label_cols, errors='ignore')
    for frame in (train_df, test_df)
)

# Column groups are derived from the training features only.
# NOTE: a column whose dtype is neither numeric nor object (such as bool or
# category) ends up in neither list.
numeric_features = X_train.select_dtypes(include='number').columns.to_list()
categorical_features = X_train.select_dtypes(include='object').columns.to_list()

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training features only; the test features are transformed with
# the already-fitted preprocessor.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Training shape after preprocessing:", X_train_processed.shape)
print("Testing shape after preprocessing:", X_test_processed.shape)