In [13]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from IPython.display import display
import ipywidgets as widgets

# Directory that holds the input CSV and receives the generated CSV outputs.
DATA_DIR = Path("balanced_output")  # fixed path name
# File name (inside DATA_DIR) of the dataset used for feature ranking.
DATA_FILE = "chunk_int_data.csv"
# Column used as the classification target.
TARGET_COLUMN = "source"  # change to the actual target column name
# Columns removed from the feature matrix before training, when present.
IGNORE_COLUMNS = {"saddr", "daddr"}
# Features whose importance is >= this value are kept as "selected".
IMPORTANCE_THRESHOLD = 0.01
# Seed shared by the train/test split and the random forest.
RANDOM_STATE = 42
# Number of trees in the random forest.
N_ESTIMATORS = 300
# Fraction of rows held out by the train/test split.
TEST_SIZE = 0.2


# Progress widgets shared by every stage of the notebook: an integer bar
# (0..6) plus an HTML label carrying the current stage description.
stage_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=6,
    description="Stage",
    bar_style="info",
)
detail_label = widgets.HTML(value="<b>Waiting...</b>")
display(widgets.VBox(children=[stage_bar, detail_label]))

def _update(stage: int, text: str):
    """Move the progress bar to ``stage`` and show ``text`` in bold below it.

    Args:
        stage: New value for the module-level ``stage_bar`` widget.
        text: Message rendered (as bold HTML) in ``detail_label``.
    """
    detail_markup = "<b>{}</b>".format(text)
    stage_bar.value = stage
    detail_label.value = detail_markup

def load_dataset(path: Path) -> pd.DataFrame:
    """Read the CSV at ``path`` and verify that it contains the target column.

    Args:
        path: Location of the CSV file to load.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If ``TARGET_COLUMN`` is not one of the CSV's columns.
    """
    _update(1, f"Loading dataset: {path}")
    frame = pd.read_csv(path)
    if TARGET_COLUMN in frame.columns:
        return frame
    raise ValueError(f"Target column '{TARGET_COLUMN}' not found in {path}")


def _encode_non_numeric(X: pd.DataFrame) -> pd.DataFrame:
    converted = X.copy()
    non_numeric_cols = converted.select_dtypes(exclude=["number"]).columns.tolist()
    for col in non_numeric_cols:
        as_numeric = pd.to_numeric(converted[col], errors="coerce")
        if as_numeric.notna().any():
            converted[col] = as_numeric.fillna(as_numeric.mean())
        else:
            converted[col] = converted[col].astype("category").cat.codes.astype("int32")
    return converted

def prepare_features(df: pd.DataFrame):
    """Split ``df`` into a float32 feature matrix and an int32 target vector.

    The target column and any present ``IGNORE_COLUMNS`` are removed from the
    features; remaining non-numeric feature columns are encoded numerically.
    A non-numeric target is replaced by its category codes.

    Returns:
        Tuple ``(X, y)``: features cast to float32 and target cast to int32.
    """
    _update(2, "Preparing features")

    target = df[TARGET_COLUMN]
    ignored_present = [name for name in IGNORE_COLUMNS if name in df.columns]
    features = _encode_non_numeric(
        df.drop(columns=[TARGET_COLUMN] + ignored_present)
    )

    # Non-numeric labels are mapped to integer category codes.
    if not pd.api.types.is_numeric_dtype(target):
        target = target.astype("category").cat.codes

    return features.astype("float32"), target.astype("int32")


def rank_features(X: pd.DataFrame, y: pd.Series) -> pd.Series:
    """Fit a random forest on ``(X, y)`` and rank the columns of ``X``.

    Returns:
        Series of the forest's ``feature_importances_`` indexed by column
        name and sorted from most to least important.
    """
    _update(4, f"Training RandomForest (n_estimators={N_ESTIMATORS})")
    forest = RandomForestClassifier(
        n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=-1
    )
    forest.fit(X, y)

    _update(5, "Computing feature importances")
    ranked = pd.Series(forest.feature_importances_, index=list(X.columns))
    return ranked.sort_values(ascending=False)


def main():
    """Rank features with a random forest and write the results to DATA_DIR.

    Steps: load the dataset, build the feature matrix and target, hold out a
    test split, fit the forest on the training part, then print a summary and
    write ``feature_importance.csv`` and ``selected_features.csv``.
    """
    dataset_path = DATA_DIR / DATA_FILE
    df = load_dataset(dataset_path)
    X, y = prepare_features(df)

    _update(3, "Splitting train/test")
    # A stratified split raises ValueError when any class has fewer than two
    # samples, so stratify only when every class is large enough.
    _labels, class_counts = np.unique(y, return_counts=True)
    has_multiple_classes = len(class_counts) > 1
    can_stratify = has_multiple_classes and class_counts.min() >= 2
    if has_multiple_classes and not can_stratify:
        print("Warning: a class has fewer than 2 samples; splitting without stratification.")
    # Only the training part is used below; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=y if can_stratify else None,
    )

    feature_importance = rank_features(X_train, y_train)
    selected_features = feature_importance[
        feature_importance >= IMPORTANCE_THRESHOLD
    ].index.tolist()
    dropped_features = feature_importance.index.difference(selected_features).tolist()

    _update(6, "Saving outputs")
    print("Top 20 features by importance:")
    print(feature_importance.head(20))

    print("\nKept feature count:", len(selected_features))
    print("Dropped feature count:", len(dropped_features))

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    feature_importance.to_csv(DATA_DIR / "feature_importance.csv", header=["importance"])
    pd.Series(selected_features).to_csv(DATA_DIR / "selected_features.csv", index=False, header=["feature"])

    detail_label.value = "<b>Done</b>"


# Run the pipeline only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

VBox(children=(IntProgress(value=0, bar_style='info', description='Stage', max=6), HTML(value='<b>Waiting...</…

Top 20 features by importance:
stime      0.135513
ltime      0.121690
dport      0.113955
proto      0.059809
seq        0.058971
sport      0.057938
flgs       0.052851
sbytes     0.051870
spkts      0.050684
dpkts      0.050230
pkts       0.045153
state      0.040938
mean       0.032215
pkSeqID    0.022010
dur        0.020890
bytes      0.015328
dbytes     0.014841
min        0.012477
sum        0.012227
drate      0.009346
dtype: float64

Kept feature count: 19
Dropped feature count: 5


In [None]:
# Input files for the column-dropping step (both live in DATA_DIR).
dataset_path = DATA_DIR / DATA_FILE
test_dataset_path = DATA_DIR / "test_data.csv"

# Columns removed from both files before the trimmed copies are written.
# NOTE(review): this list is hand-maintained rather than derived from
# selected_features.csv; the recorded ranking output scores e.g. sbytes and
# dpkts above IMPORTANCE_THRESHOLD, yet they are dropped here. Confirm this
# is intentional.
columns_to_drop = ["daddr", "saddr", "rate", "dbytes", "min", "sum", "bytes", "mean", "stddev", "max", "dpkts", "sbytes", "srate", "drate"]

def _process_file(input_path: Path, output_filename: str, label: str):
    """Drop ``columns_to_drop`` from one CSV and save the result in DATA_DIR.

    Args:
        input_path: CSV file to read.
        output_filename: Name of the CSV written inside ``DATA_DIR``.
        label: Short tag (e.g. "train") used in progress and log messages.

    Returns:
        Tuple ``(cleaned_frame, output_path)``.
    """
    _update(1, f"Loading dataset: {input_path.name}")
    raw = pd.read_csv(input_path)

    _update(2, f"Dropping specified columns from {label}")
    # errors="ignore": columns missing from this file are skipped silently.
    trimmed = raw.drop(columns=columns_to_drop, errors="ignore")
    print(f"Sample data ({label}):\n", trimmed.head())

    destination = DATA_DIR / output_filename
    _update(6, f"Saving extracted features to {output_filename}")
    trimmed.to_csv(destination, index=False)
    print(f"Wrote {len(trimmed)} rows to {destination}")

    return trimmed, destination

# Trim the training chunk and the test file with the same column list.
df, output_path = _process_file(
    input_path=dataset_path,
    output_filename="extracted_features_chunk.csv",
    label="train",
)
test_df, test_output_path = _process_file(
    input_path=test_dataset_path,
    output_filename="extracted_features_test.csv",
    label="test",
)

# Final status message once both files have been written.
detail_label.value = "<b>Saved extracted_features_chunk.csv and extracted_features_test.csv</b>"

Sample data (train):
    pkSeqID         stime  flgs  proto  sport  dport  pkts  state  \
0    66424  1.556087e+09     7    116  57056  10081     2     14   
1   747890  1.556143e+09     7    116  57839   1900    24     14   
2   139150  1.554314e+09     1    110  42100   7878     0      6   
3   137880  1.556085e+09    11    110  61775  41952    16     18   
4    81498  1.556085e+09     7    110  19824   6517     1     14   

          ltime  seq       dur  spkts  source  
0  1.556087e+09    0  4.964505      2       0  
1  1.556143e+09    0  3.312139     24       1  
2  1.554314e+09    0  0.000000      0       1  
3  1.556085e+09    0  0.012719      7       1  
4  1.556085e+09    0  0.000000      1       0  
Wrote 1854724 rows to balanced_output\extracted_features_chunk.csv
Sample data (test):
    pkSeqID         stime flgs proto  sport  dport  pkts state         ltime  \
0   805352  1.556135e+09   S0   tcp    802    802   120    S0  1.556135e+09   
1   175623  1.554345e+09  OTH   tcp