# In [None]:
import os
import warnings
import numpy as np
import pandas as pd

# ==============================================================================
#   USER CONFIGURATION SECTION
# ==============================================================================

# Active task. Must be one of: "T2", "T3", "T4", "T5", "T6"
TASK_ID = "T2"

# --- Task Specific Settings ---
# One entry per task. "ignored_cols" and "output_suffix" are optional and
# fall back to the defaults applied further down:
#   ignored_cols  -> []   (nothing excluded from the features besides the label)
#   output_suffix -> ""   (output file is named filename_list.csv)
_TASK_SETTINGS = {
    # [T2] Malicious URL Binary
    "T2": {
        "input_files": [
            "/path/to/train_malicious_phish.csv",
            "/path/to/test_malicious_phish.csv",
        ],
        "label_col": "label",
    },
    # [T3] Malicious URL Multi
    "T3": {
        "input_files": [
            "/path/to/train_malicious_phish_multi.csv",
            "/path/to/test_malicious_phish_multi.csv",
        ],
        "label_col": "label",
    },
    # [T4] Credit Card Fraud Detection
    "T4": {
        "input_files": [
            "/path/to/train_creditcard_timesplit.csv",
            "/path/to/test_creditcard_timesplit.csv",
        ],
        "label_col": "Class",
    },
    # [T5] UNSW-NB15 Binary Classification
    # Target: label (0/1). Ignore: attack_cat.
    "T5": {
        "input_files": [
            "/path/to/UNSW_NB15_training-set.parquet",
            "/path/to/UNSW_NB15_testing-set.parquet",
        ],
        "label_col": "label",
        "ignored_cols": ["attack_cat"],  # Prevent data leakage or redundancy
    },
    # [T6] UNSW-NB15 Multi-class Classification
    # Target: attack_cat. Ignore: label.
    "T6": {
        "input_files": [
            "/path/to/UNSW_NB15_training-set.parquet",
            "/path/to/UNSW_NB15_testing-set.parquet",
        ],
        "label_col": "attack_cat",
        "ignored_cols": ["label"],       # Prevent data leakage
        "output_suffix": "_multi",       # Output files will be: filename_multi_list.csv
    },
}

_selected = _TASK_SETTINGS.get(TASK_ID)
if _selected is None:
    raise ValueError(f"Unknown Task ID: {TASK_ID}")

# --- Resolved settings (defaults applied where the task does not override) ---
INPUT_FILES = _selected["input_files"]
LABEL_COL = _selected["label_col"]
IGNORED_COLS = _selected.get("ignored_cols", [])    # Columns to exclude from features (besides label)
OUTPUT_SUFFIX = _selected.get("output_suffix", "")  # Inserted before "_list.csv" in the output name

# ==============================================================================
#   CORE LOGIC (Generic Serialization)
# ==============================================================================

def _is_invalid(val) -> bool:
    """Check if value is NaN or Infinite."""
    return pd.isna(val) or (isinstance(val, (int, float, np.number)) and np.isinf(val))

def load_data(path: str) -> pd.DataFrame:
    """Load a dataset from a Parquet or CSV file.

    Args:
        path: Location of the file, given as a string or any ``os.PathLike``
            object. A name ending in ``.parquet`` (compared
            case-insensitively) is read with ``pd.read_parquet``; every other
            name is read as CSV.

    Returns:
        The file contents as a DataFrame.
    """
    # Normalise first so pathlib.Path inputs and upper-case extensions
    # (e.g. "DATA.PARQUET") are dispatched to the right reader.
    file_name = os.fspath(path)
    if file_name.lower().endswith(".parquet"):
        return pd.read_parquet(file_name)
    # low_memory=False turns off chunked parsing, which can otherwise infer
    # mixed types within a single column.
    return pd.read_csv(file_name, low_memory=False)

def _serialize_features(features: pd.DataFrame) -> list:
    """Render each row of *features* as one "col: value;" list-style string.

    Cells for which ``_is_invalid`` returns True are left out of their row.
    The result has exactly one string per row, in row order.
    """
    columns = list(features.columns)
    if not columns:
        # itertuples() yields nothing for a frame without columns; still emit
        # one (empty) entry per row so the result lines up with the labels.
        return [""] * len(features)

    # itertuples() walks the frame column-wise, so every cell keeps its own
    # column dtype. iterrows() would build one Series per row, upcasting ints
    # to floats ("1" -> "1.0") in all-numeric frames, and is far slower.
    return [
        " ".join(
            f"{col}: {val};"
            for col, val in zip(columns, values)
            if not _is_invalid(val)
        )
        for values in features.itertuples(index=False, name=None)
    ]


def main():
    """Serialize every configured input file into a list-style CSV.

    For each path in ``INPUT_FILES``:
      1. load it with ``load_data``;
      2. split off ``LABEL_COL`` and drop it, plus any ``IGNORED_COLS`` that
         are present, from the features;
      3. turn each feature row into a "col: value;" string;
      4. write a two-column CSV ("data", "label") next to the input file,
         named ``<input name><OUTPUT_SUFFIX>_list.csv``.

    Missing files and files without the label column are skipped with a
    warning. Any other error while handling a file is reported as a warning
    and processing continues with the next file.
    """
    print(f"=== Starting Serialization for Task [{TASK_ID}] ===")

    for file_path in INPUT_FILES:
        if not os.path.exists(file_path):
            warnings.warn(f"File not found: {file_path}. Skipping.")
            continue

        print(f"Processing: {os.path.basename(file_path)}")

        try:
            # 1. Load Data
            df = load_data(file_path)

            if LABEL_COL not in df.columns:
                warnings.warn(f"Label column '{LABEL_COL}' not found in {file_path}. Skipping.")
                continue

            # 2. Separate Label and Features
            y_series = df[LABEL_COL]

            # Drop Label and any specific Ignored Columns
            cols_to_drop = [LABEL_COL] + [c for c in IGNORED_COLS if c in df.columns]
            X_df = df.drop(columns=cols_to_drop)

            # 3. Serialize Features (List style only)
            list_style = _serialize_features(X_df)

            # 4. Construct Output DataFrame
            df_list = pd.DataFrame({"data": list_style, "label": y_series})

            # 5. Save Files
            base_dir = os.path.dirname(file_path)
            base_name = os.path.splitext(os.path.basename(file_path))[0]

            # Construct filename with optional suffix
            out_name_list = f"{base_name}{OUTPUT_SUFFIX}_list.csv"
            path_list = os.path.join(base_dir, out_name_list)

            df_list.to_csv(path_list, index=False)

            print(f" -> Saved: {out_name_list}")

        except Exception as e:
            # Deliberate best-effort boundary: one bad file must not stop the
            # remaining files from being processed.
            warnings.warn(f"Error processing {file_path}: {str(e)}")

    print("\n=== All Files Processed ===")

if __name__ == "__main__":
    main()