# In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# ==============================================================================
# USER CONFIGURATION SECTION
# ==============================================================================
# Select Task: "T1" (Email Binary), "T2" (URL Binary), "T3" (URL Multi)
TASK_ID = "T1"

# --- Common Settings ---
RANDOM_SEED = 42  # Seed handed to train_test_split so the split is reproducible
TEST_SIZE = 0.2   # Value handed to train_test_split as test_size
SAVE_DIR = "/path/to/output_directory"

# --- Task Specific Settings (selected automatically from TASK_ID; edit the
# --- paths/column names of the branch you intend to run) ---

if TASK_ID == "T1":
    # [T1] Phishing Email Binary Classification
    INPUT_FILE = "/path/to/Phishing_Email.csv"

    TEXT_COL = "Email Text"       # Column containing the text/data
    LABEL_COL_ORIG = "Email Type" # Original label column in CSV
    LABEL_COL_NEW = "label"       # Target label column name

    # T1 Output Filenames
    OUT_TRAIN = "train_phish_email_list.csv"
    OUT_TEST = "test_phish_email_list.csv"

elif TASK_ID == "T2":
    # [T2] Malicious URL Binary Classification
    INPUT_FILE = "/path/to/malicious_phish.csv"

    LABEL_COL_ORIG = "type"
    LABEL_COL_NEW = "label"

    # T2 Output Filenames
    OUT_TRAIN = "train_malicious_phish.csv"
    OUT_TEST = "test_malicious_phish.csv"

elif TASK_ID == "T3":
    # [T3] Malicious URL Multi-class Classification
    INPUT_FILE = "/path/to/malicious_phish.csv"

    LABEL_COL_ORIG = "type"
    LABEL_COL_NEW = "label"

    # T3 Output Filenames
    OUT_TRAIN = "train_malicious_phish_multi.csv"
    OUT_TEST = "test_malicious_phish_multi.csv"

else:
    # Fail fast: without this branch an unsupported TASK_ID leaves INPUT_FILE,
    # LABEL_COL_* and OUT_* undefined and the pipeline dies later with an
    # unrelated-looking NameError.
    raise ValueError(
        f"Unknown TASK_ID {TASK_ID!r}; expected one of 'T1', 'T2', 'T3'."
    )

# ==============================================================================
# MAIN PIPELINE
# ==============================================================================

def main():
    """Build a stratified train/test split for the task selected by TASK_ID.

    Reads INPUT_FILE, derives the LABEL_COL_NEW column according to the task
    (T1: fixed phishing/safe mapping, T2: benign vs. everything else,
    T3: one integer code per category), splits the rows with
    ``train_test_split`` (stratified on the label, using TEST_SIZE and
    RANDOM_SEED) and writes both parts as CSV files into SAVE_DIR.

    All settings are taken from the module-level configuration constants.

    Raises:
        FileNotFoundError: If INPUT_FILE does not exist.
        KeyError: If a column required by the task is absent from the CSV.
        ValueError: If TASK_ID is unsupported, a source label is missing or
            cannot be mapped, or no rows remain after preprocessing.
    """
    # Reject unsupported tasks before touching the filesystem.
    if TASK_ID not in ("T1", "T2", "T3"):
        raise ValueError(
            f"Unknown TASK_ID {TASK_ID!r}; expected one of 'T1', 'T2', 'T3'."
        )

    # 1. Setup Directories
    os.makedirs(SAVE_DIR, exist_ok=True)

    # 2. Load Data
    print(f"=== Loading Data for Task [{TASK_ID}] ===")
    if not os.path.exists(INPUT_FILE):
        raise FileNotFoundError(f"File not found: {INPUT_FILE}")

    df = pd.read_csv(INPUT_FILE, low_memory=False)
    # Strip whitespace around header names so the configured names match.
    df.columns = df.columns.str.strip()
    print(f"Initial Rows: {len(df):,}, Cols: {len(df.columns)}")

    # Report absent columns up front instead of failing deep inside pandas.
    required_cols = [LABEL_COL_ORIG]
    if TASK_ID == "T1":
        required_cols.append(TEXT_COL)
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"Missing required column(s) in {INPUT_FILE}: {missing_cols}"
        )

    # 3. Preprocessing & Label Generation
    if TASK_ID == "T1":
        # Remove nulls/empty strings. The explicit copy keeps the column
        # assignment below from writing into a view of the loaded frame
        # (which would trigger pandas' SettingWithCopyWarning).
        text = df[TEXT_COL]
        has_text = text.notna() & (text.astype(str).str.strip() != "")
        df = df.loc[has_text].copy()

        # Map Labels: Phishing=1, Safe=0. Unknown or missing labels become
        # NaN here and are rejected by the validation step further down.
        label_mapping = {"Phishing Email": 1, "Safe Email": 0}
        df[LABEL_COL_NEW] = df[LABEL_COL_ORIG].map(label_mapping)

        # Rename text column to 'data' and filter columns
        df = df.rename(columns={TEXT_COL: "data"})
        df = df[["data", LABEL_COL_NEW]]

    else:
        # T2/T3: a missing source label would otherwise slip through
        # unnoticed -- T2 would label it malicious (NaN != "benign") and
        # T3 would encode it as category code -1, which the null check
        # below cannot detect.
        n_missing = int(df[LABEL_COL_ORIG].isnull().sum())
        if n_missing:
            raise ValueError(
                f"Error: {n_missing:,} row(s) have a missing value in "
                f"source label column '{LABEL_COL_ORIG}'."
            )

        if TASK_ID == "T2":
            # Map Labels: Benign=0, Others=1 (vectorised comparison)
            df[LABEL_COL_NEW] = (df[LABEL_COL_ORIG] != "benign").astype("int64")
        else:
            # Map Labels: Category Codes (Multi-class). Convert once and
            # reuse the result for both the codes and the printed mapping.
            as_category = df[LABEL_COL_ORIG].astype('category')
            df[LABEL_COL_NEW] = as_category.cat.codes

            # Print Mapping for reference
            print("Label Mapping:", dict(enumerate(as_category.cat.categories)))

        # Drop original label column
        df = df.drop(columns=[LABEL_COL_ORIG])

    # Validate Labels
    if df[LABEL_COL_NEW].isnull().any():
        raise ValueError("Error: Null values found in target label column.")
    if df.empty:
        raise ValueError("Error: No rows left to split after preprocessing.")

    print(f"Processed Rows: {len(df):,}")
    print(f"Class Distribution:\n{df[LABEL_COL_NEW].value_counts()}")

    # 4. Train/Test Split (stratified so both parts keep the class ratios)
    train_df, test_df = train_test_split(
        df,
        test_size=TEST_SIZE,
        random_state=RANDOM_SEED,
        stratify=df[LABEL_COL_NEW]
    )

    print(f"Train set: {len(train_df):,}")
    print(f"Test set:  {len(test_df):,}")

    # 5. Serialization (Save)
    train_path = os.path.join(SAVE_DIR, OUT_TRAIN)
    test_path = os.path.join(SAVE_DIR, OUT_TEST)

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print("\n=== Processing Complete ===")
    print(f"Saved Train: {train_path}")
    print(f"Saved Test:  {test_path}")

if __name__ == "__main__":
    main()