In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [24]:
# Setup paths
RAW_DATA_DIR = Path("data/raw_data")
CLEANED_DATA_DIR = Path("data/cleaned_data")
FEATURES_DIR = Path("data/features")
CLEANED_DATA_DIR.mkdir(exist_ok=True, parents=True)
FEATURES_DIR.mkdir(exist_ok=True, parents=True)

## WASH et al. (2021) Dataset Processing
### Step 1: Cleaning with renaming readable feature names

In [25]:
class Wash2021Processor:
    
    @staticmethod
    def process():
        print("Processing WASH 2021 Dataset...")
        
        # Load raw data
        df = pd.read_csv("data/raw_data/phishing_wash_2021.csv")
        print(f"Raw: {len(df)} rows, {len(df.columns)} columns")
        
        # Complete feature mapping including qualitative columns
        features = {
            # Demographics & Background
            'age': 'age',
            'gender': 'gender', 
            'gender_other': 'gender_3_TEXT',
            'ethnicity': 'ethnicity',
            'ethnicity_other': 'ethnicity_9_TEXT',
            'education_level': 'education',
            'employment_status': 'employment',
            'annual_income': 'income',
            'has_it_training': 'expert_training',
            'has_it_job': 'expert_job',
            'can_recall_phishing': 'recall_email',
            
            # Digital Literacy Scale
            'digital_literacy_wiki': 'digital_literacy_1',
            'digital_literacy_meme': 'digital_literacy_2',
            'digital_literacy_phishing': 'digital_literacy_3',
            'digital_literacy_bookmark': 'digital_literacy_4',
            'digital_literacy_cache': 'digital_literacy_5',
            'digital_literacy_ssl': 'digital_literacy_6',
            'digital_literacy_ajax': 'digital_literacy_7',
            'digital_literacy_rss': 'digital_literacy_8',
            'digital_literacy_other': 'digital_literacy_9',
            
            # QUALITATIVE ELICITATIONS
            'unsafe_email_ways_list': 'elicitation1',
            'suspicious_email_recognition': 'elicitation2', 
            'suspicious_emails_received': 'elicitation3',
            
            # EMAIL SELECTION & BRIEF DESCRIPTIONS
            'email_brief_summary': 'brief_summary',
            'what_made_email_suspicious': 'describe_suspicious',
            'what_made_decision_hard': 'describe_hard',
            'what_email_asked_todo': 'describe_ask',
            
            # Emotional Responses
            'emotion_dread': 'emotions_1',
            'emotion_terror': 'emotions_2',
            'emotion_anxiety': 'emotions_3',
            'emotion_nervous': 'emotions_4',
            'emotion_scared': 'emotions_5',
            'emotion_panic': 'emotions_6',
            'emotion_fear': 'emotions_7',
            'emotion_worry': 'emotions_8',
            
            # NOTICING: What They Noticed About The Email
            'email_features_noticed': 'notice1',
            'email_recency': 'notice2',
            'email_account_type': 'notice3',
            'email_account_type_other': 'notice3_4_TEXT',
            'email_content_type': 'notice4',
            'email_content_type_other': 'notice4_3_TEXT',
            'email_sender_type': 'notice5',
            'email_sender_type_other': 'notice5_5_TEXT',
            
            # EXPECTING: Context and Expectations
            'felt_similar_before': 'expect1',
            'previous_sender_emails': 'expect2',
            'previous_sender_interaction': 'expect3',
            'sender_relationship_duration': 'expect4',
            'expected_this_email': 'expect5',
            'email_seemed_different': 'expect6',
            
            # SUSPECTING: What Made Them Suspicious
            'actions_requested': 'suspect1',
            'sender_issues': 'suspect2',
            'subject_line_issues': 'suspect3',
            'email_body_issues': 'suspect4',
            'overall_suspicion': 'suspect5',
            'suspicion_confidence': 'suspect5_sure_1',
            
            # INVESTIGATING: How They Investigated
            'investigation_methods': 'investigate1',
            'investigation_methods_other': 'investigate1_9_TEXT',
            'contacted_sender_how': 'investigate2',
            
            # DECIDING: Decision Process and Actions
            'final_decision': 'decide',
            'decision_confidence': 'decide_sure_1',
            'actions_with_email': 'act',
            
            # HARM & FULL STORY
            'perceived_harm': 'harm',
            'detailed_incident_narrative': 'full_story',
            'story_recall_difficulty': 'full_story_easy',
            
            # Cybersecurity History
            'previous_incidents': 'victim'
        }
        
        # Create cleaned dataframe
        cleaned = pd.DataFrame()
        missing_cols = []
        
        for new_col, old_col in features.items():
            if old_col in df.columns:
                cleaned[new_col] = df[old_col]
            else:
                print(f"Warning: Column '{old_col}' not found, setting '{new_col}' to NaN")
                cleaned[new_col] = np.nan
                missing_cols.append(old_col)
        
        print(f"Initial mapping: {len(cleaned)} rows, {len(cleaned.columns)} features")
        
        initial_count = len(cleaned)

        # Filter 1: Must be able to recall phishing emails
        if 'can_recall_phishing' in cleaned.columns:
            cleaned = cleaned[cleaned['can_recall_phishing'].str.contains('Yes', case=False, na=False)]
            print(f"After phishing recall filter: {len(cleaned)} rows (removed {initial_count - len(cleaned)})")
        
        # Filter 2: Keep only participants who responded to final decision
        if 'final_decision' in cleaned.columns:
            cleaned = cleaned[cleaned['final_decision'].notna() & (cleaned['final_decision'].str.strip() != '')]
            print(f"After final decision filter: {len(cleaned)} rows (removed {initial_count - len(cleaned)})")

        for col in cleaned.select_dtypes(include=['object']).columns:
            cleaned[col] = cleaned[col].str.replace(r'[\r\n]+', ' ', regex=True).str.strip()
        
            
        return cleaned
    

### Step 2. Feature Engineering

##### **Purpose**: Extract behavioral decision patterns from real phishing encounters

##### **Unique Value**: Real-world authentic phishing incidents with genuine emotional and behavioral responses

##### **Research Questions Enabled**:
1. "What personal factors predict risky vs safe email decisions?"
2. "How do emotions influence phishing susceptibility?" 
3. "Which investigation behaviors correlate with accurate decisions?"
4. "Can we predict decision confidence from contextual factors?"

#### **X_inputs**:
1. Demographics (5): Age, gender, education, employment, income
2. Cybersecurity background (7): Previous incidents, IT training/job status  
3. Digital Literacy Scale (10): Technical knowledge across 9 domains + composite score
4. Emotional Response Profile (9): Fear, anxiety, panic, worry etc. + composite score
5. Investigation Behaviors (3): How they verified sender, links, external sources
6. Email Context & Characteristics (16): Recency, account type, content type, sender relationship
7. Confidence & Risk Perception (3): Suspicion levels, confidence scores to detect phishing, perceived harm to phishing incidents

#### **Y_targets**:
1. Primary Decision: Safe vs Unsafe classification of phishing incident (binary)
2. Decision Confidence: How certain they were about the classification (0-10 scale, standardized)
3. Behavioral Actions: Clicked links, reported spam, deleted, ignored (4 binary indicators)

In [26]:
def create_ml_optimized(df):
    """
    Transform cleaned WASH data into ML-ready features with proper encodings
    based on actual data values and ML model feature requirements
    """
    
    ml_df = pd.DataFrame()
    scaler = StandardScaler()
    
    print("Creating ML-optimized WASH features...")
    print(f"Input data: {len(df)} rows, {len(df.columns)} columns")
    
    # ===================================================================
    # 1. DEMOGRAPHICS & BACKGROUND (5 features)
    # ===================================================================
    
    # Age categories
    age_bins = [0, 25, 35, 55, 75, 100]
    age_labels = [1, 2, 3, 4, 5]  # 1=youngest, 5=oldest
    age_cat = pd.cut(df['age'].astype(float), bins=age_bins, labels=age_labels, include_lowest=True)
    ml_df['age_category'] = age_cat.astype(float).fillna(3) 
    
    # Gender encoding (binary only)
    gender_map = {'Man': 1, 'Woman': 0}
    ml_df['gender'] = df['gender'].map(gender_map).fillna(0) 
    
    # Education level
    education_map = {
        # Low Education (1): Below high school
        'None, or grades 1-8': 1, 
        'Some high school': 1,
        
        # Medium-Low Education (2): High school / Trade school
        'High school graduate or GED certificate': 2,
        'Technical, trade, or vocational school AFTER high school': 2, 
        
        # Medium-High Education (3): Some college / Bachelor's
        'Some college, no 4-year degree': 3,
        '4-year college degree': 3,
        
        # High Education (4): Graduate/Professional degree
        'Some postgraduate or professional schooling, no postgraduate degree': 4,
        "Postgraduate or professional degree, including master's, doctorate, medical or law degree": 4
    }
    ml_df['education_level'] = df['education_level'].map(education_map).fillna(2) 
    
    # Employment status (binary: employed vs not employed)
    employment_map = {
        'Employed full time': 1, 
        'Employed part time': 1,
        'Unemployed looking for work': 0,
        'Unemployed not looking for work': 0, 
        'Retired': 0,
        'Student': 0 
    }
    ml_df['employment_status'] = df['employment_status'].map(employment_map).fillna(0)
    
    # Annual income
    income_map = {
        # Low Income (1):
        'Less than $25,000': 1, 
        '$25,000 to $34,999': 1,
        
        # Lower-Middle Income (2):
        '$35,000 to $49,999': 2,
        '$50,000 to $74,999': 2, 
        
        # Upper-Middle Income (3):
        '$75,000 to $99,999': 3,
        '$100,000 to $149,999': 3,
        
        # High Income (4):
        '$150,000 to $199,999': 4,
        '$200,000 or more': 4
    }
    ml_df['annual_income'] = df['annual_income'].map(income_map).fillna(2)  # Default to lower-middle
    
    # ===================================================================
    # 2. IT BACKGROUND & SECURITY HISTORY (3 features)
    # ===================================================================
    
    # IT training and job status (binary)
    ml_df['has_it_training'] = (df['has_it_training'] == 'Yes').astype(int)
    ml_df['has_it_job'] = (df['has_it_job'] == 'Yes').astype(int)
    
    # Previous incidents - ONE-HOT ENCODED for different incident types
    if 'previous_incidents' in df.columns:
        ml_df['previous_incidents_phishing_email'] = df['previous_incidents'].str.contains(
            'Fell victim to a phishing email message or other scam email', na=False
        ).astype(int)
        ml_df['previous_incidents_data_breach'] = df['previous_incidents'].str.contains(
            'Received a notification from a company that your information was involved in a data breach', na=False
        ).astype(int)
        ml_df['previous_incidents_computer_virus'] = df['previous_incidents'].str.contains(
            'Had a virus on your computer or mobile device', na=False
        ).astype(int)
        ml_df['previous_incidents_device_hacked'] = df['previous_incidents'].str.contains(
            'Someone broke in or hacked your computer, mobile device, or account', na=False
        ).astype(int)
        ml_df['previous_incidents_credit_card_fraud'] = df['previous_incidents'].str.contains(
            'Stranger used your credit card number without your knowledge or permission', na=False
        ).astype(int)
        ml_df['previous_incidents_identity_theft'] = df['previous_incidents'].str.contains(
            'Identity theft more extensive than use of your credit card number without permission', na=False
        ).astype(int)
        
        # Overall indicator: any security incident (excluding "None of the above")
        ml_df['previous_incidents_any'] = (~df['previous_incidents'].str.contains('None of the above', na=True)).astype(int)
    else:
        ml_df['previous_incidents_phishing_email'] = 0
        ml_df['previous_incidents_data_breach'] = 0
        ml_df['previous_incidents_computer_virus'] = 0
        ml_df['previous_incidents_device_hacked'] = 0
        ml_df['previous_incidents_credit_card_fraud'] = 0
        ml_df['previous_incidents_identity_theft'] = 0
        ml_df['previous_incidents_any'] = 0
    
    # ===================================================================
    # 3. DIGITAL LITERACY (10 features) 
    # ===================================================================
    
    # Digital literacy scale: None=1, Little=2, Some=3, Good=4, Full=5
    literacy_map = {'None': 1, 'Little': 2, 'Some': 3, 'Good': 4, 'Full': 5}
    
    literacy_fields = [
        'digital_literacy_wiki', 'digital_literacy_meme', 'digital_literacy_phishing',
        'digital_literacy_bookmark', 'digital_literacy_cache', 'digital_literacy_ssl',
        'digital_literacy_ajax', 'digital_literacy_rss', 'digital_literacy_other'
    ]
    
    for field in literacy_fields:
        if field in df.columns:
            encoded_vals = df[field].map(literacy_map).fillna(1) 
            ml_df[field] = pd.Series(
                scaler.fit_transform(encoded_vals.values.reshape(-1, 1)).flatten(),
                index=df.index
            )
        else:
            # Create placeholder if missing
            ml_df[field] = 0.0
    
    # Digital literacy total score (average of all components)
    literacy_cols = [field for field in literacy_fields if field in ml_df.columns]
    if literacy_cols:
        ml_df['digital_literacy_total'] = ml_df[literacy_cols].mean(axis=1)
    else:
        ml_df['digital_literacy_total'] = 0.0
    
    # ===================================================================
    # 4. EMOTIONAL RESPONSE (9 features)
    # ===================================================================
    
    # Emotion scale: Not at all=1, Somewhat=2, Moderately=3, Quite a bit=4, An extreme amount=5
    emotion_map = {
        'Not at all': 1, 
        'Somewhat': 2, 
        'Moderately': 3, 
        'Quite a bit': 4, 
        'An extreme amount': 5
    }
    
    emotion_fields = [
        'emotion_dread', 'emotion_terror', 'emotion_anxiety', 'emotion_nervous',
        'emotion_scared', 'emotion_panic', 'emotion_fear', 'emotion_worry'
    ]
    
    for field in emotion_fields:
        if field in df.columns:
            encoded_vals = df[field].map(emotion_map).fillna(1)  # Default to "Not at all"
            ml_df[field] = pd.Series(
                scaler.fit_transform(encoded_vals.values.reshape(-1, 1)).flatten(),
                index=df.index
            )
        else:
            ml_df[field] = 0.0
    
    # Emotion total score (average emotional intensity)
    emotion_cols = [field for field in emotion_fields if field in ml_df.columns]
    if emotion_cols:
        ml_df['emotion_total'] = ml_df[emotion_cols].mean(axis=1)
    else:
        ml_df['emotion_total'] = 0.0
    
    # ===================================================================
    # 5. INVESTIGATION BEHAVIORS (3 features)
    # ===================================================================
    
    # Parse investigation methods (multi-select field)
    if 'investigation_methods' in df.columns:
        ml_df['investigated_sender'] = df['investigation_methods'].str.contains(
            'Looked more closely at the the email address|Asked someone else', na=False
        ).astype(int)
        
        ml_df['investigated_links'] = df['investigation_methods'].str.contains(
            'Hovered over|Clicked on one or more of the links', na=False
        ).astype(int)
        
        ml_df['investigated_external'] = df['investigation_methods'].str.contains(
            'Looked at email headers|Opened the attachment', na=False
        ).astype(int)
    else:
        ml_df['investigated_sender'] = 0
        ml_df['investigated_links'] = 0
        ml_df['investigated_external'] = 0
    
    # ===================================================================
    # 6. EMAIL CONTEXT & CHARACTERISTICS (12+ features with one-hot encoding)
    # ===================================================================
    
    # Email recency (1=most recent to 5=oldest)
    recency_map = {
        'Within the last day': 1,
        'Within the last week': 2, 
        'Within the last month': 3,
        'Within the last year': 4,
        'Longer than one year ago': 5
    }
    ml_df['email_recency'] = df['email_recency'].map(recency_map).fillna(3) 
    
    # Email account type - ONE-HOT ENCODED
    if 'email_account_type' in df.columns:
        ml_df['email_account_work'] = (df['email_account_type'] == 'Work Email account').astype(int)
        ml_df['email_account_student'] = (df['email_account_type'] == 'Student Email account').astype(int)
        ml_df['email_account_personal'] = (df['email_account_type'] == 'Personal Email account').astype(int)
    else:
        ml_df['email_account_work'] = 0
        ml_df['email_account_student'] = 0
        ml_df['email_account_personal'] = 1  
    
    # Email content type - ONE-HOT ENCODED
    if 'email_content_type' in df.columns:
        ml_df['email_content_work_related'] = (df['email_content_type'] == 'This email was related to work').astype(int)
        ml_df['email_content_personal'] = (df['email_content_type'] == 'This email was of a personal nature').astype(int)
    else:
        ml_df['email_content_work_related'] = 0
        ml_df['email_content_personal'] = 1  
    
    # Email sender type - ONE-HOT ENCODED
    if 'email_sender_type' in df.columns:
        ml_df['email_sender_work_colleague'] = (df['email_sender_type'] == 'A work colleague').astype(int)
        ml_df['email_sender_friend_family'] = (df['email_sender_type'] == 'A close friend or family member').astype(int)
        ml_df['email_sender_acquaintance'] = (df['email_sender_type'] == 'An acquaintance from outside work').astype(int)
        ml_df['email_sender_organization'] = (df['email_sender_type'] == 'A company, business or other organization').astype(int)
    else:
        ml_df['email_sender_work_colleague'] = 0
        ml_df['email_sender_friend_family'] = 0
        ml_df['email_sender_acquaintance'] = 0
        ml_df['email_sender_organization'] = 1 
    
    # Sender relationship duration (ordinal scale)
    duration_map = {
        'One month or less': 1,
        'Between one month and one year': 2,
        'One to two years': 3,
        'Two to five years': 4,
        'Five to ten years': 5,
        'More than 10 years': 6
    }
    ml_df['sender_relationship_duration'] = df['sender_relationship_duration'].map(duration_map).fillna(1)
    
    # Expected this email (binary)
    expected_map = {'Yes': 1, 'No': 0, "I'm not sure": 0}
    ml_df['expected_this_email'] = df['expected_this_email'].map(expected_map).fillna(0)
    
    # Felt similar before (Likert scale 1-5)
    likert_map = {
        'Strongly disagree': 1, 
        'Somewhat disagree': 2, 
        'Neither agree nor disagree': 3,
        'Somewhat agree': 4, 
        'Strongly agree': 5
    }
    ml_df['felt_similar_before'] = df['felt_similar_before'].map(likert_map).fillna(3)
    
    # Previous sender emails (binary)
    if 'previous_sender_emails' in df.columns:
        ml_df['previous_sender_emails'] = (df['previous_sender_emails'] == 'Yes').astype(int)
    else:
        ml_df['previous_sender_emails'] = 0
    
    # Previous sender interaction (binary) 
    if 'previous_sender_interaction' in df.columns:
        ml_df['previous_sender_interaction'] = (df['previous_sender_interaction'] == 'Yes').astype(int)
    else:
        ml_df['previous_sender_interaction'] = 0
    
    # Email seemed different (Likert scale 1-5)
    if 'email_seemed_different' in df.columns:
        ml_df['email_seemed_different'] = df['email_seemed_different'].map(likert_map).fillna(3)
    else:
        ml_df['email_seemed_different'] = 3
    
    # Parse noticed features (binary indicators)
    if 'email_features_noticed' in df.columns:
        ml_df['noticed_sender_issues'] = df['email_features_noticed'].str.contains(
            "Sender's name", na=False
        ).astype(int)
        
        ml_df['noticed_content_issues'] = df['email_features_noticed'].str.contains(
            'What the email was about|Length of the email|Information missing', na=False
        ).astype(int)
        
        ml_df['noticed_technical_issues'] = df['email_features_noticed'].str.contains(
            'Link|Formatting|Mistakes|File', na=False
        ).astype(int)
    else:
        ml_df['noticed_sender_issues'] = 0
        ml_df['noticed_content_issues'] = 0
        ml_df['noticed_technical_issues'] = 0
    
    # Actions requested (multi-select, one-hot encoded)
    if 'actions_requested' in df.columns:
        ml_df['actions_requested_click_link'] = df['actions_requested'].str.contains(
            'Click on a link or button', na=False
        ).astype(int)
        ml_df['actions_requested_open_attachment'] = df['actions_requested'].str.contains(
            'Open something that was attached to the email', na=False
        ).astype(int)
        ml_df['actions_requested_respond_info'] = df['actions_requested'].str.contains(
            'Respond to the email with some information', na=False
        ).astype(int)
        ml_df['actions_requested_external_action'] = df['actions_requested'].str.contains(
            'Take some action outside of the email', na=False
        ).astype(int)
    else:
        ml_df['actions_requested_click_link'] = 0
        ml_df['actions_requested_open_attachment'] = 0
        ml_df['actions_requested_respond_info'] = 0
        ml_df['actions_requested_external_action'] = 0
    
    # Sender issues (one-hot encoded)
    if 'sender_issues' in df.columns:
        ml_df['sender_issues_none'] = df['sender_issues'].str.contains(
            "I didn't notice anything that felt off about the sender", na=False
        ).astype(int)
        ml_df['sender_issues_name_different'] = df['sender_issues'].str.contains(
            "The sender's name looked different than I would expect", na=False
        ).astype(int)
        ml_df['sender_issues_email_different'] = df['sender_issues'].str.contains(
            "The sender's email address looked different than I would expect", na=False
        ).astype(int)
    else:
        ml_df['sender_issues_none'] = 1
        ml_df['sender_issues_name_different'] = 0
        ml_df['sender_issues_email_different'] = 0
    
    # Subject line issues (one-hot encoded)
    if 'subject_line_issues' in df.columns:
        ml_df['subject_line_issues_none'] = df['subject_line_issues'].str.contains(
            "I didn't notice anything that felt off about the subject line", na=False
        ).astype(int)
        ml_df['subject_line_issues_different'] = df['subject_line_issues'].str.contains(
            "The subject line was different than I would expect", na=False
        ).astype(int)
    else:
        ml_df['subject_line_issues_none'] = 1
        ml_df['subject_line_issues_different'] = 0
    
    # Email body issues (multi-select, one-hot encoded)
    if 'email_body_issues' in df.columns:
        ml_df['email_body_issues_none'] = df['email_body_issues'].str.contains(
            "I didn't notice anything that felt off about the main body of the email", na=False
        ).astype(int)
        ml_df['email_body_issues_typos'] = df['email_body_issues'].str.contains(
            'The main body of the email included typos or other issues', na=False
        ).astype(int)
        ml_df['email_body_issues_missing'] = df['email_body_issues'].str.contains(
            'The main body of the email was missing something', na=False
        ).astype(int)
        ml_df['email_body_issues_strange'] = df['email_body_issues'].str.contains(
            'The main body of the email included something strange', na=False
        ).astype(int)
        ml_df['email_body_issues_more_info'] = df['email_body_issues'].str.contains(
            'The main body of the email included more information than I expect', na=False
        ).astype(int)
        ml_df['email_body_issues_less_info'] = df['email_body_issues'].str.contains(
            'The main body of the email included less information than I expect', na=False
        ).astype(int)
    else:
        ml_df['email_body_issues_none'] = 1
        ml_df['email_body_issues_typos'] = 0
        ml_df['email_body_issues_missing'] = 0
        ml_df['email_body_issues_strange'] = 0
        ml_df['email_body_issues_more_info'] = 0
        ml_df['email_body_issues_less_info'] = 0
    
    # ===================================================================
    # 7. CONFIDENCE & PERCEPTION (3 features)
    # ===================================================================
    
    # Confidence scores: Original scale 1-11, normalize to 0-10 scale then standardize
    if 'suspicion_confidence' in df.columns:

        suspicion_conf = pd.to_numeric(df['suspicion_confidence'], errors='coerce')
        suspicion_conf_normalized = (suspicion_conf - 1).clip(0, 10).fillna(4.5)
        ml_df['suspicion_confidence'] = pd.Series(
            scaler.fit_transform(suspicion_conf_normalized.values.reshape(-1, 1)).flatten(),
            index=df.index
        )
    else:
        ml_df['suspicion_confidence'] = 0.0
    
    # Overall suspicion (binary: clear yes/no only)
    suspicion_map = {
        'No, I did not think it was harmful': 0,
        'Yes, I thought it was harmful': 1
    }
    ml_df['overall_suspicion'] = df['overall_suspicion'].map(suspicion_map).fillna(0)
    
    # Perceived harm (Likert scale 1-5, standardized)
    if 'perceived_harm' in df.columns:
        harm_encoded = df['perceived_harm'].map(likert_map).fillna(3) 
        ml_df['perceived_harm'] = pd.Series(
            scaler.fit_transform(harm_encoded.values.reshape(-1, 1)).flatten(),
            index=df.index
        )
    else:
        ml_df['perceived_harm'] = 0.0
    
    # ===================================================================
    # 8. TARGET VARIABLES - DECISION OUTCOMES (6 features)
    # ===================================================================
    
    # Final decision (primary target - binary: clear safe/unsafe only)
    decision_map = {
        'Yes, the email was safe': 1,
        'No, the email was definitely not safe': 0
    }
    ml_df['final_decision'] = df['final_decision'].map(decision_map).fillna(0)
    
    # Decision confidence (normalize 1-11 to 0-10 scale)
    if 'decision_confidence' in df.columns:

        decision_conf = pd.to_numeric(df['decision_confidence'], errors='coerce')
        decision_conf_normalized = (decision_conf - 1).clip(0, 10).fillna(4.5)
        ml_df['decision_confidence'] = pd.Series(
            scaler.fit_transform(decision_conf_normalized.values.reshape(-1, 1)).flatten(),
            index=df.index
        )
    else:
        ml_df['decision_confidence'] = 0.0
    
    # Parse actions taken (binary indicators for each action type)
    if 'actions_with_email' in df.columns:
        ml_df['actions_taken_clicked'] = df['actions_with_email'].str.contains(
            'clicked|Clicked', na=False
        ).astype(int)
        
        ml_df['actions_taken_reported'] = df['actions_with_email'].str.contains(
            'report.*spam|Clicked a button to report', na=False
        ).astype(int)
        
        ml_df['actions_taken_deleted'] = df['actions_with_email'].str.contains(
            'Deleted', na=False
        ).astype(int)
        
        ml_df['actions_taken_ignored'] = df['actions_with_email'].str.contains(
            'Left.*inbox', na=False
        ).astype(int)
    else:
        ml_df['actions_taken_clicked'] = 0
        ml_df['actions_taken_reported'] = 0
        ml_df['actions_taken_deleted'] = 0
        ml_df['actions_taken_ignored'] = 0
    
    # Ensure all columns are numeric 
    for col in ml_df.columns:
        if ml_df[col].dtype == 'object' or pd.api.types.is_categorical_dtype(ml_df[col]):
            ml_df[col] = pd.to_numeric(ml_df[col], errors='coerce').fillna(0)
    
    # Final fill for any remaining NaN values - now safe since all columns are numeric
    ml_df = ml_df.fillna(0)
    
    print(f"ML features created: {len(ml_df)} rows, {len(ml_df.columns)} features")
    print(f"Feature columns: {list(ml_df.columns)}")
    
    return ml_df

In [27]:
def remove_column_suffixes(df, suffixes=['_encoded', '_binary', '_standardized']):
    """Remove specified suffixes from column names"""
    new_columns = {}
    
    for col in df.columns:
        new_col = col
        for suffix in suffixes:
            if col.endswith(suffix):
                new_col = col.replace(suffix, '')
                break
        new_columns[col] = new_col
    
    return df.rename(columns=new_columns)

In [28]:
# === MAIN PROCESSING ===
print("WASH 2021 PHISHING BEHAVIOR DATASET PROCESSOR")

# Process data
wash_cleaned = Wash2021Processor.process()
wash_ml = create_ml_optimized(wash_cleaned)

# Save datasets
wash_cleaned.to_csv(CLEANED_DATA_DIR / "wash_2021_cleaned.csv", index=False)
wash_ml_clean = remove_column_suffixes(wash_ml)
wash_ml_clean.to_csv(FEATURES_DIR / "wash_2021_ml_optimized.csv", index=False)

print(f"\nCleaned dataset: {len(wash_cleaned)} rows, {len(wash_cleaned.columns)} columns")
print(f"ML dataset: {len(wash_ml)} rows, {len(wash_ml.columns)} features")
print(f"Saved to: {CLEANED_DATA_DIR} and {FEATURES_DIR}")


WASH 2021 PHISHING BEHAVIOR DATASET PROCESSOR
Processing WASH 2021 Dataset...
Raw: 1099 rows, 137 columns
Initial mapping: 1099 rows, 65 features
After phishing recall filter: 476 rows (removed 623)
After final decision filter: 325 rows (removed 774)
Creating ML-optimized WASH features...
Input data: 325 rows, 65 columns
ML features created: 325 rows, 79 features
Feature columns: ['age_category', 'gender', 'education_level', 'employment_status', 'annual_income', 'has_it_training', 'has_it_job', 'previous_incidents_phishing_email', 'previous_incidents_data_breach', 'previous_incidents_computer_virus', 'previous_incidents_device_hacked', 'previous_incidents_credit_card_fraud', 'previous_incidents_identity_theft', 'previous_incidents_any', 'digital_literacy_wiki', 'digital_literacy_meme', 'digital_literacy_phishing', 'digital_literacy_bookmark', 'digital_literacy_cache', 'digital_literacy_ssl', 'digital_literacy_ajax', 'digital_literacy_rss', 'digital_literacy_other', 'digital_literacy_to

In [29]:
# === MODEL TRAINING ===

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
import joblib

# Paths
FEATURES_DIR = Path("data/features")
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)

# Load dataset
df = pd.read_csv(FEATURES_DIR / "wash_2021_ml_optimized.csv")

# Define features and targets
X_features = [f for f in df.columns if f not in ['final_decision', 'actions_taken_clicked',
                                                 'actions_taken_reported', 'actions_taken_deleted',
                                                 'actions_taken_ignored', 'decision_confidence']]

classification_targets = ['final_decision', 'actions_taken_clicked', 
                          'actions_taken_reported', 'actions_taken_deleted', 'actions_taken_ignored']
regression_targets = ['decision_confidence']

# Prepare features
X = df[X_features].fillna(0)
print(f"Features: {X.shape[1]}, Samples: {X.shape[0]}")

# Model trainer
def train_model(X, y, task='classification', name=''):
    models = {
        'rf': RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42) if task == 'classification'
              else RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
        'lr_or_ridge': Pipeline([
            ('scaler', StandardScaler()),
            ('model', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)) if task == 'classification'
            else ('model', Ridge(random_state=42))
        ])
    }

    scoring = 'accuracy' if task == 'classification' else 'r2'
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) if task == 'classification' else 5

    best_model, best_score, best_name = None, -np.inf, None

    for k, model in models.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
        mean_score = scores.mean()
        print(f"{name} | {k.upper()} | {scoring.upper()}={mean_score:.4f}")
        if mean_score > best_score:
            best_model, best_score, best_name = model, mean_score, k

    # Final train-test split for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y if task=='classification' else None,
                                                        test_size=0.2, random_state=42)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    if task == 'classification':
        print(classification_report(y_test, y_pred, zero_division=0))
    else:
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f"Test R²: {r2_score(y_test, y_pred):.4f}, RMSE: {rmse:.4f}")

    return best_model, best_score, best_name

# Store results
trained_models = {}
model_info = {}

# Classification
for target in classification_targets:
    if target in df.columns and df[target].nunique() > 1:
        y = df[target].fillna(0).astype(int)
        model, score, name = train_model(X, y, 'classification', target)
        trained_models[target] = model
        model_info[target] = {'type': 'classification', 'model': name, 'score': score}

# Regression
for target in regression_targets:
    if target in df.columns and df[target].nunique() > 1:
        y = df[target].fillna(0)
        model, score, name = train_model(X, y, 'regression', target)
        trained_models[target] = model
        model_info[target] = {'type': 'regression', 'model': name, 'score': score}
        
# Show top features for all trained models
print("\n=== Top Feature Importances for Each Model ===")

for target, model in trained_models.items():
    print(f"\n--- {target} ---")
    
    if hasattr(model, "feature_importances_"):  # Random Forest
        importances = model.feature_importances_
        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': importances
        }).sort_values(by='importance', ascending=False)
        print(importance_df.head(15))
        
    elif hasattr(model, "coef_"):  # Logistic or Ridge Regression
        coefs = model.coef_.flatten()
        coef_df = pd.DataFrame({
            'feature': X.columns,
            'coefficient': coefs
        }).sort_values(by='coefficient', key=lambda x: abs(x), ascending=False)
        print(coef_df.head(15))
        
    else:
        print("No feature importance available for this model.")

# Feature importance for main model
# def show_feature_importance(model, feature_names, top_n=15):
#     if hasattr(model, 'feature_importances_'):
#         importances = model.feature_importances_
#     elif isinstance(model, Pipeline) and hasattr(model.named_steps['model'], 'coef_'):
#         importances = np.abs(model.named_steps['model'].coef_[0])
#     else:
#         return
#     fi = pd.DataFrame({'feature': feature_names, 'importance': importances})
#     print("\nTop Features:\n", fi.sort_values('importance', ascending=False).head(top_n))

# if 'final_decision' in trained_models:
#     print("\nFeature importance for final_decision model:")
#     show_feature_importance(trained_models['final_decision'], X_features)

# Save models & metadata
for name, model in trained_models.items():
    joblib.dump(model, MODELS_DIR / f"wash_{name}_model.joblib")

# Save feature list and model info
joblib.dump({
    'features': X_features, 
    'models': model_info,
    'feature_count': len(X_features)
}, MODELS_DIR / "wash_metadata.joblib")

# Create prediction class instead of function
class WashPredictor:
    def __init__(self, models, features):
        self.models = models
        self.features = features
        self.model_info = model_info
    
    def __call__(self, input_data):
        if isinstance(input_data, dict):
            input_data = pd.DataFrame([input_data])
        
        # Ensure all features present
        for f in self.features:
            if f not in input_data.columns:
                input_data[f] = 0
        
        input_data = input_data[self.features].fillna(0)
        
        predictions = {}
        for target, model in self.models.items():
            pred = model.predict(input_data)
            
            if self.model_info[target]['type'] == 'classification':
                prob = model.predict_proba(input_data)[0][1] if hasattr(model, 'predict_proba') else None
                predictions[target] = {
                    'prediction': int(pred[0]), 
                    'probability': float(prob) if prob is not None else None
                }
            else:
                predictions[target] = {'prediction': float(pred[0])}
        
        return predictions

# Save the predictor class with models embedded
wash_predictor = WashPredictor(trained_models, X_features)
joblib.dump(wash_predictor, MODELS_DIR / "wash_predictor.joblib")

print("\nAll models and predictor saved successfully.")

Features: 73, Samples: 325
final_decision | RF | ACCURACY=0.9815
final_decision | LR_OR_RIDGE | ACCURACY=0.9538
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        64
           1       0.00      0.00      0.00         1

    accuracy                           0.98        65
   macro avg       0.49      0.50      0.50        65
weighted avg       0.97      0.98      0.98        65

actions_taken_clicked | RF | ACCURACY=0.6862
actions_taken_clicked | LR_OR_RIDGE | ACCURACY=0.5846
              precision    recall  f1-score   support

           0       0.73      0.96      0.83        45
           1       0.67      0.20      0.31        20

    accuracy                           0.72        65
   macro avg       0.70      0.58      0.57        65
weighted avg       0.71      0.72      0.67        65

actions_taken_reported | RF | ACCURACY=0.6862
actions_taken_reported | LR_OR_RIDGE | ACCURACY=0.5846
              precision    recall 