In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
# Setup paths (relative paths, resolved against the current working directory)
RAW_DATA_DIR = Path("data/raw_data")
CLEANED_DATA_DIR = Path("data/cleaned_data")
# parents=True: also create the intermediate "data" folder when it is missing,
# so the cell does not raise FileNotFoundError on a fresh checkout.
CLEANED_DATA_DIR.mkdir(parents=True, exist_ok=True)

#### 2. WASH 2021 Dataset Processing
#### **Purpose**: Extract behavioral decision patterns from real phishing encounters
#### **Key Features**: Decision confidence, investigation methods, emotional responses, authentic narratives

In [5]:
class Wash2021Processor:
    """
    Processes Wash et al. (2021) dataset for behavioral profiling
    Focuses on real-world phishing encounter decision patterns

    ``process`` reads the raw survey CSV, copies the raw columns listed in
    ``FEATURE_MAP`` under their cleaned names, keeps only the rows whose
    recall answer contains "Yes", and normalises whitespace in the free-text
    columns listed in ``TEXT_COLUMNS``.
    """

    # Feature mapping with detailed documentation:
    # cleaned column name -> raw survey column name
    FEATURE_MAP = {
        # === DEMOGRAPHICS & BACKGROUND ===
        'participant_age': 'age',  # int: Age in years (18-80+)
        'participant_gender': 'gender',  # cat: Man/Woman/Other/Prefer not to answer
        'participant_gender_other': 'gender_3_TEXT',  # str: Custom gender description
        'participant_ethnicity': 'ethnicity',  # cat: Race/ethnicity multi-select
        'participant_ethnicity_other': 'ethnicity_9_TEXT',  # str: Custom ethnicity
        'education_level': 'education',  # ord: Less than HS to Graduate degree
        'employment_status': 'employment',  # cat: Full-time/Part-time/Student/Retired
        'annual_income': 'income',  # ord: <$25K to $200K+ brackets
        'has_it_training': 'expert_training',  # bool: Yes/No formal IT training
        'has_it_job': 'expert_job',  # bool: Yes/No high-tech job
        'can_recall_phishing': 'recall_email',  # bool: Yes/No remember suspicious emails

        # === DIGITAL LITERACY SCALE (1-5) ===
        'digital_literacy_wiki': 'digital_literacy_1',  # ord: 1-5 understanding "Wiki"
        'digital_literacy_meme': 'digital_literacy_2',  # ord: 1-5 understanding "Meme"
        'digital_literacy_phishing': 'digital_literacy_3',  # ord: 1-5 understanding "Phishing"
        'digital_literacy_bookmark': 'digital_literacy_4',  # ord: 1-5 understanding "Bookmark"
        'digital_literacy_cache': 'digital_literacy_5',  # ord: 1-5 understanding "Cache"
        'digital_literacy_ssl': 'digital_literacy_6',  # ord: 1-5 understanding "SSL"
        'digital_literacy_ajax': 'digital_literacy_7',  # ord: 1-5 understanding "AJAX"
        'digital_literacy_rss': 'digital_literacy_8',  # ord: 1-5 understanding "RSS"

        # === CORE DECISION VARIABLES ===
        'final_decision': 'decide',  # cat: Safe/Unsafe/Unsure - CRITICAL for persona behavior
        'decision_confidence': 'decide_sure_1',  # num: 0-100 confidence in decision - KEY for calibration
        'overall_suspicion': 'suspect5',  # ord: 1-7 belief email was harmful - KEY threat assessment
        'suspicion_confidence': 'suspect5_sure_1',  # num: 0-100 confidence in suspicion

        # === INVESTIGATION BEHAVIORS ===
        'investigation_methods': 'investigate1',  # multi-select: Investigation actions taken - CRITICAL
        'investigation_other': 'investigate1_9_TEXT',  # str: Other investigation methods
        'contacted_sender_how': 'investigate2',  # multi-select: Sender contact methods

        # === EMOTIONAL RESPONSES (1-5 scale) ===
        'emotion_dread': 'emotions_1',  # ord: 1-5 level of dread - KEY stress indicator
        'emotion_terror': 'emotions_2',  # ord: 1-5 level of terror
        'emotion_anxiety': 'emotions_3',  # ord: 1-5 level of anxiety - CRITICAL for persona
        'emotion_nervous': 'emotions_4',  # ord: 1-5 level of nervousness
        'emotion_scared': 'emotions_5',  # ord: 1-5 level of being scared
        'emotion_panic': 'emotions_6',  # ord: 1-5 level of panic
        'emotion_fear': 'emotions_7',  # ord: 1-5 level of fear - KEY emotional response
        'emotion_worry': 'emotions_8',  # ord: 1-5 level of worry

        # === CONTEXTUAL AWARENESS ===
        'email_features_noticed': 'notice1',  # multi-select: Features noticed in email
        'email_recency': 'notice2',  # ord: How long ago received (minutes to months)
        'email_account_type': 'notice3',  # cat: Personal/Work/School email account
        'email_account_other': 'notice3_4_TEXT',  # str: Other account type
        'email_content_type': 'notice4',  # cat: Work vs personal content context
        'email_content_other': 'notice4_3_TEXT',  # str: Other content type
        'email_sender_type': 'notice5',  # cat: Individual/Company/Government/Unknown
        'email_sender_other': 'notice5_5_TEXT',  # str: Other sender type

        # === EXPECTATION & CONTEXT ===
        'felt_similar_before': 'expect1',  # ord: 1-7 agreement scale similar emails
        'previous_sender_emails': 'expect2',  # cat: Yes/No/Unsure previous sender emails
        'previous_sender_interaction': 'expect3',  # cat: Yes/No/Unsure other interactions
        'sender_relationship_duration': 'expect4',  # ord: Never heard to Years known
        'expected_this_email': 'expect5',  # cat: Yes/No/Unsure expected this email
        'email_seemed_different': 'expect6',  # ord: 1-7 agreement different than typical

        # === SUSPICION TRIGGERS ===
        'actions_requested': 'suspect1',  # multi-select: Actions email requested
        'sender_issues': 'suspect2',  # multi-select: Sender problems identified
        'subject_line_issues': 'suspect3',  # cat: Normal/Off/Very off subject line
        'email_body_issues': 'suspect4',  # multi-select: Body problems identified

        # === ACTIONS TAKEN ===
        'actions_with_email': 'act',  # multi-select: Actions taken with email - CRITICAL

        # === HARM & NARRATIVE ===
        'perceived_harm': 'harm',  # ord: 1-7 agreement something harmful happened
        'incident_story': 'full_story',  # text: Complete story - KEY for dialogue generation
        'story_recall_ease': 'full_story_easy',  # ord: 1-5 how easy to remember story

        # === SECURITY HISTORY ===
        'previous_incidents': 'victim'  # multi-select: Past security experiences - KEY modifier
    }

    # Text columns that might contain newlines
    TEXT_COLUMNS = (
        # Free-text columns that process() actually produces (the str/text
        # entries of FEATURE_MAP).
        'participant_gender_other', 'participant_ethnicity_other',
        'investigation_other', 'email_account_other', 'email_content_other',
        'email_sender_other', 'incident_story',
        # Names carried over from the earlier version of this list. process()
        # does not produce them, but a frame that contains them is still
        # cleaned exactly as before.
        'unsafe_email_ways_list', 'suspicious_email_recognition', 'suspicious_emails_received',
        'email_brief_summary', 'what_made_email_suspicious', 'what_made_decision_hard',
        'what_email_asked_todo', 'gender_other', 'ethnicity_other',
        'contacted_sender_how_other', 'additional_investigation_other',
    )

    @staticmethod
    def process(csv_path=None):
        """
        Load, rename, filter and clean the WASH 2021 survey export.

        Args:
            csv_path: Optional location of the raw CSV (anything
                ``pd.read_csv`` accepts). When omitted, the file
                ``RAW_DATA_DIR / "phishing_wash_2021.csv"`` is read.

        Returns:
            pd.DataFrame: one column per ``FEATURE_MAP`` key, in that order.
            Raw columns missing from the CSV become all-NaN columns. Only rows
            whose ``can_recall_phishing`` value contains "Yes"
            (case-insensitive) are kept; if the raw recall column is missing
            the filter is skipped and every row is returned.
        """
        print("Processing WASH 2021 Dataset...")
        if csv_path is None:
            csv_path = RAW_DATA_DIR / "phishing_wash_2021.csv"
        df = pd.read_csv(csv_path)
        print(f"   Raw data: {len(df)} rows, {len(df.columns)} columns")

        # Collect the renamed columns first and build the frame once, anchored
        # on the raw index, so a missing first column cannot produce an
        # index-less frame.
        columns = {}
        for new_col, old_col in Wash2021Processor.FEATURE_MAP.items():
            if old_col in df.columns:
                columns[new_col] = df[old_col]
            else:
                print(f"Column '{old_col}' not found, setting '{new_col}' to NaN")
                columns[new_col] = np.nan
        cleaned = pd.DataFrame(columns, index=df.index)

        # Quality filter - only participants who can recall phishing emails.
        # The guard checks the RAW column: the cleaned column always exists
        # (NaN-filled when missing), and .str on an all-NaN float column raises.
        recall_source = Wash2021Processor.FEATURE_MAP['can_recall_phishing']
        if recall_source in df.columns:
            initial_count = len(cleaned)
            recalls = (
                cleaned['can_recall_phishing']
                .astype(str)
                .str.contains('Yes', case=False, na=False)
            )
            # .copy() so the in-place text cleaning below writes to an
            # independent frame rather than to a filtered slice.
            cleaned = cleaned[recalls].copy()
            print(f"Quality filter: {initial_count} → {len(cleaned)} participants (kept those who recall phishing)")
        else:
            print(f"Quality filter skipped: raw column '{recall_source}' not found")

        cleaned = Wash2021Processor._clean_text_fields(cleaned)

        return cleaned

    @staticmethod
    def _clean_text_fields(df):
        """
        Remove newlines and clean text fields for proper CSV structure

        For every column named in ``TEXT_COLUMNS`` that is present in ``df``:
        newlines and carriage returns become spaces, runs of spaces collapse to
        one, and leading/trailing whitespace is stripped. Missing values stay
        missing (they are not turned into the strings 'nan' or 'None'), and a
        literal "nan" answer is kept as text.

        Args:
            df: DataFrame to clean. It is modified in place.

        Returns:
            The same DataFrame object that was passed in.
        """
        for col in Wash2021Processor.TEXT_COLUMNS:
            if col not in df.columns:
                continue
            raw = df[col]
            text = (
                raw.astype(str)  # Convert to string so the .str accessor always works
                .str.replace('\n', ' ', regex=False)  # Replace newlines with spaces
                .str.replace('\r', ' ', regex=False)  # Replace carriage returns
                .str.replace('  +', ' ', regex=True)  # Multiple spaces to single
                .str.strip()  # Remove leading/trailing whitespace
            )
            # Restore missing values from the original column instead of
            # pattern-matching the string 'nan'.
            df[col] = text.where(raw.notna(), np.nan)

        return df



In [6]:
# Run the WASH 2021 pipeline, then write the cleaned frame (without the
# index column) into the cleaned-data folder and report where it went.
wash_data = Wash2021Processor.process()
wash_data.to_csv(
    path_or_buf=CLEANED_DATA_DIR.joinpath("wash_2021_cleaned.csv"),
    index=False,
)
print(f"Saved to: {CLEANED_DATA_DIR / 'wash_2021_cleaned.csv'}")

Processing WASH 2021 Dataset...
   Raw data: 1099 rows, 137 columns
Quality filter: 1099 → 476 participants (kept those who recall phishing)
Saved to: data/cleaned_data/wash_2021_cleaned.csv
