In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
# Setup paths (relative to the notebook's working directory)
RAW_DATA_DIR = Path("data/raw_data")          # raw CSV inputs are read from here
CLEANED_DATA_DIR = Path("data/cleaned_data")  # cleaned CSV outputs are written here

# parents=True also creates the intermediate "data" directory, so this cell works
# on a fresh checkout; exist_ok=True keeps re-running the cell harmless.
CLEANED_DATA_DIR.mkdir(parents=True, exist_ok=True)

#### 1. Oliver 2022 Dataset Processing
#### **Purpose**: Extract psychological foundations and signal detection metrics
#### **Key Features**: Response bias, signal detection parameters, Protection Motivation Theory constructs

In [8]:
class Oliver2022Processor:
    """
    Processes the Oliver et al. (2022) dataset for comprehensive psychological validation.

    The cleaned frame holds every column listed in ``FEATURE_MAP`` (renamed to a
    descriptive name) followed by twelve derived Signal Detection Theory columns.
    Timing columns (source names starting with ``TIME``) are not part of the
    mapping and therefore never reach the cleaned frame.
    """

    # Encodings tried in order when reading the raw CSV. 'iso-8859-1' maps every
    # byte to a character, so it is the final catch-all; the former 'latin1'
    # entry was an alias of it and could never be reached.
    ENCODINGS = ('utf-8', 'cp1252', 'iso-8859-1')

    # Response codes of the phishing test items (see the item comments below).
    RESPONSE_LEGITIMATE = 1
    RESPONSE_PHISHING = 2

    # Cleaned-frame columns holding the responses to phishing emails (correct = 2)
    PHISHING_ITEM_COLS = (
        'phishing_email_5', 'phishing_email_6', 'phishing_email_9', 'phishing_email_10',
    )
    # Cleaned-frame columns holding the responses to legitimate emails (correct = 1)
    LEGITIMATE_ITEM_COLS = (
        'legitimate_email_4', 'legitimate_email_5', 'legitimate_email_7', 'legitimate_email_10',
    )

    # Complete feature mapping (cleaned name -> raw column name) - ALL columns except timing
    FEATURE_MAP = {
        # === PARTICIPANT IDENTIFIERS ===
        'participant_case_id': 'CASE',  # int: Unique participant identifier
        'reference_code': 'REF',  # str: Study reference code
        'completion_code': 'CO01',  # int: Survey completion code

        # === PRIMARY PERFORMANCE OUTCOMES ===
        'phishing_test_total_correct': 'correct_total_pt',  # int: Raw phishing test score
        'phishing_test_percent_correct': 'correct_percent_pt',  # num: 0-1 phishing test accuracy - CRITICAL
        'knowledge_test_total_correct': 'correct_total_kt',  # int: Raw knowledge test score
        'knowledge_test_percent_correct': 'correct_percent_kt',  # num: 0-1 knowledge accuracy

        # === PROTECTION MOTIVATION THEORY COMPOSITES ===
        'perceived_knowledge': 'Per_Know',  # num: Self-assessed phishing knowledge - KEY
        'email_trust': 'E_Trust',  # num: Trust in email communications - CRITICAL for personas
        'perceived_severity': 'Per_Sev',  # num: Perceived severity of phishing threats
        'perceived_self_efficacy': 'Per_SE',  # num: Confidence in detection ability - KEY calibration
        'perceived_vulnerability': 'Per_Vuln',  # num: Personal vulnerability perception

        # === DEMOGRAPHICS ===
        'participant_gender': 'Sex',  # int: 1=male, 2=female
        'participant_age': 'DE02_01',  # num: Age in years
        'education_level': 'Edu1',  # int: Educational attainment level
        'education_other': 'Edu_other',  # str: Custom education description
        'job_category': 'Job',  # int: Employment/occupation category
        'job_other': 'Job_othr',  # str: Custom job description
        'employment_type': 'Anstllung',  # int: Type of employment arrangement
        'employment_other': 'Anst_offen',  # str: Custom employment type

        # === IT BACKGROUND ===
        'works_in_it': 'ITSJOB',  # int: Whether participant works in IT field
        'phishing_victim_count': 'Phish_Vic_Count',  # int: Times fallen for phishing - KEY modifier

        # === PERCEIVED KNOWLEDGE ITEMS ===
        'pk_item_1': 'PK1',  # int: "I know what phishing is"
        'pk_item_2': 'PK2',  # int: "I understand phishing techniques"

        # === EMAIL TRUST ITEMS ===
        'et_item_1': 'ET1',  # int: Trust in email sender authenticity
        'et_item_2': 'ET2',  # int: Trust in email content reliability
        'et_item_3': 'ET3',  # int: General trust in email communications

        # === PERCEIVED SEVERITY ITEMS ===
        'ps_item_1': 'PS1',  # int: Severity of phishing consequences
        'ps_item_2': 'PS2',  # int: Impact of falling for phishing
        'ps_item_3': 'PS3',  # int: Seriousness of phishing threats

        # === PERCEIVED VULNERABILITY ITEMS ===
        'pv_item_1': 'PV1',  # int: Personal risk of phishing attacks
        'pv_item_2': 'PV2',  # int: Likelihood of being targeted
        'pv_item_3': 'PV3',  # int: Susceptibility to phishing

        # === SELF-EFFICACY ITEMS ===
        'se_item_1': 'SE1',  # int: Confidence in detecting phishing
        'se_item_2': 'SE2',  # int: Ability to avoid phishing scams
        'se_item_3': 'SE3',  # int: Skills in phishing recognition

        # === KNOWLEDGE TEST ITEMS (with answer keys) ===
        'knowledge_q1': 'KT01',  # int: Knowledge question 1 (correct=2)
        'knowledge_q2': 'KT02',  # int: Knowledge question 2 (correct=2)
        'knowledge_q5': 'KT05',  # int: Knowledge question 5 (correct=2)
        'knowledge_q7': 'KT07',  # int: Knowledge question 7 (correct=2)
        'knowledge_q9': 'KT09',  # int: Knowledge question 9 (correct=2)
        'knowledge_q10': 'KT10',  # int: Knowledge question 10 (correct=2)
        'knowledge_q14': 'KT14',  # int: Knowledge question 14 (correct=2)
        'knowledge_q15': 'KT15',  # int: Knowledge question 15 (correct=2)
        'knowledge_q16': 'KT16',  # int: Knowledge question 16 (correct=1)
        'knowledge_q17': 'KT17',  # int: Knowledge question 17 (correct=1)

        # === PHISHING TEST ITEMS - CRITICAL FOR SIGNAL DETECTION ===
        # Phishing emails (correct response = 2 "phishing")
        'phishing_email_5': 'PTP5',  # int: Email 5 classification (should be 2)
        'phishing_email_6': 'PTP6',  # int: Email 6 classification (should be 2)
        'phishing_email_9': 'PTP9',  # int: Email 9 classification (should be 2)
        'phishing_email_10': 'PTP10',  # int: Email 10 classification (should be 2)

        # Legitimate emails (correct response = 1 "legitimate")
        'legitimate_email_4': 'PTE4',  # int: Email 4 classification (should be 1)
        'legitimate_email_5': 'PTE5',  # int: Email 5 classification (should be 1)
        'legitimate_email_7': 'PTE7',  # int: Email 7 classification (should be 1)
        'legitimate_email_10': 'PTE10',  # int: Email 10 classification (should be 1)

        # === SURVEY ADMINISTRATION ===
        'last_page_viewed': 'LASTPAGE',  # int: Final page participant reached
        'max_pages': 'MAXPAGE',  # int: Total pages in survey
        'survey_quality_flag': 'DEG_TIME'  # int: Survey completion quality metric
    }

    @staticmethod
    def process(csv_path=None):
        """
        Load the raw CSV, rename/select the mapped columns and add SDT metrics.

        Args:
            csv_path: Optional path (or file-like object) of the raw CSV. When
                omitted, ``RAW_DATA_DIR / "phishing_oliver_2022.csv"`` is read,
                which is what the original zero-argument call did.

        Returns:
            pandas.DataFrame with one column per ``FEATURE_MAP`` key (all-NaN for
            raw columns that are absent from the CSV) followed by the columns
            added by ``_calculate_sdt_metrics``.

        Raises:
            ValueError: If the CSV cannot be decoded with any of ``ENCODINGS``.
            Other errors raised by ``pandas.read_csv`` (e.g. a missing file)
            propagate unchanged.
        """
        print("Processing Oliver 2022 Dataset - Complete Version...")

        if csv_path is None:
            # Resolved lazily so the class can be defined before the path cell runs.
            csv_path = RAW_DATA_DIR / "phishing_oliver_2022.csv"

        df = Oliver2022Processor._read_csv(csv_path)
        print(f"   Raw data: {len(df)} rows, {len(df.columns)} columns")

        # Start from the raw index so that scalar NaN fills for missing columns
        # always get the full row count, whatever order the columns arrive in.
        cleaned = pd.DataFrame(index=df.index)
        missing_cols = []

        for new_col, old_col in Oliver2022Processor.FEATURE_MAP.items():
            if old_col in df.columns:
                cleaned[new_col] = df[old_col]
            else:
                print(f"Warning: Column '{old_col}' not found, setting '{new_col}' to NaN")
                cleaned[new_col] = np.nan
                missing_cols.append(old_col)

        # Timing columns (TIME001-TIME024, TIME_SUM) are excluded simply because
        # they are not in FEATURE_MAP; this only reports how many were left out.
        timing_cols = [col for col in df.columns if col.startswith('TIME')]
        if timing_cols:
            print(f"Removed {len(timing_cols)} timing columns for persona generation")

        # Convert gender codes to labels (1=male, 2=female); any other value
        # becomes NaN. The column always exists because the loop above creates it.
        cleaned['participant_gender'] = cleaned['participant_gender'].map({
            1: 'Male',
            2: 'Female'
        })

        # Calculate Signal Detection Theory metrics from individual responses
        cleaned = Oliver2022Processor._calculate_sdt_metrics(cleaned)

        print(f"Processed: {len(cleaned)} rows, {len(cleaned.columns)} features")
        if missing_cols:
            print(f"Missing columns: {missing_cols}")

        return cleaned

    @staticmethod
    def _read_csv(csv_path):
        """Read ``csv_path`` with the first encoding in ``ENCODINGS`` that decodes it."""
        for encoding in Oliver2022Processor.ENCODINGS:
            try:
                df = pd.read_csv(csv_path, encoding=encoding)
            except UnicodeDecodeError:
                # Special characters not valid in this encoding - try the next one.
                continue
            print(f"   Successfully loaded with {encoding} encoding")
            return df

        # ValueError is a subclass of Exception, so existing handlers still catch it.
        raise ValueError("Could not read CSV with any standard encoding")

    @staticmethod
    def _label_by_threshold(values, upper, lower, above, below, between):
        """
        Label each value: ``above`` if > upper, ``below`` if < lower, else ``between``.

        Missing values stay missing instead of silently receiving ``between``.
        """
        labels = np.select(
            [(values > upper).to_numpy(), (values < lower).to_numpy()],
            [above, below],
            default=between,
        )
        return pd.Series(labels, index=values.index, dtype=object).where(values.notna())

    @staticmethod
    def _calculate_sdt_metrics(df):
        """
        Calculate Signal Detection Theory metrics from individual email responses.

        Adds these columns to ``df`` in place and returns the same frame:
        ``hits``, ``misses``, ``false_alarms``, ``correct_rejections``, the four
        matching ``*_rate`` columns, ``response_bias``, ``sensitivity``,
        ``bias_tendency`` and ``detection_ability``.

        Rates are computed over the items a participant actually answered
        (responses coded 1 or 2). Unanswered items - NaN or any other code - are
        excluded from the denominator instead of being treated as neither a hit
        nor a miss out of a fixed 4. When no item of a kind was answered, the
        affected rates, the measures derived from them and their labels are NaN.
        For participants who answered every item the results equal count / 4.
        """
        phishing_code = Oliver2022Processor.RESPONSE_PHISHING
        legitimate_code = Oliver2022Processor.RESPONSE_LEGITIMATE

        phishing = df[list(Oliver2022Processor.PHISHING_ITEM_COLS)]
        legitimate = df[list(Oliver2022Processor.LEGITIMATE_ITEM_COLS)]

        # Vectorized counts (NaN never equals a code, so it counts as unanswered).
        # Hits and misses for phishing emails
        df['hits'] = (phishing == phishing_code).sum(axis=1)
        df['misses'] = (phishing == legitimate_code).sum(axis=1)

        # False alarms and correct rejections for legitimate emails
        df['false_alarms'] = (legitimate == phishing_code).sum(axis=1)
        df['correct_rejections'] = (legitimate == legitimate_code).sum(axis=1)

        # Number of answered items per kind; 0 -> NaN so the division yields NaN
        # rather than a misleading 0 (or a division-by-zero inf).
        phishing_answered = (df['hits'] + df['misses']).replace(0, np.nan)
        legitimate_answered = (df['false_alarms'] + df['correct_rejections']).replace(0, np.nan)

        # Convert to rates
        df['hit_rate'] = df['hits'] / phishing_answered
        df['miss_rate'] = df['misses'] / phishing_answered
        df['false_alarm_rate'] = df['false_alarms'] / legitimate_answered
        df['correct_rejection_rate'] = df['correct_rejections'] / legitimate_answered

        # Calculate bias and sensitivity measures
        df['response_bias'] = df['hit_rate'] - df['correct_rejection_rate']  # Liberal vs Conservative
        df['sensitivity'] = df['hit_rate'] - df['false_alarm_rate']  # Discrimination ability

        # Add interpretive flags
        df['bias_tendency'] = Oliver2022Processor._label_by_threshold(
            df['response_bias'], 0.1, -0.1, 'Liberal', 'Conservative', 'Neutral')
        df['detection_ability'] = Oliver2022Processor._label_by_threshold(
            df['sensitivity'], 0.5, 0.2, 'High', 'Low', 'Medium')

        return df


In [9]:
# Run the Oliver 2022 pipeline and persist the cleaned frame as CSV.
# The destination is computed once so the write and the log line cannot drift apart.
oliver2022_output_path = CLEANED_DATA_DIR / "oliver_2022_cleaned.csv"

oliver2022_data = Oliver2022Processor.process()
oliver2022_data.to_csv(oliver2022_output_path, index=False)  # index=False: no row-index column in the file
print(f"Saved complete dataset to: {oliver2022_output_path}")

Processing Oliver 2022 Dataset - Complete Version...
   Successfully loaded with cp1252 encoding
   Raw data: 296 rows, 84 columns
Removed 25 timing columns for persona generation
Processed: 296 rows, 69 features
Saved complete dataset to: data/cleaned_data/oliver_2022_cleaned.csv
