In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
# Setup paths (relative to the notebook's working directory)
RAW_DATA_DIR = Path("data/raw_data")          # input: raw study exports are read from here
CLEANED_DATA_DIR = Path("data/cleaned_data")  # output: cleaned CSVs are written here
# parents=True so a fresh checkout without a `data/` folder does not fail with
# FileNotFoundError; exist_ok=True keeps the cell idempotent on re-run.
CLEANED_DATA_DIR.mkdir(parents=True, exist_ok=True)

#### 4. Lorin 2024 Dataset Processing
#### **Purpose**: Extract personality-driven phishing behavior patterns
#### **Key Features**: Big Five personality traits, susceptibility patterns, privacy attitudes

In [19]:
class Lorin2024Processor:
    """
    Processes Lorin et al. (2024) dataset for comprehensive personality-behavior profiling
    Includes ALL variables needed for persona generation (no timing data)

    ``process()`` is the entry point: it loads the raw file from ``RAW_DATA_DIR``,
    copies the columns listed in ``FEATURES`` under their cleaned names, and
    appends the derived persona metrics.
    """

    # Base name of the raw file; the extension is appended while probing formats.
    RAW_FILE_STEM = "phishing_lorin_2024"
    # Formats are probed in this order (CSV first, Excel as fallback).
    RAW_EXTENSIONS = ('.csv', '.xlsx')
    # Encodings tried, in order, when reading the CSV.
    CSV_ENCODINGS = ('utf-8', 'cp1252', 'iso-8859-1', 'latin1')

    # Cut-offs used by the derived categorical metrics.
    # NOTE(review): the trait cut-offs and the pivot of 6 presumably assume
    # trait scores on a 1-5 scale - confirm against the dataset codebook.
    TRAIT_HIGH = 3.5
    TRAIT_LOW = 2.5
    RESPONDER_HIGH = 0.2
    RESPONDER_LOW = 0.05
    TRAIT_REVERSE_PIVOT = 6

    # Mapping: cleaned column name -> column name in the raw file.
    FEATURES = {
        # === DEMOGRAPHICS ===
        'age_category': 'age',
        'education_level': 'education',
        'education_other': 'education_TEXT',
        'it_experience_level': 'it_experience',
        'it_experience_other': 'it_experience_TEXT',
        'employment_status': 'employment',
        'employment_other': 'employment_TEXT',

        # === BIG FIVE PERSONALITY TRAITS ===
        'personality_extraversion': 'bfi_extraversion',
        'personality_agreeableness': 'bfi_agreeableness',
        'personality_conscientiousness': 'bfi_conscientiousness',
        'personality_neuroticism': 'bfi_neuroticism',
        'personality_openness': 'bfi_openness',

        # === PROFICIENCY SCORES ===
        'practical_score_pre': 'practical_score_pre',
        'total_proficiency_pre': 'total_score_pre',
        'practical_score_post': 'practical_score_post',
        'total_proficiency_post': 'total_score_post',
        'proficiency_category': 'category',

        # === HAIS-Q PRE-TRAINING ===
        'haisq_email_attitude_pre': 'pre.haisq_email_attitude',
        'haisq_email_behaviour_pre': 'pre.haisq_email_behaviour',
        'haisq_internet_attitude_pre': 'pre.haisq_internet_attitude',
        'haisq_internet_behaviour_pre': 'pre.haisq_internet_behaviour_short',
        'haisq_weird_total_pre': 'pre.haisq_weird_total',

        # === HAIS-Q POST-TRAINING ===
        'haisq_email_attitude_post': 'post.haisq_email_attitude',
        'haisq_email_behaviour_post': 'post.haisq_email_behaviour',
        'haisq_internet_attitude_post': 'post.haisq_internet_attitude',
        'haisq_internet_behaviour_post': 'post.haisq_internet_behaviour_short',
        'haisq_weird_total_post': 'post.haisq_weird_total',

        # === SA-13 SECURITY ATTITUDES POST ===
        'sa13_engagement_post': 'post.sa13_engagement',
        'sa13_attentiveness_post': 'post.sa13_attentiveness',
        'sa13_resistance_post': 'post.sa13_resistance',
        'sa13_concernedness_post': 'post.sa13_concernedness',
        'sa13_total_post': 'post.sa13_total',

        # === IUIPC-8 PRIVACY CONCERNS ===
        'privacy_control_post': 'post.iuipc8_ctrl',
        'privacy_awareness_post': 'post.iuipc8_awa',
        'privacy_collection_post': 'post.iuipc8_coll',
        'privacy_total_post': 'post.iuipc8_total',

        # === SUSCEPTIBILITY OUTCOMES ===
        'phishing_susceptibility_pre': 'susceptibility_pre',
        'phishing_susceptibility_post': 'susceptibility_post'
    }

    @staticmethod
    def process():
        """
        Load the raw Lorin 2024 file and return the cleaned feature table.

        Returns:
            pandas.DataFrame with one column per ``FEATURES`` key (all-NaN when
            the source column is absent) plus the derived persona metrics.

        Raises:
            FileNotFoundError: when neither the CSV nor the Excel file could be read.
        """
        print("Processing Lorin 2024 Dataset - Complete Version...")

        df = Lorin2024Processor._load_raw()
        print(f"   Raw data: {len(df)} rows, {len(df.columns)} columns")

        cleaned, missing_cols = Lorin2024Processor._select_features(df)

        # Calculate derived metrics for persona generation
        cleaned = Lorin2024Processor._calculate_persona_metrics(cleaned)

        print(f"Processed: {len(cleaned)} rows, {len(cleaned.columns)} features")
        if missing_cols:
            print(f"Missing columns: {missing_cols}")

        return cleaned

    @staticmethod
    def _load_raw():
        """Read the raw table, trying CSV (several encodings) and then Excel."""
        file_path = RAW_DATA_DIR / Lorin2024Processor.RAW_FILE_STEM

        for ext in Lorin2024Processor.RAW_EXTENSIONS:
            candidate = str(file_path) + ext

            if ext == '.xlsx':
                try:
                    df = pd.read_excel(candidate)
                except FileNotFoundError:
                    continue
                print(f"   Successfully loaded {ext} file")
                return df

            for encoding in Lorin2024Processor.CSV_ENCODINGS:
                try:
                    df = pd.read_csv(candidate, encoding=encoding, delimiter=';')
                except FileNotFoundError:
                    # The file is absent: another encoding cannot help, move on
                    # to the next format instead of retrying.
                    break
                except UnicodeDecodeError:
                    continue
                print(f"   Successfully loaded {ext} with {encoding} encoding")
                return df

        # FileNotFoundError is a subclass of Exception, so callers that caught
        # the previous generic Exception keep working.
        raise FileNotFoundError("Could not read Lorin 2024 file with any format/encoding")

    @staticmethod
    def _select_features(df):
        """
        Copy the mapped columns of ``df`` under their cleaned names.

        Returns:
            (cleaned, missing_cols): the new frame and the list of raw column
            names that were not present in ``df`` (those become all-NaN columns).
        """
        # Build on the raw index so the row count is right even if the first
        # (or every) mapped column is missing.
        cleaned = pd.DataFrame(index=df.index)
        missing_cols = []

        for new_col, old_col in Lorin2024Processor.FEATURES.items():
            if old_col in df.columns:
                cleaned[new_col] = df[old_col]
            else:
                print(f"Warning: Column '{old_col}' not found, setting '{new_col}' to NaN")
                cleaned[new_col] = np.nan
                missing_cols.append(old_col)

        return cleaned, missing_cols

    @staticmethod
    def _calculate_persona_metrics(df):
        """
        Calculate additional metrics for persona behavioral patterns.

        Adds columns to ``df`` in place and returns the same object:
        ``personality_risk_profile``, ``susceptibility_improvement`` and
        ``training_responsiveness`` (only when both susceptibility columns
        exist), ``attitude_improvement`` (only when both security-attitude
        columns exist), ``curiosity_driven_risk``,
        ``social_engineering_vulnerability`` and ``anxiety_driven_caution``.

        Rows whose inputs are missing get NaN for the categorical metrics
        instead of being labelled as the "Medium" class.
        """
        cls = Lorin2024Processor

        neuroticism = df['personality_neuroticism']
        agreeableness = df['personality_agreeableness']
        conscientiousness = df['personality_conscientiousness']

        # Personality-based risk patterns (vectorised; also works on 0-row frames)
        is_high_risk = (neuroticism > cls.TRAIT_HIGH) & (agreeableness > cls.TRAIT_HIGH)
        is_low_risk = (conscientiousness > cls.TRAIT_HIGH) & (neuroticism < cls.TRAIT_LOW)
        traits_known = neuroticism.notna() & agreeableness.notna() & conscientiousness.notna()
        risk_labels = np.select(
            [is_high_risk.to_numpy(dtype=bool), is_low_risk.to_numpy(dtype=bool)],
            ['High_Risk', 'Low_Risk'],
            default='Medium_Risk',
        )
        # A NaN comparison is always False, which would otherwise fall through
        # to 'Medium_Risk'; mask such rows so missing data stays missing.
        df['personality_risk_profile'] = (
            pd.Series(risk_labels, index=df.index, dtype=object).where(traits_known)
        )

        # Training responsiveness based on personality
        if 'phishing_susceptibility_pre' in df.columns and 'phishing_susceptibility_post' in df.columns:
            improvement = df['phishing_susceptibility_pre'] - df['phishing_susceptibility_post']
            df['susceptibility_improvement'] = improvement
            responder_labels = np.select(
                [
                    (improvement > cls.RESPONDER_HIGH).to_numpy(dtype=bool),
                    (improvement < cls.RESPONDER_LOW).to_numpy(dtype=bool),
                ],
                ['High_Responder', 'Low_Responder'],
                default='Medium_Responder',
            )
            df['training_responsiveness'] = (
                pd.Series(responder_labels, index=df.index, dtype=object).where(improvement.notna())
            )

        # Security attitude change
        # NOTE(review): FEATURES defines no 'security_attitude_pre/post' columns, so
        # this branch never runs via process(); kept for frames that do carry them.
        if 'security_attitude_pre' in df.columns and 'security_attitude_post' in df.columns:
            df['attitude_improvement'] = df['security_attitude_post'] - df['security_attitude_pre']

        # Personality-driven behavioral tendencies
        # (pivot - conscientiousness) turns a high score into a low weight.
        df['curiosity_driven_risk'] = df['personality_openness'] * (cls.TRAIT_REVERSE_PIVOT - conscientiousness)
        df['social_engineering_vulnerability'] = agreeableness * df['personality_extraversion']
        df['anxiety_driven_caution'] = neuroticism * conscientiousness

        return df

In [21]:
# Run the Lorin 2024 pipeline, then persist the cleaned table next to the other cleaned datasets
lorin2024_data = Lorin2024Processor.process()
lorin2024_output_path = CLEANED_DATA_DIR / "lorin_2024_cleaned.csv"
lorin2024_data.to_csv(lorin2024_output_path, index=False)
print(f"Saved complete dataset to: {lorin2024_output_path}")

Processing Lorin 2024 Dataset - Complete Version...
   Successfully loaded .csv with utf-8 encoding
   Raw data: 96 rows, 38 columns
Processed: 96 rows, 44 features
Saved complete dataset to: data/cleaned_data/lorin_2024_cleaned.csv
