In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')


In [8]:
# Setup paths
RAW_DATA_DIR = Path("data/raw_data")
CLEANED_DATA_DIR = Path("data/cleaned_data")
FEATURES_DIR = Path("data/features")
MODELS_DIR = Path("models")

for path in [CLEANED_DATA_DIR, FEATURES_DIR, MODELS_DIR]:
    path.mkdir(exist_ok=True, parents=True)

## LORIN et al. (2025) Dataset Processing
### Step 1: Cleaning with readable feature names

In [3]:
class Lorin2025Processor:
    """Processes Lorin et al. (2025) dataset for personality-based capability prediction"""
    
    @staticmethod
    def process():
        print("Processing Lorin 2025 Dataset...")
        
        df = pd.read_csv(RAW_DATA_DIR / "phishing_lorin_2025.csv")
        print(f"Raw data: {len(df)} rows, {len(df.columns)} columns")
        
        # Feature mapping based on the provided specification
        features = {
            # === DEMOGRAPHICS & BACKGROUND ===
            'age_category': 'dem_age',
            'education_level': 'dem_edu', 
            'it_experience': 'dem_it',
            'email_frequency': 'mailboxfrequency',
            'security_training_prior': 'securitytraining',
            
            # === BIG FIVE PERSONALITY TRAITS (COMPOSITES) ===
            'personality_extraversion': 'bfi_extraversion',
            'personality_agreeableness': 'bfi_agreeableness', 
            'personality_conscientiousness': 'bfi_conscientiousness',
            'personality_neuroticism': 'bfi_neuroticism',
            'personality_openness': 'bfi_openness',
            
            # === PRE-TRAINING SECURITY ATTITUDES ===
            'security_engagement': 'pre.sa13_engagement',
            'security_attentiveness': 'pre.sa13_attentiveness', 
            'security_resistance': 'pre.sa13_resistance',
            'security_concern': 'pre.sa13_concernedness',
            'security_attitude_total': 'pre.sa13_total',
            
            # === BASELINE CAPABILITIES ===
            'knowledge_total': 'knowledge_total_pre',
            'proficiency': 'proficiency_pre',
            
            # === TARGET VARIABLES ===
            'class_phish_accuracy': 'class_phish_accuracy_pre',
            'class_nophish_accuracy': 'class_nophish_accuracy_pre',
        }
        
        # Create cleaned dataframe
        cleaned = pd.DataFrame()
        missing_cols = []
        
        for new_col, old_col in features.items():
            if old_col in df.columns:
                cleaned[new_col] = df[old_col]
            else:
                print(f"Warning: Column '{old_col}' not found, setting '{new_col}' to NaN")
                cleaned[new_col] = np.nan
                missing_cols.append(old_col)
        
        # Data quality filters
        initial_count = len(cleaned)
        
        # Filter 1: Remove rows with missing target variables
        target_cols = ['class_phish_accuracy', 'class_nophish_accuracy']
        cleaned = cleaned.dropna(subset=target_cols, how='all')
        print(f"After target filter: {len(cleaned)} rows (removed {initial_count - len(cleaned)})")
        
        # Filter 2: Remove rows with missing core demographics
        demo_cols = ['age_category', 'education_level']
        available_demo = [col for col in demo_cols if col in cleaned.columns]
        if available_demo:
            cleaned = cleaned.dropna(subset=available_demo, how='all')
            print(f"After demographics filter: {len(cleaned)} rows")
        
        print(f"Final cleaned dataset: {len(cleaned)} rows, {len(cleaned.columns)} features")
        if missing_cols:
            print(f"Missing columns: {missing_cols}")
            
        return cleaned

In [4]:
def create_ml_optimized(df):
    """Transform Lorin data into ML-ready features with consistent categorization"""
    
    ml_df = pd.DataFrame()
    scaler = StandardScaler()
    print(f"Creating ML features from {len(df)} rows...")
    
    # ===================================================================
    # 1. DEMOGRAPHICS (5 features) - Consistent with Wash/Oliver
    # ===================================================================
    
    # Age categories (consistent mapping: 1=youngest, 5=oldest)
    age_map = {
        '18-25': 1, '26-35': 2, '36-45': 3, 
        '46-55': 4, '56-65': 5, '66-75': 5, '>75': 5
    }
    ml_df['age_category'] = df['age_category'].map(age_map).fillna(3)
    
    # Education level (consistent 4-tier system): 1=Low, 2=Medium-Low, 3=Medium-High, 4=High
    education_map = {
        'No formal qualifications': 1,
        'Primary school': 1,
        'Secondary school': 2,
        'College': 2,
        'Technical degree': 3,
        'Undergraduate degree': 3,
        'Postgraduate degree': 4,
        'Doctoral degree': 4
    }
    ml_df['education_level'] = df['education_level'].map(education_map).fillna(2)
    
    # IT experience level (ordinal: 1=None to 4=Expert)
    it_map = {
        'No experience': 1,
        'Little experience': 2, 
        'Some experience': 3,
        'Experienced': 4
    }
    ml_df['it_experience'] = df['it_experience'].map(it_map).fillna(2)
    
    # Email frequency (ordinal: 1=Rarely to 5=Very frequently)
    email_map = {
        'Never': 1, 'Rarely': 2, 'Sometimes': 3, 
        'Often': 4, 'Very often': 5
    }
    ml_df['email_frequency'] = df['email_frequency'].map(email_map).fillna(3)
    
    # Security training prior (binary)
    training_map = {'Yes': 1, 'No': 0}
    ml_df['security_training_prior'] = df['security_training_prior'].map(training_map).fillna(0)
    
    # ===================================================================
    # 2. BIG FIVE PERSONALITY TRAITS (5 features)
    # ===================================================================
    
    personality_fields = [
        'personality_extraversion', 'personality_agreeableness',
        'personality_conscientiousness', 'personality_neuroticism', 'personality_openness'
    ]
    
    for field in personality_fields:
        if field in df.columns:
            values = df[field].fillna(df[field].mean())
            ml_df[field] = scaler.fit_transform(values.values.reshape(-1, 1)).flatten()
        else:
            ml_df[field] = 0.0
    
    # ===================================================================
    # 3. PRE-TRAINING SECURITY ATTITUDES (5 features)
    # ===================================================================
    
    security_attitude_fields = [
        'pre_security_engagement', 'pre_security_attentiveness',
        'pre_security_resistance', 'pre_security_concern', 'pre_security_attitude_total'
    ]
    
    for field in security_attitude_fields:
        if field in df.columns:
            values = df[field].fillna(df[field].mean())
            ml_df[field] = scaler.fit_transform(values.values.reshape(-1, 1)).flatten()
        else:
            ml_df[field] = 0.0
    
    # ===================================================================
    # 4. BASELINE CAPABILITIES (2 features) - Standardized
    # ===================================================================
    
    capability_fields = ['knowledge_total', 'proficiency']
    
    for field in capability_fields:
        if field in df.columns:
            values = df[field].fillna(df[field].mean())
            ml_df[field] = scaler.fit_transform(values.values.reshape(-1, 1)).flatten()
        else:
            ml_df[field] = 0.0
    
    # ===================================================================
    # 5. TARGET VARIABLES (2 features)
    # ===================================================================
    
    target_fields = ['class_phish_accuracy', 'class_nophish_accuracy']
    
    for field in target_fields:
        if field in df.columns:
            ml_df[field] = df[field].fillna(df[field].mean())
        else:
            ml_df[field] = 0.5  # Default to 50% accuracy
    
    # Convert all to numeric and fill NaN
    for col in ml_df.columns:
        ml_df[col] = pd.to_numeric(ml_df[col], errors='coerce').fillna(0)
    
    print(f"ML features: {len(ml_df)} rows, {len(ml_df.columns)} features")
    return ml_df

In [5]:
def remove_column_suffixes(df, suffixes=['_encoded', '_binary', '_standardized']):
    """Remove specified suffixes from column names for cleaner output"""
    new_columns = {}
    for col in df.columns:
        new_col = col
        for suffix in suffixes:
            if col.endswith(suffix):
                new_col = col.replace(suffix, '')
                break
        new_columns[col] = new_col
    return df.rename(columns=new_columns)

In [6]:
# === MAIN PROCESSING ===
print("=" * 60)
print("LORIN 2025 PROCESSOR - PERSONALITY-BASED CAPABILITY PREDICTOR")
print("=" * 60)

# Process data
lorin_cleaned = Lorin2025Processor.process()
lorin_ml = create_ml_optimized(lorin_cleaned)

# Save datasets
lorin_cleaned.to_csv(CLEANED_DATA_DIR / "lorin_2025_cleaned.csv", index=False)
lorin_ml_clean = remove_column_suffixes(lorin_ml)
lorin_ml_clean.to_csv(FEATURES_DIR / "lorin_2025_ml_optimized.csv", index=False)

print(f"\nCleaned dataset: {len(lorin_cleaned)} rows, {len(lorin_cleaned.columns)} columns")
print(f"ML dataset: {len(lorin_ml)} rows, {len(lorin_ml.columns)} features")
print(f"Saved to: {CLEANED_DATA_DIR} and {FEATURES_DIR}")

# === MODEL TRAINING ===
def train_model(X, y, task='regression', name=''):
    """Train and evaluate models with cross-validation"""
    
    if task == 'classification':
        models = {
            'rf': RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42),
            'lr': Pipeline([('scaler', StandardScaler()), 
                           ('model', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))])
        }
        scoring = 'accuracy'
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    else:
        models = {
            'rf': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
            'ridge': Pipeline([('scaler', StandardScaler()), ('model', Ridge(random_state=42))])
        }
        scoring = 'r2'
        cv = 5
    
    best_model, best_score, best_name = None, -np.inf, None
    
    for k, model in models.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
        mean_score = scores.mean()
        print(f"{name} | {k.upper()} | {scoring.upper()}={mean_score:.4f}")
        
        if mean_score > best_score:
            best_model, best_score, best_name = model, mean_score, k
    
    # Final evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y if task=='classification' else None, test_size=0.2, random_state=42)
    
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    
    if task == 'classification':
        print(f"Test Accuracy: {accuracy_score(y_test, yd):.4f}")
        print(classification_report(y_test, y_pred, zero_division=0))
    else:
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        print(f"Test R²: {r2:.4f}, RMSE: {rmse:.4f}")
    
    return best_model, best_score, best_name

def show_feature_importance(model, feature_names, target_name, top_n=10):
    """Display top feature importances"""
    print(f"\n--- {target_name} ---")
    
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        fi_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)
        print(fi_df.head(top_n))
        
    elif isinstance(model, Pipeline) and hasattr(model.named_steps['model'], 'coef_'):
        coefs = np.abs(model.named_steps['model'].coef_.flatten())
        fi_df = pd.DataFrame({
            'feature': feature_names,
            'coefficient': coefs
        }).sort_values('coefficient', ascending=False)
        print(fi_df.head(top_n))

# === MODEL TRAINING ===

# Define features (X) and targets (Y) based on specification
X_features = [
    # Demographics (5)
    'age_category', 'education_level', 'it_experience', 'email_frequency', 'security_training_prior',
    
    # Big Five Personality (5)
    'personality_extraversion', 'personality_agreeableness', 'personality_conscientiousness',
    'personality_neuroticism', 'personality_openness',
    
    # Pre-Training Security Attitudes (5) 
    'pre_security_engagement', 'pre_security_attentiveness', 'pre_security_resistance',
    'pre_security_concern', 'pre_security_attitude_total',
    
    # Baseline Capabilities (2)
    'knowledge_total', 'proficiency'
]

# Targets: Pre-training phishing detection capabilities
y_targets = ['class_phish_accuracy', 'class_nophish_accuracy']

# Prepare features
X = lorin_ml[X_features].fillna(0)
print(f"\nTraining models with {X.shape[1]} features, {X.shape[0]} samples")

# Train models
trained_models = {}
model_info = {}

for target in y_targets:
    if target in lorin_ml.columns and lorin_ml[target].nunique() > 1:
        y = lorin_ml[target].fillna(0)
        print(f"\n=== Training {target} ===")
        
        # Both targets are continuous accuracy scores (0-1), so use regression
        model, score, name = train_model(X, y, 'regression', target)
        model_info[target] = {'type': 'regression', 'model': name, 'score': score}
        
        trained_models[target] = model

# Show feature importances
print("\n=== FEATURE IMPORTANCES ===")
for target, model in trained_models.items():
    show_feature_importance(model, X_features, target)

# Save models individually
for name, model in trained_models.items():
    joblib.dump(model, MODELS_DIR / f"lorin_{name}_model.joblib")

# Save metadata
joblib.dump({
    'features': X_features, 
    'models': model_info,
    'feature_count': len(X_features)
}, MODELS_DIR / "lorin_metadata.joblib")

# Create prediction class
class LorinPredictor:
    def __init__(self, models, features):
        self.models = models
        self.features = features
        self.model_info = model_info
    
    def __call__(self, input_data):
        if isinstance(input_data, dict):
            input_data = pd.DataFrame([input_data])
        
        # Ensure all features present
        for f in self.features:
            if f not in input_data.columns:
                input_data[f] = 0
        
        input_data = input_data[self.features].fillna(0)
        
        predictions = {}
        for target, model in self.models.items():
            pred = model.predict(input_data)
            # All Lorin targets are regression (continuous accuracy scores)
            predictions[target] = {'prediction': float(pred[0])}
        
        return predictions

# Save predictor
lorin_predictor = LorinPredictor(trained_models, X_features)
joblib.dump(lorin_predictor, MODELS_DIR / "lorin_predictor.joblib")

print(f"\nAll models and predictor saved to: {MODELS_DIR}")

LORIN 2025 PROCESSOR - PERSONALITY-BASED CAPABILITY PREDICTOR
Processing Lorin 2025 Dataset...
Raw data: 342 rows, 267 columns
After target filter: 342 rows (removed 0)
After demographics filter: 342 rows
Final cleaned dataset: 342 rows, 19 features
Creating ML features from 342 rows...
ML features: 342 rows, 19 features

Cleaned dataset: 342 rows, 19 columns
ML dataset: 342 rows, 19 features
Saved to: data/cleaned_data and data/features

Training models with 17 features, 342 samples

=== Training class_phish_accuracy ===
class_phish_accuracy | RF | R2=0.0648
class_phish_accuracy | RIDGE | R2=0.1592
Test R²: 0.1045, RMSE: 0.2175

=== Training class_nophish_accuracy ===
class_nophish_accuracy | RF | R2=-0.1231
class_nophish_accuracy | RIDGE | R2=-0.0009
Test R²: 0.1137, RMSE: 0.1714

=== FEATURE IMPORTANCES ===

--- class_phish_accuracy ---
                          feature  coefficient
16                    proficiency     0.106795
0                    age_category     0.030045
15     