In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory layout: raw inputs are read from RAW_DATA_DIR; the remaining three
# directories receive the artefacts this notebook writes.
RAW_DATA_DIR = Path("data/raw_data")
CLEANED_DATA_DIR = Path("data/cleaned_data")
FEATURES_DIR = Path("data/features")
MODELS_DIR = Path("models")

# Create every output directory up front (RAW_DATA_DIR is not created here).
for path in (CLEANED_DATA_DIR, FEATURES_DIR, MODELS_DIR):
    path.mkdir(parents=True, exist_ok=True)

## OLIVER et al. (2022) Dataset Processing
### Step 1: Cleaning and renaming columns to readable feature names

In [3]:
class Oliver2022Processor:
    """Loads the raw Oliver 2022 CSV and exposes the columns used downstream under readable names."""

    @staticmethod
    def process():
        """Read the raw CSV, keep the mapped columns, and drop rows that cannot be used.

        Returns:
            DataFrame with one column per readable feature name. Rows missing
            age or gender, or missing both test-percentage scores, are removed.
            A source column that is absent from the CSV becomes an all-NaN column.
        """
        print("Processing Oliver 2022 Dataset...")

        source_file = RAW_DATA_DIR / "phishing_oliver_2022.csv"

        # The file's encoding is not known in advance: use the first candidate that decodes.
        for encoding in ['utf-8', 'cp1252', 'iso-8859-1', 'latin1']:
            try:
                raw = pd.read_csv(source_file, encoding=encoding)
            except UnicodeDecodeError:
                continue
            else:
                print(f"Loaded with {encoding} encoding: {len(raw)} rows, {len(raw.columns)} columns")
                break

        # readable name -> column name in the raw CSV
        column_map = {
            # Demographics
            'age': 'DE02_01',
            'gender': 'Sex',
            'education_level': 'Edu1',
            'employment_status': 'Job',
            # NOTE(review): 'Anstllung' looks like a possible misspelling of
            # 'Anstellung' - confirm against the CSV header.
            'employment_type': 'Anstllung',

            # IT background
            'it_job_status': 'ITSJOB',
            'phishing_victim_count': 'Phish_Vic_Count',

            # PMT constructs
            'perceived_knowledge': 'Per_Know',
            'perceived_self_efficacy': 'Per_SE',
            'perceived_severity': 'Per_Sev',
            'perceived_vulnerability': 'Per_Vuln',
            'email_trust': 'E_Trust',

            # Performance measures
            'knowledge_test_percent_correct': 'correct_percent_kt',
            'phishing_test_percent_correct': 'correct_percent_pt',
            'knowledge_test_total_correct': 'correct_total_kt',
            'phishing_test_total_correct': 'correct_total_pt',
        }

        # Copy each mapped column across; absent source columns are filled with NaN.
        cleaned = pd.DataFrame()
        for readable, source in column_map.items():
            cleaned[readable] = raw[source] if source in raw.columns else np.nan

        # Quality filters: both demographics are required, and at least one test score.
        rows_before = len(cleaned)
        cleaned = (
            cleaned
            .dropna(subset=['age', 'gender'], how='any')
            .dropna(
                subset=['knowledge_test_percent_correct', 'phishing_test_percent_correct'],
                how='all',
            )
        )

        print(f"After filtering: {len(cleaned)} rows (removed {rows_before - len(cleaned)})")
        return cleaned

### Step 2: Feature Engineering

##### **Purpose**: Extract competence assessment patterns from Protection Motivation Theory data

##### **Unique Value**: Controlled assessments measuring performance-perception gaps in phishing detection
##### **Research Questions Enabled**:
1. How WELL can someone detect phishing vs. how confident they feel?
2. What psychological factors predict actual phishing detection accuracy? 
3. Can we identify overconfidence patterns in cybersecurity skills?
4. Which PMT constructs best predict objective performance?

#### **X_inputs**:
1. **Demographics (4)**: Age categories, gender, education level, employment status
2. **IT Background (3)**: IT job frequency, previous phishing victimization (binary + count)  
3. **PMT Psychological Constructs (5)**: Perceived knowledge, self-efficacy, severity, vulnerability, email trust

#### **Y_targets**:
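1. **Primary**: Phishing detection accuracy (0-100% in the cleaned data; z-scored in the ML feature table) - objective performance measure
2. **Secondary**: Security knowledge test accuracy (0-100% in the cleaned data; z-scored in the ML feature table) - foundational knowledge measure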

In [4]:
def create_ml_optimized(df):
    """Build the all-numeric modelling table from the cleaned Oliver frame.

    Args:
        df: Cleaned frame. The columns age, gender, education_level,
            employment_type, it_job_status and phishing_victim_count are read
            unconditionally; the PMT and test-percentage columns are optional
            and become 0.0 when absent.

    Returns:
        DataFrame with 14 numeric columns: 4 recoded demographics, 3
        IT-background features, 5 z-scored PMT constructs and 2 z-scored
        test-percentage columns. Unmapped or missing values fall back to the
        defaults given below.
    """
    scaler = StandardScaler()

    def zscore(series):
        # Refit the scaler on this single column and hand back a flat array.
        return scaler.fit_transform(series.values.reshape(-1, 1)).flatten()

    def recode(column, mapping, default):
        # Map raw survey codes through `mapping`; unmapped/missing codes get `default`.
        return df[column].map(mapping).fillna(default)

    ml_df = pd.DataFrame()
    print(f"Creating ML features from {len(df)} rows...")

    # --- Demographics (4 columns) ---
    # Age is bucketed into five ordered bands (1=youngest, 5=oldest); ages that
    # fall outside 0-100 or are missing get the middle band.
    ml_df['age_category'] = pd.cut(
        df['age'],
        bins=[0, 25, 35, 55, 75, 100],
        labels=[1, 2, 3, 4, 5],
        include_lowest=True,
    ).fillna(3)

    # Gender: raw code 1 -> 0, raw code 2 -> 1, anything else -> 0.
    # NOTE(review): the earlier comments labelled the result "1=Male, 0=Female";
    # which raw code means which gender is not visible here - confirm against the codebook.
    ml_df['gender'] = recode('gender', {1: 0, 2: 1}, 0)

    # Education, collapsed to four ordered levels.
    # Raw codes: 1=Dropped out, 3=Elementary, 4=Secondary, 5=Apprenticeship,
    # 6=Vocational, 7=A-levels, 8=College/University, 9=Still in school, 10=Other
    education_levels = {
        1: 1, 9: 1,        # low
        3: 2, 4: 2, 10: 2, # medium-low (includes "Other")
        5: 3, 6: 3, 7: 3,  # medium-high
        8: 4,              # high
    }
    ml_df['education_level'] = recode('education_level', education_levels, 2)

    # Employment (binary, derived from employment_type): 1=working, 0=not working.
    # Raw codes: 1=High Schooler, 2=Apprenticeship, 3=Student, 4=Employee,
    # 5=Public Servant, 6=Self-Employed, 7=Unemployed, 8=Other
    working_flag = {
        1: 0, 2: 0, 3: 0, 7: 0, 8: 0,
        4: 1, 5: 1, 6: 1,
    }
    ml_df['employment_status'] = recode('employment_type', working_flag, 0)

    # --- IT background (3 columns) ---
    # Raw codes 1-3 count as regular IT work, 4-5 as little/none.
    ml_df['it_job'] = recode('it_job_status', {1: 1, 2: 1, 3: 1, 4: 0, 5: 0}, 0)
    ml_df['phishing_victim'] = df['phishing_victim_count'].gt(0).astype(int)
    ml_df['phishing_victim_count'] = zscore(df['phishing_victim_count'].fillna(0))

    # --- PMT constructs (5 columns) and performance measures (2 columns) ---
    # Each is mean-imputed, then z-scored; a column missing from df becomes 0.0.
    standardized_columns = [
        'perceived_knowledge', 'perceived_self_efficacy', 'perceived_severity',
        'perceived_vulnerability', 'email_trust',
        'knowledge_test_percent_correct', 'phishing_test_percent_correct',
    ]
    for column in standardized_columns:
        if column not in df.columns:
            ml_df[column] = 0.0
            continue
        ml_df[column] = zscore(df[column].fillna(df[column].mean()))

    # Final pass: force every column to a numeric dtype, replacing leftovers with 0.
    for column in list(ml_df.columns):
        ml_df[column] = pd.to_numeric(ml_df[column], errors='coerce').fillna(0)

    print(f"ML features: {len(ml_df)} rows, {len(ml_df.columns)} features")
    return ml_df

In [5]:
def remove_column_suffixes(df, suffixes=('_encoded', '_binary', '_standardized')):
    """Return a copy of ``df`` whose column names have a trailing suffix stripped.

    Only the first matching suffix is removed from each name, and only at the
    end of the name: an identical substring earlier in the name is preserved.
    Non-string column labels and empty suffixes are left alone.

    Args:
        df: DataFrame whose columns should be renamed. It is not modified.
        suffixes: Iterable of suffix strings, checked in order.

    Returns:
        A renamed DataFrame (the result of ``df.rename``).
    """
    new_columns = {}

    for col in df.columns:
        new_col = col
        if isinstance(col, str):
            for suffix in suffixes:
                # Slice instead of str.replace: replace() would also delete
                # occurrences of the suffix in the middle of the name.
                if suffix and col.endswith(suffix):
                    new_col = col[:-len(suffix)]
                    break
        new_columns[col] = new_col

    return df.rename(columns=new_columns)

In [6]:
# === MODEL TRAINING ===
def train_model(X, y, task='classification', name=''):
    """Cross-validate two candidate models, keep the better one, and report a hold-out score.

    Args:
        X: Feature matrix.
        y: Target vector.
        task: 'classification' selects the classifier pair (accuracy, stratified
            5-fold); any other value selects the regressor pair (R², 5-fold).
        name: Label prefixed to each printed cross-validation line.

    Returns:
        (best_model, best_score, best_name): the winning estimator fitted on
        the 80% training split only, its mean cross-validation score, and its
        short key ('rf', 'lr' or 'ridge').
    """
    is_classification = task == 'classification'

    if is_classification:
        scoring = 'accuracy'
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        models = {
            'rf': RandomForestClassifier(
                n_estimators=100, max_depth=10, class_weight='balanced', random_state=42
            ),
            'lr': Pipeline([
                ('scaler', StandardScaler()),
                ('model', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)),
            ]),
        }
    else:
        scoring = 'r2'
        cv = 5
        models = {
            'rf': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
            'ridge': Pipeline([
                ('scaler', StandardScaler()),
                ('model', Ridge(random_state=42)),
            ]),
        }

    # Pick the candidate with the highest mean CV score (first one wins ties).
    best_model, best_score, best_name = None, -np.inf, None
    for key, candidate in models.items():
        mean_score = cross_val_score(
            candidate, X, y, cv=cv, scoring=scoring, n_jobs=-1
        ).mean()
        print(f"{name} | {key.upper()} | {scoring.upper()}={mean_score:.4f}")

        if mean_score > best_score:
            best_model, best_score, best_name = candidate, mean_score, key

    # Hold-out evaluation: 80/20 split, stratified only for classification.
    stratify_on = y if is_classification else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=stratify_on, test_size=0.2, random_state=42
    )

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    if is_classification:
        print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print(classification_report(y_test, y_pred, zero_division=0))
    else:
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f"Test R²: {r2:.4f}, RMSE: {rmse:.4f}")

    return best_model, best_score, best_name

def show_feature_importance(model, feature_names, target_name, top_n=10):
    """Print the ``top_n`` most influential features of a fitted model.

    Tree-style models are ranked by ``feature_importances_``; linear models by
    the absolute value of ``coef_``. When ``coef_`` is 2-D (one row per class
    or per target) the absolute coefficients are averaged over rows so there
    is exactly one value per feature; for a single row this equals the plain
    absolute coefficients.

    Args:
        model: A fitted estimator, or a pipeline-like object exposing
            ``named_steps`` (the step named 'model' is inspected, falling
            back to the last step).
        feature_names: Feature labels, in the column order the model was fitted on.
        target_name: Label printed in the section header.
        top_n: Number of rows to print.

    Returns:
        None. Output is printed.
    """
    print(f"\n--- {target_name} ---")

    # Unwrap pipelines by duck typing so any object with named_steps works.
    estimator = model
    steps = getattr(model, "named_steps", None)
    if steps is not None:
        estimator = steps["model"] if "model" in steps else model.steps[-1][1]

    if hasattr(estimator, "feature_importances_"):
        column = 'importance'
        weights = np.asarray(estimator.feature_importances_)
    elif hasattr(estimator, "coef_"):
        column = 'coefficient'
        # atleast_2d + mean collapses (n_rows, n_features) to (n_features,);
        # a bare flatten() would yield n_rows * n_features values and no
        # longer line up with feature_names.
        weights = np.abs(np.atleast_2d(estimator.coef_)).mean(axis=0)
    else:
        # Say so explicitly instead of printing an empty section.
        print(f"No feature importances available for {type(estimator).__name__}")
        return

    fi_df = pd.DataFrame({
        'feature': list(feature_names),
        column: weights
    }).sort_values(column, ascending=False)
    print(fi_df.head(top_n))


In [7]:
# === MAIN PROCESSING ===

# Step 1: Clean data
# oliver_cleaned keeps the raw survey values under readable names; oliver_ml is
# the all-numeric table in which the PMT constructs, the victim count and both
# test-percentage columns have been z-scored by create_ml_optimized.
oliver_cleaned = Oliver2022Processor.process()
oliver_ml = create_ml_optimized(oliver_cleaned)

# Step 2: Save datasets
# Both tables are written without the index column.
oliver_cleaned.to_csv(CLEANED_DATA_DIR / "oliver_2022_cleaned.csv", index=False)
oliver_ml.to_csv(FEATURES_DIR / "oliver_2022_ml_optimized.csv", index=False)

print(f"\nCleaned: {len(oliver_cleaned)} rows, {len(oliver_cleaned.columns)} columns")
print(f"ML-ready: {len(oliver_ml)} rows, {len(oliver_ml.columns)} features")

# Step 3: Train models
# Define features and targets based on Oliver model requirements
# The 12 predictors: 4 demographics, 3 IT-background features, 5 PMT constructs.
X_features = [
    'age_category', 'gender', 'education_level', 'employment_status',
    'it_job', 'phishing_victim', 'phishing_victim_count',
    'perceived_knowledge', 'perceived_self_efficacy', 'perceived_severity', 
    'perceived_vulnerability', 'email_trust'
]

# Targets: Performance measures (regression)
# These columns are z-scored in oliver_ml, so the RMSE printed by train_model
# is in standard-deviation units rather than percentage points.
y_targets = ['phishing_test_percent_correct', 'knowledge_test_percent_correct']

# Prepare features
X = oliver_ml[X_features].fillna(0)
print(f"\nTraining models with {X.shape[1]} features, {X.shape[0]} samples")

# Train models
# trained_models: target name -> fitted estimator
# model_info:     target name -> {'type', 'model' (short key), 'score' (mean CV R²)}
trained_models = {}
model_info = {}

for target in y_targets:
    # Skip targets that are absent or constant - there is nothing to regress on.
    if target in oliver_ml.columns and oliver_ml[target].nunique() > 1:
        y = oliver_ml[target].fillna(0)
        print(f"\n=== Training {target} ===")
        
        # The returned model has been fitted on train_model's 80% training split.
        model, score, name = train_model(X, y, 'regression', target)
        trained_models[target] = model
        model_info[target] = {'type': 'regression', 'model': name, 'score': score}

# Show feature importances
print("\n=== FEATURE IMPORTANCES ===")
for target, model in trained_models.items():
    show_feature_importance(model, X_features, target)

# Save models individually
# One joblib file per target, named after the target column.
for name, model in trained_models.items():
    joblib.dump(model, MODELS_DIR / f"oliver_{name}_model.joblib")

# Save metadata
# Records the feature order the models expect plus the per-target model summary.
joblib.dump({
    'features': X_features, 
    'models': model_info,
    'feature_count': len(X_features)
}, MODELS_DIR / "oliver_metadata.joblib")

# Create prediction class
class OliverPredictor:
    """Callable bundle of per-target models and the feature order they expect.

    Inputs must already be on the scale the models were fitted on; this class
    performs no recoding or standardization of its own.

    Attributes are documented in ``__init__``.
    """

    def __init__(self, models, features, model_info=None):
        """Store the models and their metadata.

        Args:
            models: Mapping of target name -> fitted estimator with ``predict``.
            features: Ordered list of feature column names the models expect.
            model_info: Optional mapping of target name -> metadata dict. When
                omitted, a module-level ``model_info`` is used if one exists
                (the previous implicit behaviour), otherwise an empty dict.
        """
        self.models = models
        self.features = features
        if model_info is None:
            # Backward compatibility: this class used to read the notebook-level
            # `model_info` unconditionally, which raised NameError whenever that
            # global was not defined.
            model_info = globals().get('model_info', {})
        self.model_info = model_info

    def __call__(self, input_data):
        """Predict every target for a single observation.

        Args:
            input_data: A dict of feature values, or a DataFrame. Missing
                features and NaN values are replaced with 0. The caller's
                DataFrame is not modified.

        Returns:
            ``{target: {'prediction': float}}``. Only the first row of
            ``input_data`` contributes to the result.
        """
        if isinstance(input_data, dict):
            input_data = pd.DataFrame([input_data])

        # reindex builds a new frame in the expected column order, so absent
        # columns are added without writing into the caller's DataFrame.
        model_input = input_data.reindex(columns=self.features, fill_value=0).fillna(0)

        predictions = {}
        for target, model in self.models.items():
            pred = model.predict(model_input)
            predictions[target] = {'prediction': float(pred[0])}

        return predictions

# Save predictor
# Bundle the trained models with the feature order and persist the bundle.
# NOTE(review): OliverPredictor is defined inside this notebook, so the dumped
# file presumably stores only a reference to the class; loading it elsewhere
# would then need the same class definition available - confirm before reuse.
oliver_predictor = OliverPredictor(trained_models, X_features)
joblib.dump(oliver_predictor, MODELS_DIR / "oliver_predictor.joblib")

print(f"\nAll models and predictor saved to: {MODELS_DIR}")

Processing Oliver 2022 Dataset...
Loaded with cp1252 encoding: 296 rows, 84 columns
After filtering: 288 rows (removed 8)
Creating ML features from 288 rows...
ML features: 288 rows, 14 features

Cleaned: 288 rows, 16 columns
ML-ready: 288 rows, 14 features

Training models with 12 features, 288 samples

=== Training phishing_test_percent_correct ===
phishing_test_percent_correct | RF | R2=-0.1262
phishing_test_percent_correct | RIDGE | R2=-0.0197
Test R²: 0.0017, RMSE: 0.9105

=== Training knowledge_test_percent_correct ===
knowledge_test_percent_correct | RF | R2=0.0518
knowledge_test_percent_correct | RIDGE | R2=-0.0115
Test R²: -0.2122, RMSE: 1.1029

=== FEATURE IMPORTANCES ===

--- phishing_test_percent_correct ---
                    feature  coefficient
0              age_category     0.313414
8   perceived_self_efficacy     0.151786
2           education_level     0.099393
3         employment_status     0.071690
10  perceived_vulnerability     0.040978
4                    it_