In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
# Setup paths (relative to the directory the notebook is run from)
RAW_DATA_DIR = Path("data/raw_data")          # input CSVs are read from here
CLEANED_DATA_DIR = Path("data/cleaned_data")  # processed CSVs are written here
# parents=True: on a fresh checkout the intermediate "data" folder may not exist yet,
# and a plain mkdir() would raise FileNotFoundError. exist_ok=True keeps re-runs idempotent.
CLEANED_DATA_DIR.mkdir(parents=True, exist_ok=True)

#### 5. KnowBe4 Dataset Processing  
#### **Purpose**: Extract real-world phishing response probabilities and timing patterns
#### **Key Features**: Click behavior, response timing, risk scores, department patterns

In [4]:
class KnowBe4Processor:
    """
    Processes KnowBe4 phishing simulation dataset
    Focuses on real-world click probabilities and response patterns

    The processor renames the raw export columns to snake_case feature names
    (see FEATURE_MAP), drops the date columns (TIME_COLUMNS) and normalises the
    TRUE/FALSE flag columns (BOOL_COLUMNS) to real booleans.
    """

    # File read from RAW_DATA_DIR when process() is called without a path.
    DEFAULT_FILENAME = "knowbe4_synthesized.csv"

    # Feature mapping with detailed documentation: cleaned name -> raw CSV column
    FEATURE_MAP = {
        # === PARTICIPANT IDENTIFIERS ===
        'user_id': 'User ID',  # str: Unique user identifier
        'employee_id': 'Employee ID',  # str: Employee ID number
        'email_address': 'Email',  # str: Participant email (anonymize in production)

        # === DEMOGRAPHICS ===
        'first_name': 'First Name',  # str: First name (anonymize in production)
        'last_name': 'Last Name',  # str: Last name (anonymize in production)
        'department': 'Department',  # cat: Organizational department - KEY for role-based analysis
        'job_title': 'Title',  # cat: Job title - KEY for vulnerability profiling
        'office_location': 'Location',  # cat: Geographic location
        'division': 'Division',  # cat: Business division
        'cost_center': 'Cost Center',  # cat: Cost center assignment

        # === PHISHING SIMULATION DETAILS ===
        'campaign_name': 'Campaign Name',  # cat: Specific campaign identifier
        'phishing_template': 'Template',  # cat: Template used - CRITICAL for persona responses
        'template_difficulty': 'Difficulty',  # ord: Easy/Medium/Hard - KEY for vulnerability modeling
        'campaign_type': 'Campaign Type',  # cat: Type of phishing campaign
        'industry_template': 'Industry Template',  # cat: Industry-specific template
        'email_language': 'Language',  # cat: Email language used

        # === CORE BEHAVIORAL OUTCOMES ===
        'clicked_email': 'Clicked',  # bool: TRUE/FALSE - PRIMARY outcome for personas
        'opened_email': 'Opened',  # bool: TRUE/FALSE - engagement indicator
        'replied_to_email': 'Replied',  # bool: TRUE/FALSE - deep engagement
        'opened_attachment': 'Attachment Opened',  # bool: TRUE/FALSE - high-risk behavior
        'enabled_macro': 'Macro Enabled',  # bool: TRUE/FALSE - critical security failure
        'entered_data': 'Data Entered',  # bool: TRUE/FALSE - data compromise
        'reported_email': 'Reported',  # bool: TRUE/FALSE - CRITICAL positive security behavior

        # === TIMING DATA ===
        'response_time_click': 'Time to Click (seconds)',  # num: Seconds to click - KEY for impulsivity
        'response_time_report': 'Time to Report (seconds)',  # num: Seconds to report - KEY for proactivity

        # === TECHNICAL CONTEXT ===
        'ip_address': 'IP Address',  # str: IP address (anonymize in production)
        'browser_used': 'Browser',  # cat: Browser type
        'operating_system': 'Operating System',  # cat: OS type
        'mobile_device': 'Mobile Device',  # bool: TRUE/FALSE - context for response
        'vpn_connection': 'VPN Connection',  # bool: TRUE/FALSE - security posture
        'two_factor_enabled': 'Two Factor Enabled',  # bool: TRUE/FALSE - security awareness
        'password_manager': 'Password Manager',  # bool: TRUE/FALSE - security tool usage

        # === DELIVERY & CAMPAIGN CONTEXT ===
        'delivery_status': 'Delivery Status',  # cat: Delivered/Bounced/etc
        'sent_date': 'Sent Date',  # date: When email was sent (will remove)
        'timezone': 'Time Zone',  # cat: User timezone

        # === RISK ASSESSMENT ===
        'phish_prone_percentage': 'Phish-prone Percentage',  # num: 0-100 baseline risk - KEY metric
        'current_risk_score': 'Current Risk Score',  # num: Current risk assessment - CRITICAL
        'baseline_test_score': 'Baseline Test Score',  # int: Initial knowledge test score
        'last_training_score': 'Last Training Score',  # num: Most recent training performance
        'security_proficiency': 'Security Awareness Proficiency',  # cat: Low/Med/High proficiency
        'risk_level': 'Risk Level',  # cat: High/Medium/Low risk classification - KEY

        # === BEHAVIORAL HISTORY ===
        'failure_count_12m': 'Failure Count (12 months)',  # int: Failed tests in last year - CRITICAL
        'success_count_12m': 'Success Count (12 months)',  # int: Successful detections - positive indicator
        'last_failure_date': 'Last Failure Date',  # date: Most recent failure (will remove)
        'previous_training_completed': 'Previous Training Completed',  # bool: Training history
        'training_completion_date': 'Training Completion Date',  # date: Last training (will remove)

        # === ORGANIZATIONAL CONTEXT ===
        'groups': 'Groups',  # cat: User groups/roles
        'manager_email': 'Manager Email',  # str: Manager contact (anonymize)
        'hire_date': 'Hire Date',  # date: Employment start (will remove)
        'active_directory': 'Active Directory',  # bool: AD account status
        'custom_field_1': 'Custom Field 1',  # str: Organization-specific data
        'custom_field_2': 'Custom Field 2'   # str: Organization-specific data
    }

    # Date columns that are mapped and then dropped from the cleaned output.
    TIME_COLUMNS = ['sent_date', 'last_failure_date', 'training_completion_date', 'hire_date']

    # Flag columns that must end up as booleans.
    BOOL_COLUMNS = ['clicked_email', 'opened_email', 'replied_to_email', 'opened_attachment',
                    'enabled_macro', 'entered_data', 'reported_email', 'mobile_device',
                    'vpn_connection', 'two_factor_enabled', 'password_manager',
                    'previous_training_completed', 'active_directory']

    # Text spellings accepted for each truth value (compared case-insensitively, stripped).
    _TRUE_TOKENS = frozenset({'true', 't', 'yes', 'y', '1'})
    _FALSE_TOKENS = frozenset({'false', 'f', 'no', 'n', '0'})

    @staticmethod
    def _to_bool(series):
        """
        Convert a flag column to booleans without inventing values.

        A plain ``astype(bool)`` is wrong for this data: NaN is truthy, so blanks
        (and whole columns that were absent from the CSV) would become True, and
        any non-empty string such as 'FALSE' or 'No' is truthy as well.

        Args:
            series: pandas Series holding bools, numbers, text flags and/or missing values.

        Returns:
            A Series on the same index. Dtype is plain ``bool`` when every value
            could be resolved; otherwise the nullable ``boolean`` dtype, with
            missing or unrecognised entries kept as <NA>.
        """
        if pd.api.types.is_bool_dtype(series.dtype):
            return series  # already boolean (plain or nullable) - nothing to fix

        def parse(value):
            if pd.isna(value):
                return pd.NA
            if isinstance(value, (bool, np.bool_)):
                return bool(value)
            if isinstance(value, (int, float, np.integer, np.floating)):
                return bool(value)  # numeric flags: zero is False, anything else True
            token = str(value).strip().lower()
            if token in KnowBe4Processor._TRUE_TOKENS:
                return True
            if token in KnowBe4Processor._FALSE_TOKENS:
                return False
            return pd.NA  # unknown spelling: treat as missing rather than guess

        parsed = pd.Series(
            pd.array([parse(value) for value in series], dtype="boolean"),
            index=series.index,
            name=series.name,
        )
        # Keep the historical plain-bool dtype whenever nothing is missing.
        return parsed if parsed.isna().any() else parsed.astype(bool)

    @staticmethod
    def process(input_path=None):
        """
        Load the raw KnowBe4 export and return the cleaned feature frame.

        Args:
            input_path: Optional path (str or Path) of the CSV to read. When
                omitted, ``RAW_DATA_DIR / DEFAULT_FILENAME`` is used, which is
                the original behaviour.

        Returns:
            pandas DataFrame with one column per FEATURE_MAP key except the
            TIME_COLUMNS. Raw columns that are absent from the CSV are reported
            on stdout and filled with NaN (<NA> for boolean columns).
        """
        print("Processing KnowBe4 Dataset...")
        if input_path is None:
            input_path = RAW_DATA_DIR / KnowBe4Processor.DEFAULT_FILENAME
        df = pd.read_csv(Path(input_path))
        print(f"   Raw data: {len(df)} rows, {len(df.columns)} columns")

        # Collect the renamed columns first and build the frame in one go;
        # index=df.index lets the NaN placeholders broadcast to the right length.
        selected = {}
        for new_col, old_col in KnowBe4Processor.FEATURE_MAP.items():
            if old_col in df.columns:
                selected[new_col] = df[old_col]
            else:
                print(f"Column '{old_col}' not found, setting '{new_col}' to NaN")
                selected[new_col] = np.nan
        cleaned = pd.DataFrame(selected, index=df.index)

        # Remove time-related columns
        time_columns_present = [col for col in KnowBe4Processor.TIME_COLUMNS if col in cleaned.columns]
        if time_columns_present:
            cleaned = cleaned.drop(columns=time_columns_present)
            print(f"Removed time columns: {time_columns_present}")

        # Convert boolean columns properly (missing stays missing, text flags are parsed)
        for col in KnowBe4Processor.BOOL_COLUMNS:
            if col in cleaned.columns:
                cleaned[col] = KnowBe4Processor._to_bool(cleaned[col])
                missing = int(cleaned[col].isna().sum())
                if missing:
                    print(f"   '{col}': {missing} missing/unrecognised value(s) kept as <NA>")

        print(f"Processed: {len(cleaned)} rows, {len(cleaned.columns)} features")
        return cleaned

In [5]:
# Run the KnowBe4 pipeline and persist the cleaned frame as CSV (index not written).
# NOTE(review): the saved file still contains the name, email and IP address fields
# that the feature mapping marks "anonymize in production" - confirm before sharing.
knowbe4_data = KnowBe4Processor.process()
knowbe4_output_path = CLEANED_DATA_DIR / "knowbe4_cleaned.csv"
knowbe4_data.to_csv(knowbe4_output_path, index=False)
print(f"Saved to: {knowbe4_output_path}")

Processing KnowBe4 Dataset...
   Raw data: 5000 rows, 52 columns
Removed time columns: ['sent_date', 'last_failure_date', 'training_completion_date', 'hire_date']
Processed: 5000 rows, 48 features
Saved to: data/cleaned_data/knowbe4_cleaned.csv
