## Feature engineering

In [9]:
import ipaddress

import numpy as np
import pandas as pd

# Read the two input tables from ../data: the cleaned transactions (with both
# timestamp columns parsed as datetimes) and the IP-range-to-country lookup.
fraud_data_clean = pd.read_csv(
    '../data/fraud_data_clean.csv',
    parse_dates=['purchase_time', 'signup_time'],
)
ip_country = pd.read_csv(
    '../data/IpAddress_to_Country.csv',
)

def _dotted_quad_to_number(value):
    """Return the numeric value of a dotted-quad IPv4 string as a float.

    Anything that is not a valid IPv4 string yields NaN so the caller can
    treat it as "no usable IP".
    """
    if not isinstance(value, str):
        return np.nan
    try:
        return float(int(ipaddress.IPv4Address(value.strip())))
    except ValueError:  # AddressValueError is a ValueError subclass
        return np.nan


def _ip_to_numeric(ip_series):
    """Convert a column of IP values to whole-number float64 values.

    Numeric values and numeric strings are truncated toward zero; dotted-quad
    strings ('a.b.c.d') are converted to their 32-bit integer value. Entries
    that cannot be interpreted become NaN.
    """
    numeric = pd.to_numeric(ip_series, errors='coerce')
    values = numeric.to_numpy(dtype='float64', copy=True, na_value=np.nan)

    # Entries that were present but not plainly numeric may be dotted quads.
    unresolved = np.isnan(values) & ip_series.notna().to_numpy()
    if unresolved.any():
        raw = ip_series.to_numpy(dtype=object)
        values[unresolved] = [_dotted_quad_to_number(v) for v in raw[unresolved]]

    return np.trunc(values)


def _map_ips_to_countries(ip_values, ip_country):
    """Return an object array holding the country for each IP (None if no range matches).

    Uses a binary search over the ranges sorted by lower bound instead of
    scanning every range for every row.
    NOTE(review): this assumes the ranges in ``ip_country`` do not overlap;
    with overlapping ranges the range with the greatest lower bound wins.
    """
    ranges = pd.DataFrame({
        'lower': ip_country['lower_bound_ip_address'].astype('int64').to_numpy(),
        'upper': ip_country['upper_bound_ip_address'].astype('int64').to_numpy(),
        'country': ip_country['country'].to_numpy(dtype=object),
    }).sort_values('lower', kind='stable')

    countries = np.full(len(ip_values), None, dtype=object)
    if ranges.empty:
        return countries

    lower = ranges['lower'].to_numpy()
    upper = ranges['upper'].to_numpy()
    names = ranges['country'].to_numpy(dtype=object)

    # Index of the last range whose lower bound is <= ip (-1 if there is none).
    slot = np.searchsorted(lower, ip_values, side='right') - 1
    candidate = np.clip(slot, 0, None)
    # NaN IPs compare False here, so they stay None.
    matched = (slot >= 0) & (ip_values <= upper[candidate])
    countries[matched] = names[candidate[matched]]
    return countries


def engineer_fraud_features(df: pd.DataFrame, ip_country: pd.DataFrame) -> pd.DataFrame:
    """
    Build the fraud-detection feature table from cleaned transaction data.

    The inputs are not modified; all work happens on a copy of ``df``.

    Features produced:
        * ``country`` from the IP address, then one-hot encoded
        * ``hour_of_day``, ``day_of_week`` of the purchase
        * ``time_since_signup`` in hours
        * ``user_transaction_count`` (rows per ``user_id``)
        * ``time_since_last_txn`` in seconds (NaN for a user's first row)
        * ``users_per_device`` and ``avg_device_spend`` per ``device_id``
        * one-hot columns for ``source``, ``browser``, ``sex``, ``country``
          (those present), with categories under 1% frequency merged
          into ``'OTHER'``

    Args:
        df: Cleaned e-commerce transaction data. ``ip_address`` may hold
            numeric values, numeric strings, or dotted-quad IPv4 strings.
        ip_country: IP address to country mapping data with columns
            ``lower_bound_ip_address``, ``upper_bound_ip_address``, ``country``.

    Returns:
        DataFrame with engineered features for fraud detection. Rows are
        ordered by ``user_id`` then ``purchase_time``; the identifier,
        timestamp and IP columns are dropped.

    Raises:
        ValueError: If a required column is missing from ``df``.
    """

    # Ensure necessary columns are present
    required_columns = ['ip_address', 'purchase_time', 'signup_time', 'user_id', 'device_id', 'purchase_value']
    for column in required_columns:
        if column not in df.columns:
            raise ValueError(f"Missing column: {column}")

    # Work on a copy so the caller's frame does not gain or change columns
    df = df.copy()

    # Geolocation: numeric IP -> country via range lookup
    df['ip_address'] = _ip_to_numeric(df['ip_address'])
    df['country'] = _map_ips_to_countries(df['ip_address'].to_numpy(), ip_country)

    # Temporal features
    df['hour_of_day'] = df['purchase_time'].dt.hour
    df['day_of_week'] = df['purchase_time'].dt.dayofweek
    df['time_since_signup'] = (df['purchase_time'] - df['signup_time']).dt.total_seconds() / 3600

    # Behavioral features
    df['user_transaction_count'] = df['user_id'].map(df['user_id'].value_counts())

    # Sort first so diff() measures the gap to the same user's previous purchase
    df = df.sort_values(['user_id', 'purchase_time'])
    df['time_since_last_txn'] = df.groupby('user_id')['purchase_time'].diff().dt.total_seconds()

    # Device usage patterns
    device_stats = df.groupby('device_id').agg({
        'user_id': 'nunique',
        'purchase_value': 'mean'
    }).rename(columns={
        'user_id': 'users_per_device',
        'purchase_value': 'avg_device_spend'
    })
    df = df.merge(device_stats, on='device_id', how='left')

    # One-hot encoding, after folding rare categories (<1%) into 'OTHER'
    categorical = [col for col in ('source', 'browser', 'sex', 'country') if col in df.columns]
    for col in categorical:
        freq = df[col].value_counts(normalize=True)
        rare = freq[freq < 0.01].index
        df[col] = np.where(df[col].isin(rare), 'OTHER', df[col])

    df = pd.get_dummies(df, columns=categorical, drop_first=True, prefix_sep=':')

    # Feature selection: drop identifiers and raw columns already encoded above
    cols_to_drop = ['user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address']
    df = df.drop(columns=cols_to_drop, errors='ignore')

    return df

# Build the engineered feature table from the two loaded inputs
fraud_data_fe = engineer_fraud_features(
    df=fraud_data_clean,
    ip_country=ip_country,
)

# Write the result as a gzip-compressed CSV, leaving out the row index
fraud_data_fe.to_csv(
    '../data/fraud_data_fe.csv.gz',
    compression='gzip',
    index=False,
)

# Optional: Credit card dataset handling
# credit_data_clean = pd.read_csv('path_to_credit_data.csv')  # <-- uncomment and fix path if needed
# X_credit = credit_data_clean.drop('Class', axis=1)
# y_credit = credit_data_clean['Class']
