# Fraud Detection Data Preprocessing Pipeline
## Tugas Besar 2 IF3070 – Dasar Inteligensi Artifisial

**Author:** AbyuDAIya-Ganbatte Team

This notebook implements a comprehensive preprocessing pipeline for fraud detection data, including:
- Missing value imputation
- One-hot encoding
- Feature scaling
- Outlier handling
- Advanced feature engineering

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide configuration: lift pandas' column-count and width limits
# (None disables each limit) and seed NumPy's global random generator.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)
np.random.seed(42)

print("Libraries imported successfully!")

: 

## 2. Load Data

In [None]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Report the size of each split, then the training column names.
for split_label, split_frame in (("Training", train), ("Test", test)):
    print(f"{split_label} data shape: {split_frame.shape}")
print(f"\nTraining data columns: {train.columns.tolist()}")

## 3. Exploratory Data Analysis

In [None]:
# Preview the head of each split; each heading is printed before its table.
for heading, frame in (
    ("First 5 rows of training data:", train),
    ("\nFirst 5 rows of test data:", test),
):
    print(heading)
    display(frame.head())

In [None]:
# Check data types and missing values
print("Training data info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
train.info()

# Per-column count of missing values in the training split; only columns
# that actually contain gaps are listed.
print("\n" + "="*50)
print("Missing values in training data:")
missing_train = train.isnull().sum()
print(missing_train[missing_train > 0])

# Same report for the test split.
print("\n" + "="*50)
print("Missing values in test data:")
missing_test = test.isnull().sum()
print(missing_test[missing_test > 0])

In [None]:
# Check target distribution
print("Target distribution:")
print(train['is_fraud'].value_counts())
print(f"\nFraud percentage: {train['is_fraud'].mean() * 100:.2f}%")

# Visualize target distribution.
# value_counts() orders classes by descending frequency, so the counts are
# re-sorted by class label (0 before 1). This guarantees the bars line up with
# the hard-coded tick labels below even if fraud were the majority class.
class_counts = train['is_fraud'].value_counts().sort_index()

plt.figure(figsize=(8, 5))
class_counts.plot(kind='bar')
plt.title('Distribution of Fraud vs Non-Fraud Cases')
plt.xlabel('is_fraud')
plt.ylabel('Count')
# Bar positions 0 and 1 now correspond to labels 0 and 1 respectively.
plt.xticks([0, 1], ['Non-Fraud', 'Fraud'], rotation=0)
plt.show()

## 4. Data Preprocessing Setup

In [None]:
# Keep the 'ID' column of each split as an array; the column itself is
# removed from the feature frames further down.
train_ids = train['ID'].values
test_ids = test['ID'].values

for split_label, id_array in (("training", train_ids), ("test", test_ids)):
    print(f"Stored {len(id_array)} {split_label} IDs")

In [None]:
# Extract the label column ('is_fraud') from the training frame as an array.
y_train = train['is_fraud'].values

for attribute_label, attribute_value in (("shape", y_train.shape), ("type", y_train.dtype)):
    print(f"Target variable {attribute_label}: {attribute_value}")

In [None]:
# Identifier columns to remove from the feature frames. errors='ignore' means
# a listed column that is absent from a frame is silently skipped.
cols_to_drop = ['ID', 'transaction_id', 'user_id']

# The training frame additionally loses the label column.
train_features = train.drop(columns=['is_fraud', *cols_to_drop], errors='ignore')
test_features = test.drop(columns=cols_to_drop, errors='ignore')

for split_label, feature_frame in (("Training", train_features), ("Test", test_features)):
    print(f"{split_label} features shape: {feature_frame.shape}")

In [None]:
# Partition the training feature columns by dtype: numeric columns versus
# object/category columns.
numerical_cols = train_features.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train_features.select_dtypes(include=['object', 'category']).columns.tolist()

# The second heading carries a leading blank line to separate the two lists.
for leading, kind_label, kind_cols in (
    ("", "Numerical", numerical_cols),
    ("\n", "Categorical", categorical_cols),
):
    print(f"{leading}{kind_label} columns ({len(kind_cols)}):")
    print(kind_cols)

## 5. Missing Value Imputation

In [None]:
# Impute missing values on a copy of the training features and remember the
# statistic used for every imputed column in `imputation_values`
# (column -> {'type': 'mean' | 'mode', 'value': fill value}).
train_filled = train_features.copy()
imputation_values = {}

# Numerical columns: fill gaps with the column mean.
for col in numerical_cols:
    if col not in train_filled.columns or not train_filled[col].isnull().any():
        continue  # nothing to impute for this column
    mean_value = train_filled[col].mean()
    train_filled[col] = train_filled[col].fillna(mean_value)
    imputation_values[col] = {'type': 'mean', 'value': mean_value}
    print(f"Imputed {col} with mean: {mean_value:.4f}")

# Categorical columns: fill gaps with the most frequent value.
for col in categorical_cols:
    if col not in train_filled.columns or not train_filled[col].isnull().any():
        continue
    modes = train_filled[col].mode()
    if modes.empty:
        continue  # no mode available (e.g. column has no non-null values); leave as is
    mode_value = modes.iloc[0]
    train_filled[col] = train_filled[col].fillna(mode_value)
    imputation_values[col] = {'type': 'mode', 'value': mode_value}
    print(f"Imputed {col} with mode: {mode_value}")

print(f"\nImputation complete. Imputed {len(imputation_values)} columns.")

In [None]:
# Apply the training-set imputation statistics to a copy of the test features.
test_filled = test_features.copy()

# Columns that had gaps in train: reuse the exact statistic stored there.
for col, info in imputation_values.items():
    if col in test_filled.columns and test_filled[col].isnull().any():
        test_filled[col] = test_filled[col].fillna(info['value'])
        print(f"Applied imputation to test set for: {col}")

# Columns that were complete in train (so no statistic was stored) can still
# have gaps in test. Fill those from the training distribution as well, rather
# than with an arbitrary constant, so test rows stay on the same scale as train.
for col in numerical_cols:
    if col in test_filled.columns and test_filled[col].isnull().any():
        fill_val = train_filled[col].mean()
        if pd.isna(fill_val):
            # Training column has no usable values at all; fall back to 0.
            fill_val = 0
        test_filled[col] = test_filled[col].fillna(fill_val)
        print(f"Applied train-mean fallback to test set for: {col}")

for col in categorical_cols:
    if col in test_filled.columns and test_filled[col].isnull().any():
        train_modes = train_filled[col].mode()
        # 'unknown' is only used when the training column has no mode at all.
        mode_val = train_modes.iloc[0] if len(train_modes) > 0 else 'unknown'
        test_filled[col] = test_filled[col].fillna(mode_val)
        print(f"Applied train-mode fallback to test set for: {col}")

print(f"\nMissing values remaining in train: {train_filled.isnull().sum().sum()}")
print(f"Missing values remaining in test: {test_filled.isnull().sum().sum()}")

## 6. Feature Engineering

### 6.1 Ratio Features

In [None]:
# Ratio features, described as (numerator column, denominator column, new column).
ratio_pairs = [
    ('transaction_amount', 'avg_transaction_amount', 'amount_vs_avg_ratio'),
    ('transaction_amount', 'std_transaction_amount', 'amount_vs_std_ratio'),
    ('transactions_last_1h', 'transactions_last_24h', 'hourly_vs_daily_ratio'),
    ('failed_login_attempts', 'num_prev_transactions', 'failed_vs_total_ratio'),
    ('shared_ip_users', 'shared_device_users', 'ip_vs_device_shared_ratio'),
]

for numerator, denominator, feature_name in ratio_pairs:
    # A ratio is only built when both source columns exist in the training frame.
    if numerator not in train_filled.columns or denominator not in train_filled.columns:
        continue

    # Same formula for train, then test. Exact zeros in the denominator are
    # swapped for 1e-10 so the division never hits zero.
    for frame in (train_filled, test_filled):
        safe_denominator = frame[denominator].replace(0, 1e-10)
        frame[feature_name] = frame[numerator] / safe_denominator

    print(f"Created ratio feature: {feature_name}")

print(f"\nTotal features after ratio creation: {train_filled.shape[1]}")

### 6.2 Log-Transformed Features

In [None]:
# Create log-transformed features for skewed distributions
log_candidates = [
    'transaction_amount', 'avg_transaction_amount', 'std_transaction_amount',
    'account_age_days', 'distance_from_home', 'num_prev_transactions'
]

for col in log_candidates:
    if col in train_filled.columns:
        # Only log transform columns that are non-negative in the training data
        min_val = train_filled[col].min()
        if min_val >= 0:
            train_filled[f'{col}_log'] = np.log1p(train_filled[col])
            # The non-negativity check above only covers train. Floor the test
            # values at 0 (the lower edge of the domain verified on train) so a
            # stray negative test value cannot turn into NaN via log1p(x < -1)
            # and silently propagate into the final feature matrix.
            test_filled[f'{col}_log'] = np.log1p(test_filled[col].clip(lower=0))
            print(f"Created log feature: {col}_log")

print(f"\nTotal features after log transformation: {train_filled.shape[1]}")

### 6.3 Interaction Features

In [None]:
# Interaction features.
# Whether a feature is built is decided from the columns of the TRAINING frame
# only; the same formula is then applied to the train frame first and the test
# frame second, so both frames gain identical columns in identical order.
_splits = (train_filled, test_filled)


def _train_has(*names):
    """Return True when every named column is present in train_filled."""
    return all(name in train_filled.columns for name in names)


# --- Risk score interactions ---
if _train_has('ip_risk_score', 'device_trust_score'):
    for frame in _splits:
        # NOTE(review): the /100 presumably assumes device_trust_score is on a 0-100 scale - confirm.
        frame['risk_interaction'] = frame['ip_risk_score'] * (1 - frame['device_trust_score'] / 100)
    print("Created: risk_interaction")

if _train_has('merchant_risk', 'country_risk'):
    for frame in _splits:
        frame['merchant_country_risk'] = frame['merchant_risk'] * frame['country_risk']
    print("Created: merchant_country_risk")

# --- Transaction amount anomaly indicator ---
# Built only when the training std column exists and has a positive mean.
if (
    _train_has('transaction_amount', 'avg_transaction_amount', 'std_transaction_amount')
    and train_filled['std_transaction_amount'].mean() > 0
):
    for frame in _splits:
        # 1e-6 keeps the denominator away from zero.
        frame['amount_zscore'] = (
            (frame['transaction_amount'] - frame['avg_transaction_amount'])
            / (frame['std_transaction_amount'] + 1e-6)
        )
    print("Created: amount_zscore")

# --- Velocity-based feature ---
if _train_has('transactions_last_24h', 'transactions_last_1h'):
    for frame in _splits:
        frame['hourly_concentration'] = frame['transactions_last_1h'] / (frame['transactions_last_24h'] + 1)
    print("Created: hourly_concentration")

# --- Account trust feature ---
if _train_has('account_age_days', 'num_prev_transactions'):
    for frame in _splits:
        frame['tx_per_day_age'] = frame['num_prev_transactions'] / (frame['account_age_days'] + 1)
    print("Created: tx_per_day_age")

# --- New location risk ---
if _train_has('is_new_country', 'distance_from_home'):
    for frame in _splits:
        frame['new_location_distance'] = frame['is_new_country'] * frame['distance_from_home']
    print("Created: new_location_distance")

# --- Failed login impact ---
if _train_has('failed_login_attempts', 'transaction_amount'):
    for frame in _splits:
        frame['failed_login_amount'] = frame['failed_login_attempts'] * frame['transaction_amount']
    print("Created: failed_login_amount")

# --- Shared resource risk: sum first, then product ---
if _train_has('shared_ip_users', 'shared_device_users'):
    for frame in _splits:
        frame['total_shared_users'] = frame['shared_ip_users'] + frame['shared_device_users']
    for frame in _splits:
        frame['shared_resource_product'] = frame['shared_ip_users'] * frame['shared_device_users']
    print("Created: total_shared_users, shared_resource_product")

# --- Time-based risk ---
if _train_has('time_of_day'):
    for frame in _splits:
        tod = frame['time_of_day']
        # Flag is 1.0 when the value lies in [0, 6] or is >= 22, else 0.0.
        frame['is_night_time'] = (((tod >= 0) & (tod <= 6)) | (tod >= 22)).astype(float)
    print("Created: is_night_time")

if _train_has('day_of_week'):
    for frame in _splits:
        # Flag is 1.0 when day_of_week >= 5.
        frame['is_weekend'] = (frame['day_of_week'] >= 5).astype(float)
    print("Created: is_weekend")

# --- Chargeback history interaction ---
if _train_has('has_chargeback_history', 'transaction_amount'):
    # The "high amount" threshold is the TRAINING median for both frames.
    train_median_amount = train_filled['transaction_amount'].median()
    for frame in _splits:
        frame['chargeback_high_amount'] = (
            frame['has_chargeback_history']
            * (frame['transaction_amount'] > train_median_amount).astype(float)
        )
    print("Created: chargeback_high_amount")

# --- Additional pairwise products: (left column, right column, new column) ---
for left_col, right_col, product_name in (
    ('ip_risk_score', 'transaction_amount', 'high_risk_high_amount'),
    ('failed_login_attempts', 'ip_risk_score', 'failed_login_risk'),
    ('is_new_country', 'transaction_amount', 'new_country_amount'),
):
    if _train_has(left_col, right_col):
        for frame in _splits:
            frame[product_name] = frame[left_col] * frame[right_col]
        print(f"Created: {product_name}")

# --- Squared risk indicators: (source column, new column) ---
for base_col, squared_name in (
    ('ip_risk_score', 'ip_risk_squared'),
    ('merchant_risk', 'merchant_risk_squared'),
    ('country_risk', 'country_risk_squared'),
):
    if _train_has(base_col):
        for frame in _splits:
            frame[squared_name] = frame[base_col] ** 2
        print(f"Created: {squared_name}")

# --- Combined risk score: row-wise mean over whichever risk columns exist ---
risk_cols = [
    risk_col
    for risk_col in ('ip_risk_score', 'merchant_risk', 'country_risk')
    if risk_col in train_filled.columns
]
if risk_cols:
    for frame in _splits:
        frame['combined_risk'] = frame[risk_cols].mean(axis=1)
    print("Created: combined_risk")

print(f"\nTotal features after interaction creation: {train_filled.shape[1]}")

In [None]:
# Recompute both column-type lists from the engineered training frame so they
# include the features added above.
numerical_cols = train_filled.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train_filled.select_dtypes(include=['object', 'category']).columns.tolist()

print("Features after engineering:")
for kind_label, kind_cols in (("Numerical", numerical_cols), ("Categorical", categorical_cols)):
    print(f"  - {kind_label}: {len(kind_cols)}")

## 7. One-Hot Encoding

In [None]:
# One-hot encode the categorical columns of the training data.
# `encoding_info` records, per encoded column, the sorted list of category
# values seen in train, so the identical dummy columns can be rebuilt later.
train_encoded = train_filled.copy()
encoding_info = {}

for col in categorical_cols:
    if col not in train_encoded.columns:
        continue

    # Distinct non-null values, in sorted order.
    categories = sorted(value for value in train_encoded[col].unique() if pd.notna(value))
    encoding_info[col] = categories

    # One 0/1 indicator column per category, named "<column>_<category>".
    for category in categories:
        train_encoded[f"{col}_{category}"] = (train_encoded[col] == category).astype(int)

    # The source column is replaced by its indicators.
    train_encoded = train_encoded.drop(columns=[col])
    print(f"Encoded {col}: {len(categories)} categories")

print(f"\nShape after encoding: {train_encoded.shape}")

In [None]:
# Rebuild the training-time dummy columns on the test data. Only categories
# recorded in `encoding_info` get a column, so a test value that is not in that
# list ends up with 0 in every indicator of its source column.
test_encoded = test_filled.copy()

for col, categories in encoding_info.items():
    if col not in test_encoded.columns:
        continue

    for category in categories:
        test_encoded[f"{col}_{category}"] = (test_encoded[col] == category).astype(int)

    test_encoded = test_encoded.drop(columns=[col])

print(f"Test shape after encoding: {test_encoded.shape}")

In [None]:
# Give the train and test frames exactly the same columns, in the same order.
train_cols = set(train_encoded.columns)
test_cols = set(test_encoded.columns)

# Columns present only in train are added to test as all-zero columns.
for col in train_cols.difference(test_cols):
    test_encoded[col] = 0
    print(f"Added missing column to test: {col}")

# Columns present only in test are added to train as all-zero columns.
for col in test_cols.difference(train_cols):
    train_encoded[col] = 0
    print(f"Added missing column to train: {col}")

# Alphabetical column order, shared by both frames.
all_cols = sorted(train_encoded.columns)
train_encoded = train_encoded.loc[:, all_cols]
test_encoded = test_encoded.loc[:, all_cols]

print(f"\nFinal shapes:")
for split_label, encoded_frame in (("Train", train_encoded), ("Test", test_encoded)):
    print(f"  {split_label}: {encoded_frame.shape}")

## 8. Outlier Handling

In [None]:
# Clip outliers using percentile-based winsorization.
# Bounds are learned from the training matrix only and stored per column in
# `clip_bounds` so the identical limits can be applied to the test matrix.
train_values = train_encoded.values.copy()
test_values = test_encoded.values.copy()

n_features = train_values.shape[1]
clip_bounds = {'lower': [], 'upper': []}

lower_percentile = 1
upper_percentile = 99

n_unclipped = 0
for i in range(n_features):
    column = train_values[:, i]

    if np.unique(column).size <= 2:
        # Indicator / constant column (one-hot dummies, binary flags).
        # Percentile clipping would wipe out any flag that is set in fewer than
        # 1% of rows (its 99th percentile is 0, so every 1 would become 0), which
        # destroys exactly the rare signals a fraud model needs. Use the observed
        # min/max as bounds so the column passes through unchanged.
        lower = column.min()
        upper = column.max()
        n_unclipped += 1
    else:
        lower = np.percentile(column, lower_percentile)
        upper = np.percentile(column, upper_percentile)

    clip_bounds['lower'].append(lower)
    clip_bounds['upper'].append(upper)

    train_values[:, i] = np.clip(column, lower, upper)

print(f"Clipped outliers in training data using {lower_percentile}th and {upper_percentile}th percentiles")
print(f"Left {n_unclipped} binary/constant columns unclipped")
print(f"Shape: {train_values.shape}")

In [None]:
# Clip each test column to the bounds that were learned on the training data.
for column_idx, (low, high) in enumerate(zip(clip_bounds['lower'], clip_bounds['upper'])):
    test_values[:, column_idx] = np.clip(test_values[:, column_idx], low, high)

print("Applied outlier clipping to test data")
print(f"Shape: {test_values.shape}")

## 9. Feature Scaling (Standardization)

In [None]:
# Per-column mean and standard deviation, taken from the training matrix only.
means = train_values.mean(axis=0)
stds = train_values.std(axis=0)

# A column with zero spread would cause division by zero during scaling;
# use a divisor of 1 for it instead.
stds[stds == 0] = 1.0

print("Computed scaling parameters:")
for stat_label, stat_values in (("Mean", means), ("Std", stds)):
    print(f"  {stat_label} range: [{stat_values.min():.4f}, {stat_values.max():.4f}]")

In [None]:
# Z-score both matrices with the training means/stds computed above.
X_train, X_test = ((values - means) / stds for values in (train_values, test_values))

print("Standardization complete!")
print(f"\nX_train shape: {X_train.shape}")
for array_label, array in (("X_test", X_test), ("y_train", y_train)):
    print(f"{array_label} shape: {array.shape}")

# Sanity check over the whole standardized training matrix.
print("\nX_train statistics:")
print(f"  Mean: {X_train.mean():.6f} (should be ~0)")
print(f"  Std: {X_train.std():.6f} (should be ~1)")

## 10. Final Data Summary

In [None]:
# Final report: array shapes, sample/feature counts and the class balance.
banner = "=" * 60
print(banner, "PREPROCESSING COMPLETE", banner, sep="\n")

print("\nFinal Dataset Shapes:")
for array_label, array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("test_ids", test_ids),
):
    print(f"  {array_label}: {array.shape}")

print(f"\nNumber of features: {X_train.shape[1]}")
print(f"Number of training samples: {X_train.shape[0]}")
print(f"Number of test samples: {X_test.shape[0]}")

# Count and percentage of training rows per label value.
print("\nTarget distribution:")
for class_label, class_value in (("Non-fraud", 0), ("Fraud", 1)):
    in_class = (y_train == class_value)
    print(f"  {class_label}: {in_class.sum()} ({in_class.mean()*100:.2f}%)")

## 11. Save Preprocessed Data (Optional)

In [None]:
# Persist the final arrays as .npy files in the working directory.
# Mapping: output file name -> array written to it (insertion order is kept).
saved_arrays = {
    'X_train_preprocessed.npy': X_train,
    'y_train_preprocessed.npy': y_train,
    'X_test_preprocessed.npy': X_test,
    'test_ids.npy': test_ids,
}
for file_name, array in saved_arrays.items():
    np.save(file_name, array)

print("Preprocessed data saved successfully!")
for file_name in saved_arrays:
    print(f"  - {file_name}")

In [None]:
# Collect every fitted preprocessing parameter into plain Python types and
# write them to preprocessing_params.json.
def _as_json_scalar(value):
    """Return numeric values as float and anything else as str."""
    if isinstance(value, (int, float, np.number)):
        return float(value)
    return str(value)


preprocessing_params = {
    # Fill value (and its kind) for each column imputed on the training data.
    'imputation_values': {
        col: {'type': info['type'], 'value': _as_json_scalar(info['value'])}
        for col, info in imputation_values.items()
    },
    # Category list per one-hot encoded column, stringified.
    'encoding_info': {
        col: [str(category) for category in categories]
        for col, categories in encoding_info.items()
    },
    # Per-column clipping limits as plain floats.
    'clip_bounds': {
        side: [float(bound) for bound in clip_bounds[side]]
        for side in ('lower', 'upper')
    },
    'scaling_params': {'means': means.tolist(), 'stds': stds.tolist()},
    'feature_names': all_cols,
}

with open('preprocessing_params.json', 'w') as params_file:
    json.dump(preprocessing_params, params_file, indent=2)

print("Preprocessing parameters saved to preprocessing_params.json")

## 12. Visualize Feature Distributions (Sample)

In [None]:
# Histogram of up to the first 6 standardized training features on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# zip stops at whichever runs out first: the six panels or the feature columns.
for feature_idx, panel in zip(range(X_train.shape[1]), axes):
    panel.hist(X_train[:, feature_idx], bins=50, alpha=0.7, edgecolor='black')
    panel.set_title(f'Feature {feature_idx}')
    panel.set_xlabel('Value (standardized)')
    panel.set_ylabel('Frequency')
    panel.grid(alpha=0.3)

plt.tight_layout()
# y=1.02 places the overall title just above the top row of panels.
plt.suptitle('Distribution of First 6 Standardized Features', y=1.02, fontsize=14)
plt.show()

## 13. Data Ready for Modeling

The preprocessed data is now ready to be used with machine learning models. You have:

- **X_train**: Preprocessed training features (standardized)
- **y_train**: Training labels
- **X_test**: Preprocessed test features (standardized)
- **test_ids**: Test sample IDs for submission

All preprocessing steps performed:
1. ✅ Missing value imputation
2. ✅ Feature engineering (ratios, logs, interactions)
3. ✅ One-hot encoding
4. ✅ Outlier clipping
5. ✅ Standardization (z-score normalization)

You can now proceed to train your machine learning models!