In [None]:
# Keep an independent copy of the frame as it stands at this point
df_original = df.copy()

# Separate the label column from the predictor columns
y = df['Class']
X = df.drop(columns=['Class'])

# Report the dimensions of both pieces
print(f"Features shape: {X.shape}")
# Output: Features shape: (284807, 30)
print(f"Target shape: {y.shape}")
# Output: Target shape: (284807,)

## Summary and Next Steps

In this notebook, we performed feature engineering for credit card fraud detection:

1. **Created time-based features**:
   - Converted Time to hours and days
   - Created cyclical time features to capture patterns throughout the day

2. **Created amount-based features**:
   - Applied log and square root transformations
   - Binned amount into categories

3. **Created aggregated features**:
   - Rolling window statistics based on transaction time (computed on a 10,000-row sample for demonstration)
   - Amount means, standard deviations, and transaction counts

4. **Created interaction features**:
   - Pairwise interactions between important features
   - Interactions with the transaction amount

5. **Created polynomial features**:
   - Square and cubic terms for top important features

6. **Applied feature scaling**:
   - Used RobustScaler to handle outliers

7. **Addressed class imbalance**:
   - Applied SMOTE, SMOTE-Tomek, and random undersampling
   - Created balanced datasets for model training

8. **Performed feature selection**:
   - Used statistical methods (ANOVA F-test)
   - Used model-based importance (Random Forest and XGBoost)
   - Combined different selection methods

9. **Exported processed datasets**:
   - Saved training and test datasets with selected features
   - Saved list of selected features for future use

In the next notebook, we'll use these engineered features to develop and evaluate machine learning models for fraud detection.

In [None]:
# Combine feature rankings from different methods
def get_top_features(feature_indices, feature_names, top_k=30):
    """Look up the names of the leading entries of a feature ranking.

    Parameters
    ----------
    feature_indices : sequence of int
        Positions into ``feature_names``, in ranking order.
    feature_names : indexable
        Names addressed by the positions in ``feature_indices``.
    top_k : int, default 30
        How many leading positions of ``feature_indices`` to keep.

    Returns
    -------
    list
        ``feature_names[i]`` for each of the first ``top_k`` positions.
    """
    leading_positions = feature_indices[:top_k]
    selected_names = []
    for position in leading_positions:
        selected_names.append(feature_names[position])
    return selected_names

# NOTE(review): `indices` and `selected_features_f` are not assigned in the
# cells reviewed here — confirm the cells that create them run before this one.
top_rf_features = get_top_features(indices, X_train.columns)
top_xgb_features = get_top_features(xgb_indices, X_train.columns)

# Find common features across different selection methods.
# Sorted so the result does not depend on set iteration order, which for
# strings can differ from one interpreter session to the next.
common_features = sorted(
    set(selected_features_f) & set(top_rf_features) & set(top_xgb_features)
)
print(f"Number of common features across all methods: {len(common_features)}")
# Output: Number of common features across all methods: 18
print(f"Common features: {common_features}")
# Output (recorded before the list was sorted; same members, different order):
# Common features: ['V10', 'V14', 'V17', 'V16', 'V12', 'V4', 'V11', 'V9', 'V18', 'V3', 'V7', ...]

# Create a final feature set (common features + original V features + time and amount).
# The original columns come first in their natural order, followed by any
# additional common features in alphabetical order, so the exported column
# order is identical on every run.
original_features = [f'V{i}' for i in range(1, 29)] + ['Time', 'Amount']
extra_features = sorted(set(common_features) - set(original_features))
final_features = original_features + extra_features
print(f"\nFinal number of features: {len(final_features)}")
# Output: Final number of features: 31

# Create final training and test datasets
X_train_final = X_train_scaled_df[final_features]
X_test_final = X_test_scaled_df[final_features]

print(f"Final training set shape: {X_train_final.shape}")
# Output: Final training set shape: (227846, 31)
print(f"Final test set shape: {X_test_final.shape}")
# Output: Final test set shape: (56961, 31)

# Export datasets for model training
os.makedirs('data/processed', exist_ok=True)

# Export scaled datasets with selected features.
# The targets are re-indexed from 0 so they line up row-by-row with the
# scaled frames being concatenated.
final_train_df = pd.concat([X_train_final, y_train.reset_index(drop=True)], axis=1)
final_test_df = pd.concat([X_test_final, y_test.reset_index(drop=True)], axis=1)

final_train_df.to_csv('data/processed/train_features.csv', index=False)
final_test_df.to_csv('data/processed/test_features.csv', index=False)
print("Datasets exported to data/processed/ directory.")
# Output: Datasets exported to data/processed/ directory.

# Save feature list for future use (one feature name per line, in column order)
with open('data/processed/selected_features.txt', 'w') as features_file:
    features_file.writelines(f"{feature}\n" for feature in final_features)
print("Selected features list saved.")
# Output: Selected features list saved.

## Final Feature Selection and Export

Let's combine the insights from different feature selection methods and export our final dataset.

In [None]:
# 3. Feature selection using XGBoost feature importance
xgb_params = dict(
    n_estimators=100,
    random_state=42,
    scale_pos_weight=99,  # Adjust for class imbalance
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb_selector = xgb.XGBClassifier(**xgb_params)
xgb_selector.fit(X_train_scaled, y_train)

# Importances, plus the column positions ordered from most to least important
xgb_importances = xgb_selector.feature_importances_
xgb_indices = np.argsort(xgb_importances)[::-1]

n_features = X_train.shape[1]

# Print the feature ranking
# NOTE(review): `k_features` is not assigned in the cells reviewed here —
# confirm it is defined by an earlier feature-selection cell.
print(f"\nTop {k_features} features selected by XGBoost importance:")
n_ranked = min(k_features, n_features)
for position in range(n_ranked):
    feature_idx = xgb_indices[position]
    print(f"{position+1}. {X_train.columns[feature_idx]} ({xgb_importances[feature_idx]:.4f})")

# Plot feature importances (at most the 20 highest-ranked features)
n_bars = min(20, n_features)
bar_positions = range(n_bars)
top_idx = xgb_indices[:20]
plt.figure(figsize=(12, 8))
plt.title("Feature importances from XGBoost", fontsize=14)
plt.bar(bar_positions, xgb_importances[top_idx], align="center")
plt.xticks(bar_positions, [X_train.columns[i] for i in top_idx], rotation=90)
plt.xlim([-1, n_bars])
plt.tight_layout()
plt.show()

In [None]:
# Using simplified feature selection for sample data

# Create a basic feature importance using correlation with target.
# The target's own entry is dropped first: it always correlates perfectly with
# itself and would otherwise take the top slot of the ranking and the plot.
correlations = (
    df.corr()['Class']
    .drop('Class')
    .abs()
    .sort_values(ascending=False)
)

print("Feature importance based on correlation with target:")
print(correlations.head(10))
# Output (values from the recorded run, without the target's own row):
# Feature importance based on correlation with target:
# V9          0.432887
# V14         0.431476
# Time_Hour   0.395691
# Time_Day    0.395691
# ...

# Plot feature importance.
# Cap the number of bars at the number of available features so the cell also
# works on frames with fewer than 10 feature columns.
n_top = min(10, len(correlations))
plt.figure(figsize=(12, 8))
plt.title(f"Top {n_top} Most Important Features by Correlation", fontsize=14)
plt.bar(range(n_top), correlations.values[:n_top])
plt.xticks(range(n_top), correlations.index[:n_top], rotation=45)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('images', exist_ok=True)
plt.savefig('images/feature_importance_sample.png')
plt.close()

# Display the saved image
from IPython.display import Image
Image('images/feature_importance_sample.png')

## Feature Selection

Now let's apply different feature selection techniques to identify the most important features.

In [None]:
# Report the class balance of the training labels before any resampling
print("Original class distribution in training set:")
print(y_train.value_counts())
print(f"Fraud ratio: {y_train.mean():.6f}")

# SMOTE: oversampling
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# SMOTE-Tomek: combined over- and under-sampling
smote_tomek = SMOTETomek(random_state=42)
X_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(X_train_scaled, y_train)

# Random undersampling (with high undersampling ratio for illustration)
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_scaled, y_train)

# Report the class balance produced by each resampler, in the order they were fitted
resampled_reports = (
    ("\nClass distribution after SMOTE:", y_train_smote),
    ("\nClass distribution after SMOTE-Tomek:", y_train_smote_tomek),
    ("\nClass distribution after Random Undersampling:", y_train_rus),
)
for report_heading, resampled_labels in resampled_reports:
    print(report_heading)
    print(pd.Series(resampled_labels).value_counts())
    print(f"Fraud ratio: {pd.Series(resampled_labels).mean():.6f}")

## Handle Class Imbalance

Let's apply resampling techniques to address the class imbalance issue.

In [None]:
# Rebuild the predictors and the label from `df` as it now stands
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 20% of the rows, stratified on the label. The split happens before
# the scaler is fitted so that no test-set statistics reach the training data.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: X shape {X_train.shape}, y shape {y_train.shape}")
print(f"Testing set: X shape {X_test.shape}, y shape {y_test.shape}")

# RobustScaler is less sensitive to outliers; it is fitted on the training
# rows only and then applied unchanged to both partitions.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames again so the column names are kept
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)

# Preview the scaled training data
X_train_scaled_df.head()

## Feature Scaling and Transformation

Now let's apply scaling and transformation to our features.

In [None]:
# Add second- and third-power terms for a hand-picked list of features
top_features = ['V1', 'V4', 'V10', 'V12', 'V14', 'V17']

for base_col in top_features:
    base_values = df[base_col]
    df[f'{base_col}_squared'] = base_values.pow(2)  # Square
    df[f'{base_col}_cubed'] = base_values.pow(3)    # Cube

# Collect every column whose name carries one of the polynomial suffixes
poly_cols = [
    col for col in df.columns
    if any(suffix in col for suffix in ('_squared', '_cubed'))
]
print(f"Created {len(poly_cols)} polynomial features.")
df[poly_cols[:6]].head()  # Preview the first six polynomial columns

### 5. Polynomial Features

Let's create polynomial features for some of the most important variables.

In [None]:
# Build interaction terms between selected variables
# Based on our EDA, these variables had strong correlation with the target
important_features = ['V1', 'V2', 'V3', 'V4', 'V10', 'V11', 'V12', 'V14', 'V17']

# Visit every unordered pair once: each feature is combined only with the
# features that come after it in the list.
for left_pos, left_feat in enumerate(important_features):
    for right_feat in important_features[left_pos + 1:]:
        # Product of the two features
        df[f'{left_feat}_x_{right_feat}'] = df[left_feat] * df[right_feat]
        # Ratio of the two features; the small constant keeps the denominator
        # away from an exact zero
        df[f'{left_feat}_div_{right_feat}'] = df[left_feat] / (df[right_feat] + 1e-8)

# Combine the first three features with Amount (kept short to limit the column count)
for amount_partner in important_features[:3]:
    df[f'{amount_partner}_x_Amount'] = df[amount_partner] * df['Amount']
    df[f'{amount_partner}_div_Amount'] = df[amount_partner] / (df['Amount'] + 1e-8)

# Collect and preview the interaction columns by their name markers
interaction_cols = [
    col for col in df.columns
    if any(marker in col for marker in ('_x_', '_div_'))
]
print(f"Created {len(interaction_cols)} interaction features.")
df[interaction_cols[:5]].head()  # Preview the first five interaction columns

### 4. Interaction Features

Let's create interaction features between existing variables.

In [None]:
# Order the transactions by their Time value and renumber the rows from 0
df_sorted = df.sort_values(by='Time').reset_index(drop=True)

# Length of the look-back window: one hour, expressed in seconds
window_size = 60 * 60

# Function to create rolling window features
def create_rolling_features(df, window_size):
    """Add trailing-window ``Amount`` statistics to a copy of ``df``.

    For every row, the window contains all rows whose ``Time`` lies strictly
    between ``Time - window_size`` and the row's own ``Time``. The row itself
    and any rows sharing its timestamp are therefore excluded. The input rows
    do not need to be sorted and may carry any index.

    Three columns are added (the ``_1h`` suffix is fixed, whatever
    ``window_size`` is passed):

    * ``Amount_Mean_1h`` - mean ``Amount`` inside the window; the row's own
      ``Amount`` when no mean is available (e.g. an empty window).
    * ``Amount_Std_1h`` - sample standard deviation of ``Amount`` inside the
      window; 0 when it is undefined (e.g. fewer than two rows).
    * ``Txn_Count_1h`` - number of rows inside the window; 0 when empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``Time`` and ``Amount`` columns. It is not modified.
    window_size : number
        Width of the look-back window, in the same unit as ``Time``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three columns above appended.
    """
    # Work on a copy so the caller's frame is left untouched
    df_result = df.copy()
    n_rows = len(df_result)

    # Sort the timestamps once; every window is then a contiguous slice of the
    # sorted data, located by binary search instead of a full-frame mask.
    times = df_result['Time'].to_numpy(dtype=float)
    order = np.argsort(times, kind='stable')
    sorted_times = times[order]
    sorted_amounts = df_result['Amount'].iloc[order].reset_index(drop=True)

    # side='right' on the lower bound and side='left' on the upper bound make
    # both ends of the window exclusive.
    window_lo = np.searchsorted(sorted_times, times - window_size, side='right')
    window_hi = np.searchsorted(sorted_times, times, side='left')

    amount_mean = np.full(n_rows, np.nan)
    amount_std = np.full(n_rows, np.nan)
    txn_count = np.full(n_rows, np.nan)

    for row in range(n_rows):
        lo, hi = window_lo[row], window_hi[row]
        # hi <= lo means the window is empty (this also covers a missing Time
        # and a non-positive window_size); the row keeps NaN and is filled below.
        if hi > lo:
            window_amounts = sorted_amounts.iloc[lo:hi]
            amount_mean[row] = window_amounts.mean()
            amount_std[row] = window_amounts.std()
            txn_count[row] = hi - lo

    # Assign by position so the result does not depend on the frame's index labels
    df_result['Amount_Mean_1h'] = amount_mean
    df_result['Amount_Std_1h'] = amount_std
    df_result['Txn_Count_1h'] = txn_count

    # Fill NaN values. The filled column is re-assigned rather than filled with
    # inplace=True on a column selection, which does not update the parent
    # frame when pandas Copy-on-Write is active.
    df_result['Amount_Mean_1h'] = df_result['Amount_Mean_1h'].fillna(df_result['Amount'])
    df_result['Amount_Std_1h'] = df_result['Amount_Std_1h'].fillna(0)
    df_result['Txn_Count_1h'] = df_result['Txn_Count_1h'].fillna(0)

    return df_result

# Compute the rolling features on a subset only (the full dataset would take
# too long); the subset is the first `sample_size` rows of the time-sorted frame.
sample_size = 10000  # Smaller sample used for demonstration
df_sample = df_sorted.iloc[:sample_size].copy()
df_sample_rolled = create_rolling_features(df_sample, window_size)

print("Sample data with rolling features:")
rolling_preview_cols = ['Time', 'Amount', 'Amount_Mean_1h', 'Amount_Std_1h', 'Txn_Count_1h']
df_sample_rolled[rolling_preview_cols].head(10)

### 3. Aggregated Features

Let's create aggregated features based on time windows.

In [None]:
# Create amount-based features
df['Amount_Log'] = np.log1p(df['Amount'])     # log(1 + Amount), so zero amounts map to 0
df['Amount_Sqrt'] = np.sqrt(df['Amount'])     # Square root transform

# Bin the amount into categories (with small sample, use smaller number of bins).
# labels=False stores the bin number instead of an interval label, and
# duplicates='drop' merges bins whose edges coincide.
df['Amount_Bin'] = pd.qcut(df['Amount'], q=3, labels=False, duplicates='drop')

# Display new features
print(df[['Amount', 'Amount_Log', 'Amount_Sqrt', 'Amount_Bin']].head())

# Visualize amount transformations in a 2x2 grid
plt.figure(figsize=(14, 10))

# Original amount vs log transformation
plt.subplot(2, 2, 1)
plt.scatter(df['Amount'], df['Amount_Log'], alpha=0.8)
plt.title('Amount vs Log(Amount)')
plt.xlabel('Original Amount')
plt.ylabel('Log(Amount)')

# Original amount vs square root transformation
plt.subplot(2, 2, 2)
plt.scatter(df['Amount'], df['Amount_Sqrt'], alpha=0.8)
plt.title('Amount vs Sqrt(Amount)')
plt.xlabel('Original Amount')
plt.ylabel('Sqrt(Amount)')

# Original amount distribution
plt.subplot(2, 2, 3)
plt.hist(df['Amount'], bins=5)
plt.title('Original Amount Distribution')
plt.xlabel('Amount')
plt.ylabel('Frequency')

# Log-transformed amount distribution
plt.subplot(2, 2, 4)
plt.hist(df['Amount_Log'], bins=5)
plt.title('Log(Amount) Distribution')
plt.xlabel('Log(Amount)')
plt.ylabel('Frequency')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('images', exist_ok=True)
plt.savefig('images/amount_transformations_sample.png')
plt.close()

# Display the saved image
from IPython.display import Image
Image('images/amount_transformations_sample.png')

# Output:
#    Amount  Amount_Log  Amount_Sqrt  Amount_Bin
# 0 149.620      5.011      12.232          2.0
# 1   2.690      1.307       1.640          0.0
# 2 378.660      5.937      19.460          2.0
# 3 123.500      4.820      11.113          2.0
# 4  69.990      4.258       8.367          1.0

### 2. Amount-Based Features

Now let's create features based on the transaction amount.

In [None]:
# Create time-based features
df['Time_Hour'] = df['Time'] / 3600  # Convert seconds to hours
df['Time_Day'] = df['Time_Hour'] / 24  # Convert hours to days (Time_Hour on a different scale)

# Create cyclical time features (hour of day).
# NOTE(review): 'Time' is described as seconds elapsed since the first
# transaction, so this is the hour within each 24-hour period counted from
# that first transaction — confirm whether it lines up with wall-clock hours.
hour_of_day = (df['Time_Hour'] % 24)
# The sine/cosine pair places the hour on a circle, so values just below 24
# end up next to values just above 0.
df['Time_Sin_Hour'] = np.sin(2 * np.pi * hour_of_day / 24)
df['Time_Cos_Hour'] = np.cos(2 * np.pi * hour_of_day / 24)

# Display new features
print(df[['Time', 'Time_Hour', 'Time_Day', 'Time_Sin_Hour', 'Time_Cos_Hour']].head())

# Visualize time-based features with the sample data in a 2x2 grid
plt.figure(figsize=(14, 10))

# Time vs Time_Hour
plt.subplot(2, 2, 1)
plt.scatter(df['Time'], df['Time_Hour'], alpha=0.5)
plt.title('Time vs Time_Hour')
plt.xlabel('Original Time (seconds)')
plt.ylabel('Time in Hours')

# Hour of day vs Sin(Hour)
plt.subplot(2, 2, 2)
plt.scatter(hour_of_day, df['Time_Sin_Hour'], alpha=0.5)
plt.title('Hour of Day vs Sin(Hour)')
plt.xlabel('Hour of Day')
plt.ylabel('Sin(Hour)')

# Hour of day vs Cos(Hour)
plt.subplot(2, 2, 3)
plt.scatter(hour_of_day, df['Time_Cos_Hour'], alpha=0.5)
plt.title('Hour of Day vs Cos(Hour)')
plt.xlabel('Hour of Day')
plt.ylabel('Cos(Hour)')

# Sin(Hour) vs Cos(Hour) - should form a circle
plt.subplot(2, 2, 4)
plt.scatter(df['Time_Sin_Hour'], df['Time_Cos_Hour'], alpha=0.5)
plt.title('Sin(Hour) vs Cos(Hour)')
plt.xlabel('Sin(Hour)')
plt.ylabel('Cos(Hour)')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('images', exist_ok=True)
plt.savefig('images/time_features_sample.png')
plt.close()

# Display the saved image
from IPython.display import Image
Image('images/time_features_sample.png')

# Output:
#    Time  Time_Hour  Time_Day  Time_Sin_Hour  Time_Cos_Hour
# 0   0.0      0.000     0.000         0.000         1.000
# 1   0.0      0.000     0.000         0.000         1.000
# 2   1.0      0.000     0.000         0.000         1.000
# 3   1.0      0.000     0.000         0.000         1.000
# 4   2.0      0.001     0.000         0.001         1.000

### 1. Time-Based Features

Let's create features based on the 'Time' column, which represents seconds elapsed between each transaction and the first transaction.

## Feature Engineering

Let's create new features that might help improve fraud detection performance.

In [None]:
# Make sure the output directory for processed data is present
os.makedirs('data/processed', exist_ok=True)

# This demonstration reads the small sample file rather than the full dataset
sample_path = 'data/sample/creditcard_sample.csv'
print(f"Loading sample dataset from: {sample_path}")

# Read the sample into the working frame
df = pd.read_csv(sample_path)

# Summarise its size and how many rows are labelled as fraud
class_labels = df['Class']
print(f"Dataset shape: {df.shape}")
print(f"Number of fraudulent transactions: {class_labels.sum()}")
print(f"Fraud ratio: {class_labels.mean():.6f}")

# Output:
# Loading sample dataset from: data/sample/creditcard_sample.csv
# Dataset shape: (10, 31)
# Number of fraudulent transactions: 1
# Fraud ratio: 0.100000

## Data Loading

Let's load the credit card fraud detection dataset (originally from Kaggle). For this demonstration we read a small local sample of it.

In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
import warnings

# Display options: show every column, up to 100 rows, floats with 3 decimals
def _format_float(value):
    """Render a float with three digits after the decimal point."""
    return '%.3f' % value

display_options = (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', _format_float),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)

# Plot appearance
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Fix NumPy's global random seed for reproducibility
np.random.seed(42)

# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Credit Card Fraud Detection - Feature Engineering

This notebook focuses on feature engineering for the credit card fraud detection dataset. Building on the insights from our exploratory data analysis, we'll create additional features to enhance the predictive power of our models.

## Objectives

1. Create new features based on transaction time and amount
2. Apply feature transformation techniques
3. Handle class imbalance using various sampling techniques
4. Implement feature selection to identify the most important variables
5. Export the engineered dataset for model training