# Feature Engineering for Fraud Detection
This notebook loads a row-limited sample of the `train_transaction` and `train_identity` datasets, merges them on `TransactionID`, converts categorical columns, fills missing values, generates simple features, and saves the result to CSV.

In [None]:
import pandas as pd
import numpy as np

# Load the datasets.
# N_ROWS caps the transaction table only; it is the left side of the merge
# below and therefore decides how many rows `df` ends up with.
N_ROWS = 10000
train_transaction = pd.read_csv('../data/raw/train_transaction.csv', nrows=N_ROWS)

# The identity table is NOT truncated by row count: cutting both files to their
# first N rows independently only lines up if the two files happen to be ordered
# the same way, and any identity row that falls outside the cut would silently
# show up as missing values after the merge. Read it in full and keep just the
# rows whose TransactionID was actually loaded above.
train_identity = pd.read_csv('../data/raw/train_identity.csv')
loaded_ids = train_transaction['TransactionID']
train_identity = train_identity[
    train_identity['TransactionID'].isin(loaded_ids)
].reset_index(drop=True)

# Merge datasets on TransactionID. A left join keeps every loaded transaction,
# including those with no identity record (their identity columns become NaN).
df = train_transaction.merge(train_identity, on='TransactionID', how='left')

print(f'Merged dataset shape: {df.shape}')

## Handling Categorical Features

In [None]:
# Names of the columns this notebook treats as categorical. The numbered
# families (card1-card6, M1-M9, id_12-id_38) are generated rather than typed out.
cat_features = [
    'ProductCD',
    *(f'card{i}' for i in range(1, 7)),
    'addr1', 'addr2',
    'P_emaildomain', 'R_emaildomain',
    *(f'M{i}' for i in range(1, 10)),
    'DeviceType', 'DeviceInfo',
    *(f'id_{i}' for i in range(12, 39)),
]

# Cast to pandas' category dtype in a single astype call. Names that are not
# columns of df are skipped, so a frame missing some of them does not fail here.
category_dtypes = {name: 'category' for name in cat_features if name in df.columns}
df = df.astype(category_dtypes)

print('Categorical features converted to category type.')

## Handling Missing Values

In [None]:
# Fill missing categorical values with 'missing'.
# Only the categorical names that are actually columns of df are touched;
# indexing df with the full cat_features list would raise KeyError as soon as
# one of the listed columns is absent from the frame.
present_cat_features = [col for col in cat_features if col in df.columns]

for col in present_cat_features:
    column = df[col]
    # A category-typed column can only be filled with a value that is already
    # one of its categories, so register 'missing' first. add_categories raises
    # ValueError when the label already exists, hence the membership check,
    # which keeps this cell safe to re-run.
    if column.dtype.name == 'category' and 'missing' not in column.cat.categories:
        column = column.cat.add_categories(['missing'])
    df[col] = column.fillna('missing')

# Fill missing numerical values with -999 (a placeholder for missing values).
# select_dtypes picks up every numeric column in df, whatever its role;
# category-typed columns are not numeric and are left alone here.
num_features = df.select_dtypes(include=['number']).columns.tolist()
df[num_features] = df[num_features].fillna(-999)

print('Missing values handled.')

## Feature Engineering

In [None]:
# Derive two simple features from the raw transaction columns.
# NOTE(review): assumes TransactionDT is an offset measured in seconds - confirm.
seconds_per_day = 24 * 3600

# log(1 + amount).
# NOTE(review): the missing-value cell writes -999 into numeric gaps; log1p of
# such a value is NaN, so this presumes TransactionAmt had no gaps - verify.
amount_log = np.log1p(df['TransactionAmt'])

# Whole days elapsed, folded onto a 0-6 cycle.
day_in_week_cycle = (df['TransactionDT'] // seconds_per_day) % 7

# copy() is the original "defragment" step; assign then returns a new frame
# with both columns appended, in this order.
df = df.copy().assign(
    TransactionAmt_log=amount_log,
    TransactionDay=day_in_week_cycle,
)

print('Simple features created.')

## Save Processed Dataset

In [None]:
# Write the processed frame to a CSV in the current working directory.
# index=False keeps the row index out of the file.
# NOTE(review): CSV stores plain text, so the category dtypes set earlier will
# presumably need to be re-applied when this file is loaded again - confirm.
output_path = 'processed_train.csv'
df.to_csv(output_path, index=False)
print('Processed dataset saved.')