### Task 4 - Proxy Target Variable Engineering

In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
%reload_ext autoreload

In [10]:
# Standard library
import logging
import os
import sys
import warnings

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Root logger: INFO and above, rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [13]:
# Make the project's ../src/ directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
src_dir = os.path.abspath('../src/')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [14]:
from data_preprocessing import data_loader
from data_preprocessing_FE_Proxy import process_data_with_proxy_target

#### Perform Proxy Target Variable Engineering

In [15]:
# Load the raw CSV through the project's loader, then log a short summary
# (confirmation, shape, column names) of the resulting frame.
raw_data_path = '../data/raw/data.csv'
df = data_loader(raw_data_path)

for message in (
    "Data loaded successfully",
    f"Input data shape: {df.shape}",
    f"Input data columns: {list(df.columns)}",
):
    logging.info(message)

2025-07-06 15:05:58,621 - INFO - CSV file loaded successfully from ../data/raw/data.csv.
2025-07-06 15:05:58,626 - INFO - Data loaded successfully
2025-07-06 15:05:58,631 - INFO - Input data shape: (95662, 16)
2025-07-06 15:05:58,640 - INFO - Input data columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']


In [16]:
# Define columns
# Columns passed to the pipeline as `numerical_columns`.
numerical_columns = ['Amount', 'Value']
# Columns passed to the pipeline as `categorical_columns`.
categorical_columns = ['ProductCategory', 'ChannelId', 'ProviderId']
# Customer identifier column; adjust if the dataset names it differently (e.g., 'customer_id').
customer_id_col = 'CustomerId'

# Run the project's proxy-target pipeline on the loaded frame. The call returns
# four values, unpacked here in order.
# NOTE(review): presumably `X_processed` is the processed feature table, `y`
# the original fraud label, `y_proxy` the engineered proxy target and
# `feature_names` the output column names — confirm against
# data_preprocessing_FE_Proxy.process_data_with_proxy_target.
X_processed, y, y_proxy, feature_names = process_data_with_proxy_target(
    df,
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
    customer_id_col=customer_id_col
    )

2025-07-06 15:06:16,164 - INFO - Starting data processing with proxy target engineering
2025-07-06 15:06:16,173 - INFO - Starting data processing
2025-07-06 15:06:16,457 - INFO - Creating data processing pipeline
2025-07-06 15:06:16,471 - INFO - Extracting time-based features
2025-07-06 15:06:18,753 - INFO - Columns after time feature extraction: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear']
2025-07-06 15:06:19,005 - INFO - Aggregating features by CustomerId
2025-07-06 15:06:19,531 - INFO - Columns after aggregation: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'PricingStrategy', 'TransactionHour', 

In [17]:
# Write the processed data and both targets to CSV files (relative paths, so
# they land in the current working directory), without the index column.
csv_outputs = {
    'processed_data_with_proxy.csv': X_processed,
    'target_fraud.csv': y,
    'target_proxy.csv': y_proxy,
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)
logging.info("Processed data and targets saved")

2025-07-06 15:07:36,622 - INFO - Processed data and targets saved


In [18]:
# Print the feature list, the first rows of the processed data, and the
# value counts of the proxy target. sep="\n" puts each header on its own line.
print("Feature names:", feature_names)
print("Processed data preview:", X_processed.head(), sep="\n")
print("Proxy target distribution:", y_proxy.value_counts(), sep="\n")

Feature names: ['Amount', 'Value', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear', 'Amount_TotalAmount', 'Amount_AvgAmount', 'Amount_TransactionCount', 'Amount_StdAmount', 'ProductCategory_data_bundles', 'ProductCategory_financial_services', 'ProductCategory_movies', 'ProductCategory_other', 'ProductCategory_ticket', 'ProductCategory_transport', 'ProductCategory_tv', 'ProductCategory_utility_bill', 'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3', 'ChannelId_ChannelId_5', 'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6', 'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProductId', 'PricingStrategy', 'is_high_risk']
Processed data preview:
     Amount     Value TransactionHour TransactionDay TransactionMonth  \
0 -0.139857 -0.072291        -2.15553      -0.100739         0.848684   
1 -0.457536 -0.080251        -2.15553 