In [1]:
import pandas as pd
import numpy as np
import sys
import os
from IPython.display import display

# Resolve the project root (the parent of the notebook's working directory)
# so that the `src` package can be imported from this notebook.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Report an addition only when one actually happened; re-running the cell
# (or starting with the root already on the path) must not claim it was added.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added '{project_root}' to sys.path.")
else:
    print(f"'{project_root}' is already on sys.path.")

# Load the pipeline factory and each individual transformer exercised below.
try:
    from src.feature_engineering import (
        create_feature_engineering_pipeline,
        FeatureExtractor,
        RFMCalculator,
        AggregateFeatures,
        MissingValueHandler,
        CustomEncoder,
        FeatureScaler
    )
except ImportError as import_error:
    # Spell out the likely causes, then propagate so the notebook stops here.
    for hint in (
        f"Error importing feature_engineering.py: {import_error}",
        "Please ensure 'src/feature_engineering.py' exists and is correctly defined.",
        "Also, check that the project root (containing 'src') is added to sys.path.",
    ):
        print(hint)
    raise
else:
    print("Successfully imported feature_engineering.py components.")

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print("\nSetup complete.")

DATA_PATH = '../data/raw/data.csv' # Path to your raw data file

# --- Load Data ---
# Every test below reads from `df_raw`, so a missing file must stop the
# notebook here rather than surface later as a NameError on `df_raw`.
print("--- Loading Raw Data ---")
try:
    df_raw = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: {DATA_PATH} not found.")
    print("Please ensure 'data.csv' is located in the '../data/raw/' directory relative to this notebook.")
    raise # Re-raise to halt execution: nothing downstream can run without the data
else:
    # Quick orientation: size, a sample of rows, and per-column dtypes/null counts.
    print(f"Data loaded successfully from '{DATA_PATH}'")
    print(f"Raw data shape: {df_raw.shape}")
    print("\nRaw data head:")
    display(df_raw.head())
    print("\nRaw data info:")
    df_raw.info()

print("\n--- Testing RFMCalculator ---")
# Run the transformer on an untouched copy of the raw frame so that
# 'TransactionStartTime' is still available to it.
rfm_calc = RFMCalculator()
df_rfm = rfm_calc.fit_transform(df_raw.copy())

print("After RFMCalculator (first 5 rows):")
display(df_rfm.head())

# Report which of the expected RFM columns were actually produced.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
rfm_columns_present = [name for name in rfm_columns if name in df_rfm.columns]
print(f"RFM columns added: {rfm_columns_present}")

print("\nRFM features summary statistics:")
display(df_rfm[rfm_columns].describe())
# Section header, consistent with the other per-transformer tests in this notebook.
print("\n--- Testing FeatureExtractor ---")
# Use a fresh copy of df_raw for this individual test, as FeatureExtractor expects 'TransactionStartTime'
extractor = FeatureExtractor()
df_extracted = extractor.fit_transform(df_raw.copy())
print("After FeatureExtractor (first 5 rows):")
display(df_extracted.head())

# Any remaining 'Transaction*' column other than the identifier is treated as a
# newly derived time feature for reporting purposes.
time_feature_cols = [col for col in df_extracted.columns if 'Transaction' in col and col != 'TransactionId']
print(f"New time-based columns: {time_feature_cols}")
print(f"Original 'TransactionStartTime' column removed: {'TransactionStartTime' not in df_extracted.columns}")
# Cell 5: Test AggregateFeatures
print("\n--- Testing AggregateFeatures ---")
# Build an independent frame whose 'Amount' column is coerced to numeric
# (unparseable entries become NaN) before it is handed to the aggregator.
df_agg_test_copy = df_raw.assign(Amount=pd.to_numeric(df_raw['Amount'], errors='coerce'))

aggregator = AggregateFeatures()
df_aggregated = aggregator.fit_transform(df_agg_test_copy)

print("After AggregateFeatures (first 5 rows):")
display(df_aggregated.head())

# Columns whose names contain one of these markers are reported as aggregates.
aggregate_name_markers = ('TransactionAmount', 'TransactionCount')
aggregate_cols_found = [
    name
    for name in df_aggregated.columns
    if any(marker in name for marker in aggregate_name_markers)
]
print(f"Aggregate columns added: {aggregate_cols_found}")

print("\nAggregate features summary statistics:")
aggregate_summary_cols = [
    'TotalTransactionAmount',
    'AverageTransactionAmount',
    'MinTransactionAmount',
    'MaxTransactionAmount',
    'TransactionCount',
    'StdDevTransactionAmount',
]
display(df_aggregated[aggregate_summary_cols].describe())

# Section header, consistent with the other per-transformer tests in this notebook.
print("\n--- Testing MissingValueHandler ---")
df_with_nans = df_raw.copy()
# NaN cannot be stored in an integer column. Cast the numeric test columns to
# float explicitly instead of relying on pandas' implicit upcast during
# assignment, which recent pandas releases deprecate (the blanket warning
# filter above would otherwise hide that deprecation).
df_with_nans = df_with_nans.astype({'Amount': 'float64', 'Value': 'float64'})
# Introduce NaN in 'Amount' and 'Value' for a couple of rows
df_with_nans.loc[[1, 5], ['Amount', 'Value']] = np.nan
# Introduce NaN in a categorical column for testing
df_with_nans.loc[[2, 6], 'ProductCategory'] = np.nan
df_with_nans.loc[[3], 'ChannelId'] = np.nan

# Columns that received synthetic NaNs; checked before and after imputation.
nan_check_cols = ['Amount', 'Value', 'ProductCategory', 'ChannelId']

print("Data before MissingValueHandler (showing NaNs counts):")
print(df_with_nans[nan_check_cols].isnull().sum())
print("\nBefore imputation - Sample rows with NaNs:")
display(df_with_nans[df_with_nans['Amount'].isnull() | df_with_nans['ProductCategory'].isnull()].head())

imputer = MissingValueHandler(strategy='mean')
df_imputed = imputer.fit_transform(df_with_nans.copy()) # Use a fresh copy for this test

print("\nData after MissingValueHandler (showing NaNs counts):")
print(df_imputed[nan_check_cols].isnull().sum())
print("\nAfter imputation - Sample rows (should be filled):")
# Look the rows up by TransactionId (one numeric-NaN row, one categorical-NaN row)
# rather than by position, in case the handler does not preserve the index.
display(df_imputed[df_imputed['TransactionId'].isin([df_with_nans.loc[1, 'TransactionId'], df_with_nans.loc[2, 'TransactionId']])])

print("\nValue counts for ProductCategory after imputation (should show no NaNs, most frequent used):")
print(df_imputed['ProductCategory'].value_counts(dropna=False))
print("\n--- Testing CustomEncoder (One-Hot Encoding) ---")
# Candidate categorical columns for one-hot encoding.
categorical_cols_onehot = ['ProductCategory', 'ChannelId', 'CurrencyCode']

# No imputer runs in this standalone test, so replace any missing category
# with an explicit 'Unknown' label on a copy of the raw data.
df_for_encoding = df_raw.copy()
for col in categorical_cols_onehot:
    df_for_encoding[col] = df_for_encoding[col].fillna('Unknown')

# Keep only the candidates that exist in the frame and are stored as object dtype.
cols_to_encode_existing = [
    name
    for name in categorical_cols_onehot
    if name in df_for_encoding.columns and df_for_encoding[name].dtype == 'object'
]

encoder_onehot = CustomEncoder(method='onehot', columns=cols_to_encode_existing)
df_encoded_onehot = encoder_onehot.fit_transform(df_for_encoding.copy())

print(f"Original categorical columns being encoded: {cols_to_encode_existing}")
print(f"Shape before encoding: {df_for_encoding.shape}")
print(f"Shape after One-Hot Encoding: {df_encoded_onehot.shape}")
print("\nFirst 5 rows with new one-hot encoded columns:")
display(df_encoded_onehot.head())

print("\nPresence of original and new columns:")
for col in cols_to_encode_existing:
    print(f"Is original '{col}' column present? {col in df_encoded_onehot.columns}")
    # Dummy columns are recognised by the '<original name>_' prefix; show up to three.
    dummy_prefix = f"{col}_"
    example_dummy_cols = [c for c in df_encoded_onehot.columns if c.startswith(dummy_prefix)][:3]
    if not example_dummy_cols:
        print(f"No new dummy columns found for '{col}'. Check if it was encoded.")
    else:
        print(f"Example new dummy columns for '{col}': {example_dummy_cols}")
        
print("\n--- Testing CustomEncoder (Label Encoding) ---")
# Example columns for label encoding.
categorical_cols_label = ['ProductId', 'ProductCategory']

# Start again from the raw data; missing product categories get an explicit
# 'Unknown' label before encoding.
df_for_label_encoding = df_raw.copy()
df_for_label_encoding['ProductCategory'] = df_for_label_encoding['ProductCategory'].fillna('Unknown')

encoder_label = CustomEncoder(method='label', columns=categorical_cols_label)
df_encoded_label = encoder_label.fit_transform(df_for_label_encoding.copy())

print(f"Original categorical columns being encoded: {categorical_cols_label}")
print("\nFirst 5 rows with label encoded columns:")
display(df_encoded_label[categorical_cols_label].head())

# Show the distinct codes produced for each encoded column.
for encoded_col in categorical_cols_label:
    print(f"\nUnique encoded values for '{encoded_col}':")
    print(df_encoded_label[encoded_col].unique())

# Compare the column dtype before and after encoding.
product_id_dtype_before = df_for_label_encoding['ProductId'].dtype
product_id_dtype_after = df_encoded_label['ProductId'].dtype
print(f"Original 'ProductId' dtype: {product_id_dtype_before}, Encoded 'ProductId' dtype: {product_id_dtype_after}")
print("\n--- Testing FeatureScaler (StandardScaler) ---")
# The scaler needs numerical input, so derive a throwaway frame by running the
# upstream transformers in order: RFM, then time-feature extraction, then aggregates.
temp_df_for_scaling = df_raw.copy()
for transformer_cls in (RFMCalculator, FeatureExtractor, AggregateFeatures):
    temp_df_for_scaling = transformer_cls().fit_transform(temp_df_for_scaling)

# Impute before scaling; the column groups are discovered from the dtypes.
numerical_cols = temp_df_for_scaling.select_dtypes(include=np.number).columns.tolist()
categorical_cols = temp_df_for_scaling.select_dtypes(include='object').columns.tolist()
imputer_for_scaler_test = MissingValueHandler(numerical_cols=numerical_cols, categorical_cols=categorical_cols)
temp_df_for_scaling = imputer_for_scaler_test.fit_transform(temp_df_for_scaling)

# Identifier-like columns that must not be scaled even when they are numeric.
# Some may already be absent from the frame; listing them is harmless.
ids_to_exclude = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
                  'CountryCode', 'PricingStrategy', 'FraudResult', 'ProviderId']
numerical_cols_to_scale = [
    name
    for name in temp_df_for_scaling.select_dtypes(include=np.number).columns.tolist()
    if name not in ids_to_exclude
]

print(f"Numerical columns selected for scaling: {numerical_cols_to_scale}")

if numerical_cols_to_scale:
    scaler = FeatureScaler(method='standard', columns=numerical_cols_to_scale)
    df_scaled = scaler.fit_transform(temp_df_for_scaling.copy())

    print("\nAfter FeatureScaler (first 5 rows of scaled numerical columns):")
    display(df_scaled[numerical_cols_to_scale].head())

    print("\nMean and Standard Deviation of scaled columns (should be ~0 and ~1):")
    display(df_scaled[numerical_cols_to_scale].describe().loc[['mean', 'std']])
else:
    print("No suitable numerical columns found for scaling after previous steps. Skipping scaler test.")
print("\n--- Testing Full Feature Engineering Pipeline ---")

# Re-read the raw file so the end-to-end run starts from data straight off disk.
df_raw = pd.read_csv(DATA_PATH)

# Build the complete pipeline with mean imputation and one-hot encoding.
full_pipeline = create_feature_engineering_pipeline(
    numerical_imputation_strategy='mean',
    categorical_encoding_method='onehot'
)

# Fit and apply the pipeline on a copy, leaving `df_raw` itself untouched.
print(f"Shape of raw data: {df_raw.shape}")
df_processed_full = full_pipeline.fit_transform(df_raw.copy())

print("\n--- Full Processed Data Overview ---")
print(f"Shape of fully processed data: {df_processed_full.shape}")
print("\nFirst 5 rows of fully processed data:")
display(df_processed_full.head())
print("\nInformation about fully processed columns:")
df_processed_full.info()

print("\n--- Final Check for Missing Values in Fully Processed Data ---")
# Keep only the columns that still contain at least one null value.
null_counts = df_processed_full.isnull().sum()
final_missing_values = null_counts[null_counts > 0]
print(final_missing_values)
if not final_missing_values.empty:
    print("Warning: Missing values still present after full pipeline. Investigate columns listed above.")
else:
    print("No missing values found in the final processed data.")

print("\n--- Feature Engineering Pipeline Testing Completed ---")

Added 'D:\10academy\week5\Credit_Risk_Probability_Model' to sys.path.
Successfully imported feature_engineering.py components.

Setup complete.
--- Loading Raw Data ---
Data loaded successfully from '../data/raw/data.csv'
Raw data shape: (95662, 16)

Raw data head:


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0



Raw data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95662 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  Fra

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,Frequency,Monetary,Recency
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,66,156884.0,6
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,30893,-27750277.5,1
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,2,1000.0,82
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,26,251000.0,6
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,30893,-27750277.5,1


RFM columns added: ['Recency', 'Frequency', 'Monetary']

RFM features summary statistics:


Unnamed: 0,Recency,Frequency,Monetary
count,95662.0,95662.0,95662.0
mean,9.428341,10301.076519,-13617000.0
std,16.891222,14252.105087,25879350.0
min,1.0,1.0,-112561900.0
25%,1.0,39.0,-27750280.0
50%,1.0,211.0,50000.0
75%,8.0,30893.0,339500.0
max,91.0,30893.0,83466000.0


After FeatureExtractor (first 5 rows):


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2,0,2,15,11,2018
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2,0,2,15,11,2018
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2,0,2,15,11,2018
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2,0,3,15,11,2018
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2,0,3,15,11,2018


New time-based columns: ['TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear']
Original 'TransactionStartTime' column removed: True

--- Testing AggregateFeatures ---
After AggregateFeatures (first 5 rows):


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,Value,TransactionStartTime,PricingStrategy,FraudResult,TotalTransactionAmount,AverageTransactionAmount,MinTransactionAmount,MaxTransactionAmount,TransactionCount,StdDevTransactionAmount
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,1000,2018-11-15T02:18:49Z,2,0,156884.0,2377.030303,32.0,20000.0,66,3146.231284
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,20,2018-11-15T02:19:08Z,2,0,-27750277.5,-898.270725,-25000.0,50.0,30893,1845.812752
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,500,2018-11-15T02:44:21Z,2,0,1000.0,500.0,500.0,500.0,2,0.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,21800,2018-11-15T03:32:55Z,2,0,251000.0,9653.846154,500.0,100000.0,26,19707.241933
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,644,2018-11-15T03:34:21Z,2,0,-27750277.5,-898.270725,-25000.0,50.0,30893,1845.812752


Aggregate columns added: ['TotalTransactionAmount', 'AverageTransactionAmount', 'MinTransactionAmount', 'MaxTransactionAmount', 'TransactionCount', 'StdDevTransactionAmount']

Aggregate features summary statistics:


Unnamed: 0,TotalTransactionAmount,AverageTransactionAmount,MinTransactionAmount,MaxTransactionAmount,TransactionCount,StdDevTransactionAmount
count,95662.0,95662.0,95662.0,95662.0,95662.0,95662.0
mean,-13617000.0,6717.846,-53846.52,86926.8,10301.076519,15994.13
std,25879350.0,86656.57,210378.7,484165.0,14252.105087,88499.77
min,-112561900.0,-157142.9,-1000000.0,-100000.0,1.0,0.0
25%,-27750280.0,-898.2707,-25000.0,50.0,39.0,1845.813
50%,50000.0,2141.176,500.0,10000.0,211.0,2681.519
75%,339500.0,5432.3,500.0,30000.0,30893.0,7458.485
max,83466000.0,8601821.0,2000000.0,9880000.0,30893.0,3309916.0


Data before MissingValueHandler (showing NaNs counts):
Amount             2
Value              2
ProductCategory    2
ChannelId          1
dtype: int64

Before imputation - Sample rows with NaNs:


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,,,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,,ChannelId_3,500.0,500.0,2018-11-15T02:44:21Z,2,0
5,TransactionId_23223,BatchId_25954,AccountId_1078,SubscriptionId_4238,CustomerId_1432,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,,,2018-11-15T03:35:10Z,2,0
6,TransactionId_118063,BatchId_118460,AccountId_2442,SubscriptionId_1980,CustomerId_2858,UGX,256,ProviderId_5,ProductId_3,,ChannelId_3,10000.0,10000.0,2018-11-15T03:44:31Z,4,0



Data after MissingValueHandler (showing NaNs counts):
Amount             0
Value              0
ProductCategory    0
ChannelId          0
dtype: int64

After imputation - Sample rows (should be filled):


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,6717.966188,9900.76982,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,financial_services,ChannelId_3,500.0,500.0,2018-11-15T02:44:21Z,2,0



Value counts for ProductCategory after imputation (should show no NaNs, most frequent used):
ProductCategory
financial_services    45407
airtime               45025
utility_bill           1920
data_bundles           1613
tv                     1279
ticket                  216
movies                  175
transport                25
other                     2
Name: count, dtype: int64

--- Testing CustomEncoder (One-Hot Encoding) ---
Original categorical columns being encoded: ['ProductCategory', 'ChannelId', 'CurrencyCode']
Shape before encoding: (95662, 16)
Shape after One-Hot Encoding: (95662, 27)

First 5 rows with new one-hot encoded columns:


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CountryCode,ProviderId,ProductId,Amount,Value,...,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,CurrencyCode_UGX
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,256,ProviderId_6,ProductId_10,1000.0,1000,...,False,False,False,False,False,False,False,True,False,True
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,256,ProviderId_4,ProductId_6,-20.0,20,...,False,False,False,False,False,False,True,False,False,True
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,256,ProviderId_6,ProductId_1,500.0,500,...,False,False,False,False,False,False,False,True,False,True
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,256,ProviderId_1,ProductId_21,20000.0,21800,...,False,False,False,False,True,False,False,True,False,True
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,256,ProviderId_4,ProductId_6,-644.0,644,...,False,False,False,False,False,False,True,False,False,True



Presence of original and new columns:
Is original 'ProductCategory' column present? False
Example new dummy columns for 'ProductCategory': ['ProductCategory_airtime', 'ProductCategory_data_bundles', 'ProductCategory_financial_services']
Is original 'ChannelId' column present? False
Example new dummy columns for 'ChannelId': ['ChannelId_ChannelId_1', 'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3']
Is original 'CurrencyCode' column present? False
Example new dummy columns for 'CurrencyCode': ['CurrencyCode_UGX']

--- Testing CustomEncoder (Label Encoding) ---
Original categorical columns being encoded: ['ProductId', 'ProductCategory']

First 5 rows with label encoded columns:


Unnamed: 0,ProductId,ProductCategory
0,1,0
1,19,2
2,0,0
3,11,8
4,19,2



Unique encoded values for 'ProductId':
[ 1 19  0 11 16  6  2  8 17 18 10 22 14  5  9  4 12 21 20 15  3  7 13]

Unique encoded values for 'ProductCategory':
[0 2 8 1 7 6 5 3 4]
Original 'ProductId' dtype: object, Encoded 'ProductId' dtype: int64

--- Testing FeatureScaler (StandardScaler) ---
Numerical columns selected for scaling: ['Amount', 'Value', 'Frequency', 'Monetary', 'Recency', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear', 'TotalTransactionAmount', 'AverageTransactionAmount', 'MinTransactionAmount', 'MaxTransactionAmount', 'TransactionCount', 'StdDevTransactionAmount']

After FeatureScaler (first 5 rows of scaled numerical columns):


Unnamed: 0,Amount,Value,Frequency,Monetary,Recency,TransactionHour,TransactionDay,TransactionMonth,TransactionYear,TotalTransactionAmount,AverageTransactionAmount,MinTransactionAmount,MaxTransactionAmount,TransactionCount,StdDevTransactionAmount
0,-0.046371,-0.072291,-0.718149,0.532237,-0.202967,-2.15553,-0.100739,0.848684,-0.994246,0.532237,-0.050092,0.256104,-0.138232,-0.718149,-0.145175
1,-0.054643,-0.080251,1.444841,-0.546125,-0.49898,-2.15553,-0.100739,0.848684,-0.994246,-0.546125,-0.087889,0.137118,-0.179437,1.444841,-0.159869
2,-0.050426,-0.076352,-0.722639,0.526214,4.296435,-2.15553,-0.100739,0.848684,-0.994246,0.526214,-0.071753,0.258328,-0.178508,-0.722639,-0.180726
3,0.107717,0.096648,-0.720955,0.535874,-0.202967,-1.949214,-0.100739,0.848684,-0.994246,0.535874,0.033881,0.258328,0.027002,-0.720955,0.041956
4,-0.059704,-0.075183,1.444841,-0.546125,-0.49898,-1.949214,-0.100739,0.848684,-0.994246,-0.546125,-0.087889,0.137118,-0.179437,1.444841,-0.159869



Mean and Standard Deviation of scaled columns (should be ~0 and ~1):


Unnamed: 0,Amount,Value,Frequency,Monetary,Recency,TransactionHour,TransactionDay,TransactionMonth,TransactionYear,TotalTransactionAmount,AverageTransactionAmount,MinTransactionAmount,MaxTransactionAmount,TransactionCount,StdDevTransactionAmount
mean,-3.565266e-18,-3.565266e-18,0.0,4.7536890000000003e-17,-1.901475e-17,-4.9913730000000007e-17,0.0,7.605902e-17,-3.650833e-14,4.7536890000000003e-17,-5.942111e-18,3.802951e-17,1.901475e-17,0.0,2.376844e-18
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005



--- Testing Full Feature Engineering Pipeline ---
Shape of raw data: (95662, 16)
Converting column 'CountryCode' to object dtype for encoding.

--- Full Processed Data Overview ---
Shape of fully processed data: (95662, 39)

First 5 rows of fully processed data:


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,Amount,Value,PricingStrategy,...,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,ProviderId_6,ProductId_10,-0.046371,-0.072291,-0.349252,...,False,False,False,False,False,False,False,False,True,False
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,ProviderId_4,ProductId_6,-0.054643,-0.080251,-0.349252,...,False,False,False,False,False,False,False,True,False,False
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,ProviderId_6,ProductId_1,-0.050426,-0.076352,-0.349252,...,False,False,False,False,False,False,False,False,True,False
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,ProviderId_1,ProductId_21,0.107717,0.096648,-0.349252,...,False,False,False,False,False,True,False,False,True,False
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,ProviderId_4,ProductId_6,-0.059704,-0.075183,-0.349252,...,False,False,False,False,False,False,False,True,False,False



Information about fully processed columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 39 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   TransactionId                       95662 non-null  object 
 1   BatchId                             95662 non-null  object 
 2   AccountId                           95662 non-null  object 
 3   SubscriptionId                      95662 non-null  object 
 4   CustomerId                          95662 non-null  object 
 5   ProviderId                          95662 non-null  object 
 6   ProductId                           95662 non-null  object 
 7   Amount                              95662 non-null  float64
 8   Value                               95662 non-null  float64
 9   PricingStrategy                     95662 non-null  float64
 10  FraudResult                         95662 non-null  float64
 1