# In [None]:
import pandas as pd
import numpy as np
import sys
import os
from IPython.display import display

# Make the parent of the working directory importable so that the `src`
# package resolves when this notebook is launched from a sub-folder.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))

if project_root in sys.path:
    # Re-running this cell must not claim the entry was added a second time.
    print(f"'{project_root}' is already on sys.path.")
else:
    # Insert at the front so this checkout wins over any other `src` package.
    sys.path.insert(0, project_root)
    print(f"Added '{project_root}' to sys.path.")

# Bring in the pipeline factory together with each individual transformer,
# since the cells below exercise the transformers one at a time and then
# the assembled pipeline.
try:
    from src.feature_engineering import (
        create_feature_engineering_pipeline,
        FeatureExtractor,
        RFMCalculator,
        AggregateFeatures,
        MissingValueHandler,
        CustomEncoder,
        FeatureScaler,
    )
except ImportError as import_error:
    # Explain the two usual causes, then propagate so nothing below runs
    # without the transformers being available.
    print(f"Error importing feature_engineering.py: {import_error}")
    print("Please ensure 'src/feature_engineering.py' exists and is correctly defined.")
    print("Also, check that the project root (containing 'src') is added to sys.path.")
    raise
else:
    print("Successfully imported feature_engineering.py components.")

# Hide every warning category for the rest of the session to keep the
# notebook output readable.
import warnings
warnings.simplefilter('ignore', category=Warning)

print("\nSetup complete.")

# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = '../data/raw/data.csv'

# --- Load Data ---
print("--- Loading Raw Data ---")
try:
    # Only the read itself is guarded; the reporting below runs on success.
    df_raw = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: {DATA_PATH} not found.")
    print("Please ensure 'data.csv' is located in the '../data/raw/' directory relative to this notebook.")
    # Every later cell needs df_raw, so stop here instead of letting the run
    # continue into a misleading NameError (same policy as the import cell).
    raise
else:
    print(f"Data loaded successfully from '{DATA_PATH}'")
    print(f"Raw data shape: {df_raw.shape}")
    print("\nRaw data head:")
    display(df_raw.head())
    print("\nRaw data info:")
    df_raw.info()

print("\n--- Testing RFMCalculator ---")
# Fit on a copy so df_raw stays as loaded for the later cells; per the
# original note, the RFM step relies on the 'TransactionStartTime' column.
rfm_feature_names = ['Recency', 'Frequency', 'Monetary']
rfm_calc = RFMCalculator()
df_rfm = rfm_calc.fit_transform(df_raw.copy())

print("After RFMCalculator (first 5 rows):")
display(df_rfm.head())

# Report only the RFM columns that are actually present in the output frame.
rfm_columns_found = [name for name in rfm_feature_names if name in df_rfm.columns]
print(f"RFM columns added: {rfm_columns_found}")

print("\nRFM features summary statistics:")
rfm_summary = df_rfm[rfm_feature_names].describe()
display(rfm_summary)
# Section banner, matching the other transformer checks, so this output is
# not read as a continuation of the previous section.
print("\n--- Testing FeatureExtractor ---")
# Fit on a fresh copy of df_raw: per the original note, FeatureExtractor
# expects the 'TransactionStartTime' column to still be present.
extractor = FeatureExtractor()
df_extracted = extractor.fit_transform(df_raw.copy())
print("After FeatureExtractor (first 5 rows):")
display(df_extracted.head())
# Any column mentioning 'Transaction' other than the id is listed as a
# time-derived feature.
print(f"New time-based columns: {[col for col in df_extracted.columns if 'Transaction' in col and col != 'TransactionId']}")
# Prints True when the source timestamp column is absent from the output.
print(f"Original 'TransactionStartTime' column removed: {'TransactionStartTime' not in df_extracted.columns}")
# Cell 5: Test AggregateFeatures
print("\n--- Testing AggregateFeatures ---")
# Work on a copy whose 'Amount' column is coerced to numbers first;
# unparseable entries become NaN rather than raising.
df_agg_test_copy = df_raw.copy()
amount_as_numbers = pd.to_numeric(df_agg_test_copy['Amount'], errors='coerce')
df_agg_test_copy['Amount'] = amount_as_numbers

aggregator = AggregateFeatures()
df_aggregated = aggregator.fit_transform(df_agg_test_copy)

print("After AggregateFeatures (first 5 rows):")
display(df_aggregated.head())

# Aggregate columns are recognised by these two name fragments.
aggregate_name_fragments = ('TransactionAmount', 'TransactionCount')
aggregate_columns_found = [
    name
    for name in df_aggregated.columns
    if any(fragment in name for fragment in aggregate_name_fragments)
]
print(f"Aggregate columns added: {aggregate_columns_found}")

print("\nAggregate features summary statistics:")
aggregate_summary_columns = [
    'TotalTransactionAmount',
    'AverageTransactionAmount',
    'MinTransactionAmount',
    'MaxTransactionAmount',
    'TransactionCount',
    'StdDevTransactionAmount',
]
display(df_aggregated[aggregate_summary_columns].describe())

# Section banner, matching the other transformer checks, so this output is
# not read as a continuation of the aggregation section.
print("\n--- Testing MissingValueHandler ---")
# Build a copy of the raw data with NaNs injected at known row labels so the
# effect of imputation can be inspected on specific rows.
df_with_nans = df_raw.copy()
# Introduce NaN in 'Amount' and 'Value' for a couple of rows
df_with_nans.loc[[1, 5], ['Amount', 'Value']] = np.nan
# Introduce NaN in a categorical column for testing
df_with_nans.loc[[2, 6], 'ProductCategory'] = np.nan
df_with_nans.loc[[3], 'ChannelId'] = np.nan

print("Data before MissingValueHandler (showing NaNs counts):")
print(df_with_nans[['Amount', 'Value', 'ProductCategory', 'ChannelId']].isnull().sum())
print("\nBefore imputation - Sample rows with NaNs:")
display(df_with_nans[df_with_nans['Amount'].isnull() | df_with_nans['ProductCategory'].isnull()].head())

imputer = MissingValueHandler(strategy='mean')
# The imputer receives its own copy so df_with_nans keeps the injected NaNs
# for the before/after comparison below.
df_imputed = imputer.fit_transform(df_with_nans.copy())

print("\nData after MissingValueHandler (showing NaNs counts):")
print(df_imputed[['Amount', 'Value', 'ProductCategory', 'ChannelId']].isnull().sum())
print("\nAfter imputation - Sample rows (should be filled):")
# Look the rows up by TransactionId (taken from rows 1 and 2 of the NaN
# frame) rather than by position in the imputed frame.
display(df_imputed[df_imputed['TransactionId'].isin([df_with_nans.loc[1, 'TransactionId'], df_with_nans.loc[2, 'TransactionId']])])

print("\nValue counts for ProductCategory after imputation (should show no NaNs, most frequent used):")
# dropna=False keeps a NaN bucket visible if any value was left unfilled.
print(df_imputed['ProductCategory'].value_counts(dropna=False))
print("\n--- Testing CustomEncoder (One-Hot Encoding) ---")
# Candidate categorical columns for the one-hot check.
categorical_cols_onehot = ['ProductCategory', 'ChannelId', 'CurrencyCode']

# Encode a copy of the raw data; missing categories are replaced with an
# explicit 'Unknown' label here because no imputer runs in this isolated test.
df_for_encoding = df_raw.copy()
for candidate in categorical_cols_onehot:
    df_for_encoding[candidate] = df_for_encoding[candidate].fillna('Unknown')

# Keep only the candidates that exist in the frame and have object dtype.
cols_to_encode_existing = [
    candidate
    for candidate in categorical_cols_onehot
    if candidate in df_for_encoding.columns
    and df_for_encoding[candidate].dtype == 'object'
]

encoder_onehot = CustomEncoder(method='onehot', columns=cols_to_encode_existing)
df_encoded_onehot = encoder_onehot.fit_transform(df_for_encoding.copy())

print(f"Original categorical columns being encoded: {cols_to_encode_existing}")
print(f"Shape before encoding: {df_for_encoding.shape}")
print(f"Shape after One-Hot Encoding: {df_encoded_onehot.shape}")
print("\nFirst 5 rows with new one-hot encoded columns:")
display(df_encoded_onehot.head())

print("\nPresence of original and new columns:")
for col in cols_to_encode_existing:
    print(f"Is original '{col}' column present? {col in df_encoded_onehot.columns}")
    # Dummy columns are looked up by the '<column>_' prefix; show at most three.
    dummy_prefix = f"{col}_"
    matching_dummy_cols = [c for c in df_encoded_onehot.columns if c.startswith(dummy_prefix)]
    example_dummy_cols = matching_dummy_cols[:3]
    if not example_dummy_cols:
        print(f"No new dummy columns found for '{col}'. Check if it was encoded.")
    else:
        print(f"Example new dummy columns for '{col}': {example_dummy_cols}")
        
print("\n--- Testing CustomEncoder (Label Encoding) ---")
# Columns used to demonstrate label encoding.
categorical_cols_label = ['ProductId', 'ProductCategory']

# Start again from the raw data; only 'ProductCategory' has its missing
# values replaced with 'Unknown' before encoding.
df_for_label_encoding = df_raw.copy()
product_category_filled = df_for_label_encoding['ProductCategory'].fillna('Unknown')
df_for_label_encoding['ProductCategory'] = product_category_filled

encoder_label = CustomEncoder(method='label', columns=categorical_cols_label)
df_encoded_label = encoder_label.fit_transform(df_for_label_encoding.copy())

print(f"Original categorical columns being encoded: {categorical_cols_label}")
print("\nFirst 5 rows with label encoded columns:")
display(df_encoded_label[categorical_cols_label].head())

# List the distinct codes produced for each encoded column.
for encoded_column in categorical_cols_label:
    print(f"\nUnique encoded values for '{encoded_column}':")
    print(df_encoded_label[encoded_column].unique())

# Compare the dtype of 'ProductId' before and after encoding.
product_id_dtype_before = df_for_label_encoding['ProductId'].dtype
product_id_dtype_after = df_encoded_label['ProductId'].dtype
print(f"Original 'ProductId' dtype: {product_id_dtype_before}, Encoded 'ProductId' dtype: {product_id_dtype_after}")
print("\n--- Testing FeatureScaler (StandardScaler) ---")
# The scaler needs numeric input, so a working frame is built by running the
# earlier transformers in order on a copy of the raw data:
# RFM -> FeatureExtractor -> AggregateFeatures.
temp_df_for_scaling = df_raw.copy()
for transformer_class in (RFMCalculator, FeatureExtractor, AggregateFeatures):
    temp_df_for_scaling = transformer_class().fit_transform(temp_df_for_scaling)

# Impute before scaling; the numeric/object column split is taken from the
# frame as it stands after the steps above.
numerical_cols = temp_df_for_scaling.select_dtypes(include=np.number).columns.tolist()
categorical_cols = temp_df_for_scaling.select_dtypes(include='object').columns.tolist()
imputer_for_scaler_test = MissingValueHandler(
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
)
temp_df_for_scaling = imputer_for_scaler_test.fit_transform(temp_df_for_scaling)

# Identifier-like columns are left unscaled even when numeric; names that are
# not present in the frame simply never match.
ids_to_exclude = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
                  'CountryCode', 'PricingStrategy', 'FraudResult', 'ProviderId']
numeric_after_imputation = temp_df_for_scaling.select_dtypes(include=np.number).columns.tolist()
numerical_cols_to_scale = [
    name for name in numeric_after_imputation if name not in ids_to_exclude
]

print(f"Numerical columns selected for scaling: {numerical_cols_to_scale}")

if numerical_cols_to_scale:
    scaler = FeatureScaler(method='standard', columns=numerical_cols_to_scale)
    df_scaled = scaler.fit_transform(temp_df_for_scaling.copy())

    print("\nAfter FeatureScaler (first 5 rows of scaled numerical columns):")
    display(df_scaled[numerical_cols_to_scale].head())

    print("\nMean and Standard Deviation of scaled columns (should be ~0 and ~1):")
    scaled_summary = df_scaled[numerical_cols_to_scale].describe()
    display(scaled_summary.loc[['mean', 'std']])
else:
    print("No suitable numerical columns found for scaling after previous steps. Skipping scaler test.")
print("\n--- Testing Full Feature Engineering Pipeline ---")

# Re-read the raw CSV so the pipeline run does not depend on whatever state
# earlier cells left in df_raw.
df_raw = pd.read_csv(DATA_PATH)

# Assemble the full pipeline with mean imputation and one-hot encoding.
pipeline_options = {
    'numerical_imputation_strategy': 'mean',
    'categorical_encoding_method': 'onehot',
}
full_pipeline = create_feature_engineering_pipeline(**pipeline_options)

# Run the pipeline on a copy of the raw data.
print(f"Shape of raw data: {df_raw.shape}")
df_processed_full = full_pipeline.fit_transform(df_raw.copy())

print("\n--- Full Processed Data Overview ---")
print(f"Shape of fully processed data: {df_processed_full.shape}")
print("\nFirst 5 rows of fully processed data:")
display(df_processed_full.head())
print("\nInformation about fully processed columns:")
df_processed_full.info()

print("\n--- Final Check for Missing Values in Fully Processed Data ---")
# Count NaNs per column and keep only the columns that still have some.
missing_per_column = df_processed_full.isnull().sum()
final_missing_values = missing_per_column[missing_per_column > 0]
print(final_missing_values)
print(
    "No missing values found in the final processed data."
    if final_missing_values.empty
    else "Warning: Missing values still present after full pipeline. Investigate columns listed above."
)

print("\n--- Feature Engineering Pipeline Testing Completed ---")