In [1]:
# 1. Imports and Setup
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split

# Add src folder to path
sys.path.append('..')
from src import feature_engineering

# 2. Load and Prepare Data
print("⏳ Loading data...")
df = pd.read_csv('../data/raw/data.csv')

# 3. Feature Engineering
print("🛠️ Engineering features...")
df = feature_engineering.extract_datetime_features(df, 'TransactionStartTime')
agg_df = feature_engineering.aggregate_customer_features(df)

# Get latest transaction per customer.
# GroupBy.last() is deliberately avoided: it returns the last *non-null value
# of each column* within a group, so a customer whose newest row has a missing
# field would silently get that field filled from an older transaction.
# Selecting whole rows keeps every value from the same transaction.
#   - dropna: rows without a CustomerId are excluded, as groupby did.
#   - stable sort: rows sharing a timestamp keep their file order, so the
#     "latest" row is deterministic.
#   - final sort: keeps the CustomerId ordering that groupby produced.
df_latest = (df.dropna(subset=["CustomerId"])
             .sort_values("TransactionStartTime", kind="stable")
             .drop_duplicates(subset="CustomerId", keep="last")
             .sort_values("CustomerId")
             .reset_index(drop=True))

# Merge with aggregated features (inner join on CustomerId)
df_merged = pd.merge(df_latest, agg_df, on="CustomerId")

# 4. Prepare Modeling Data
print("📊 Preparing modeling data...")
# Columns removed before modeling (identifiers and the raw timestamp column,
# judging by their names).
drop_cols = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'TransactionStartTime',
]
df_model = df_merged.drop(columns=drop_cols)

# 5. Define Feature Types
# Candidate numeric columns, in the order they should be passed to the pipeline.
candidate_numerical = (
    'Amount', 'Value', 'PricingStrategy',
    'total_amount', 'avg_amount', 'std_amount', 'transaction_count',
    'transaction_hour', 'transaction_day', 'transaction_month', 'transaction_year',
)
# Keep only the candidates that are actually present in the modeling frame.
present_columns = set(df_model.columns)
numerical_features = [name for name in candidate_numerical if name in present_columns]
# Every object-dtype column is treated as categorical.
categorical_features = list(df_model.select_dtypes(include="object").columns)

# 6. Build and Apply Pipeline
print("🔧 Building and applying preprocessing pipeline...")
pipeline = feature_engineering.build_feature_pipeline(
    numerical_features,
    categorical_features
)

X_preprocessed = pipeline.fit_transform(df_model)

# 7. Get Feature Names
try:
    feature_names = pipeline.named_steps['preprocessing'].get_feature_names_out()
except AttributeError:
    # Fallback for older sklearn versions: rebuild the names by hand as the
    # numeric column names followed by the encoder's expanded category names.
    # NOTE(review): this assumes the 'preprocessing' step emits numeric columns
    # first and has a 'cat' transformer with an 'encoder' step; that layout is
    # defined in feature_engineering.build_feature_pipeline - confirm there.
    numeric_names = numerical_features
    cat_encoder = pipeline.named_steps['preprocessing'].named_transformers_['cat'].named_steps['encoder']
    # get_feature_names_out only exists from scikit-learn 1.0 onwards; earlier
    # releases expose the same information through get_feature_names, so use
    # whichever the installed encoder provides.
    encoder_names = getattr(cat_encoder, 'get_feature_names_out', None)
    if encoder_names is None:
        encoder_names = cat_encoder.get_feature_names
    cat_names = encoder_names(categorical_features)
    feature_names = np.concatenate([numeric_names, cat_names])

# 8. Save Results
print("💾 Saving processed data...")
os.makedirs('../data/processed', exist_ok=True)

# A ColumnTransformer containing a one-hot encoder may return a SciPy sparse
# matrix when the combined output is sparse enough, and pd.DataFrame cannot
# build a table directly from one. Densify first if needed; X_preprocessed
# itself is left untouched for any later use.
if hasattr(X_preprocessed, "toarray"):
    X_dense = X_preprocessed.toarray()
else:
    X_dense = X_preprocessed

pd.DataFrame(X_dense, columns=feature_names).to_csv(
    '../data/processed/X_preprocessed.csv',
    index=False
)

print("✅ Processing complete!")
print(f"🔢 Final shape: {X_preprocessed.shape}")
print(f"📝 Features: {list(feature_names)}")

⏳ Loading data...
🛠️ Engineering features...
📊 Preparing modeling data...
🔧 Building and applying preprocessing pipeline...
💾 Saving processed data...
✅ Processing complete!
🔢 Final shape: (3742, 51)
📝 Features: ['num__Amount', 'num__Value', 'num__PricingStrategy', 'num__total_amount', 'num__avg_amount', 'num__std_amount', 'num__transaction_count', 'num__transaction_hour', 'num__transaction_day', 'num__transaction_month', 'num__transaction_year', 'cat__CurrencyCode_UGX', 'cat__ProviderId_ProviderId_1', 'cat__ProviderId_ProviderId_2', 'cat__ProviderId_ProviderId_3', 'cat__ProviderId_ProviderId_4', 'cat__ProviderId_ProviderId_5', 'cat__ProviderId_ProviderId_6', 'cat__ProductId_ProductId_1', 'cat__ProductId_ProductId_10', 'cat__ProductId_ProductId_11', 'cat__ProductId_ProductId_13', 'cat__ProductId_ProductId_14', 'cat__ProductId_ProductId_15', 'cat__ProductId_ProductId_16', 'cat__ProductId_ProductId_19', 'cat__ProductId_ProductId_2', 'cat__ProductId_ProductId_20', 'cat__ProductId_ProductI