In [9]:




# 1. Imports and Setup
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split

# Add src folder to path
sys.path.append('..')
from src import feature_engineering


# 2. Configuration
# Raw input CSV and the directory that receives the processed outputs.
DATA_PATH = Path("../data/raw/data.csv")
OUTPUT_DIR = Path("../data/processed")

# Columns dropped from the merged frame during feature selection.
DROP_COLS = [
    "TransactionId",
    "BatchId",
    "AccountId",
    "SubscriptionId",
    "CustomerId",
    "TransactionStartTime",
]

# 3. Data Loading and Preparation
# Reads the raw CSV, lets the project helpers derive datetime and
# per-customer aggregate features, then joins the aggregates onto each
# customer's most recent transaction row.
print("⏳ Loading and preparing data...")
df = pd.read_csv(DATA_PATH)
df = feature_engineering.extract_datetime_features(df, 'TransactionStartTime')
agg_df = feature_engineering.aggregate_customer_features(df)

# Get latest transaction per customer.
# GroupBy.last() is deliberately avoided: it returns the last *non-null*
# value of every column independently, so a customer's "latest" record could
# be stitched together from several different transactions whenever the
# newest row has missing values. Keeping the last duplicate after a
# chronological sort selects the real final row instead.
df_latest = (
    df.dropna(subset=["CustomerId"])  # groupby() ignored NaN keys as well
    # Stable sort: transactions with equal timestamps keep their file order,
    # which makes the choice of "latest" row deterministic.
    .sort_values("TransactionStartTime", kind="stable")
    .drop_duplicates(subset="CustomerId", keep="last")
    # Preserve the CustomerId-sorted row order that groupby() used to produce.
    .sort_values("CustomerId")
    .reset_index(drop=True)
)

# Merge features (inner join: only customers present in both frames remain)
df_merged = pd.merge(df_latest, agg_df, on="CustomerId")

# 4. Feature Selection
print("🔍 Selecting features...")
df_model = df_merged.drop(columns=DROP_COLS)

# Numeric columns the pipeline should receive, in the order they are passed on.
candidate_numeric = [
    'Amount',
    'Value',
    'PricingStrategy',
    'total_amount',
    'avg_amount',
    'std_amount',
    'transaction_count',
    'transaction_hour',
    'transaction_day',
    'transaction_month',
    'transaction_year',
    'max_amount',
    'min_amount',
    'avg_value',
    'std_value',
]

# Keep only the candidates that actually exist in the modelling frame.
available_columns = set(df_model.columns)
numerical_features = [
    name for name in candidate_numeric if name in available_columns
]

# Every remaining object-dtype column is treated as categorical.
object_columns = df_model.select_dtypes(include=["object"]).columns
categorical_features = list(object_columns)

# 5. Feature Processing
# Builds the project's preprocessing pipeline for the selected columns and
# fits/transforms the modelling frame in one pass.
print("⚙️ Processing features...")
pipeline = feature_engineering.build_feature_pipeline(numerical_features, categorical_features)
X_processed = pipeline.fit_transform(df_model)

# scikit-learn transformers can return a SciPy sparse matrix (e.g. when
# one-hot encoded output is sparse enough). pd.DataFrame cannot lay a sparse
# matrix out against `columns=`, so densify first. The attribute check avoids
# adding a scipy import and is a no-op for ordinary ndarrays.
if hasattr(X_processed, "toarray"):
    X_processed = X_processed.toarray()

# 6. Save Outputs
print("💾 Saving processed data...")
# parents=True so a fresh checkout without the intermediate directory works.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Save processed features
# NOTE(review): assumes the pipeline exposes a step named 'preprocessing'
# that implements get_feature_names_out() — confirm in src/feature_engineering.
feature_names = pipeline.named_steps['preprocessing'].get_feature_names_out()
pd.DataFrame(X_processed, columns=feature_names).to_csv(
    OUTPUT_DIR / 'X_preprocessed.csv',
    index=False,
)

# Save customer IDs for target merging. Rows are in the same order as
# X_processed because both are derived from df_merged.
df_merged[['CustomerId']].to_csv(OUTPUT_DIR / 'customer_ids.csv', index=False)

print("✅ Feature engineering complete!")
print(f"📊 Final feature matrix shape: {X_processed.shape}")

⏳ Loading and preparing data...
🔍 Selecting features...
⚙️ Processing features...
💾 Saving processed data...
✅ Feature engineering complete!
📊 Final feature matrix shape: (3742, 55)
