In [32]:
import os
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load variables from a .env file (if one is found) into the process
# environment; the database settings are read from there via os.getenv.
load_dotenv()

# Absolute form of the current working directory.
# NOTE(review): assumes the kernel's working directory is the folder that
# contains this notebook -- confirm for your Jupyter/VS Code setup.
NOTEBOOK_DIR = Path().absolute()

# Locate the project root:
#   <root>/scripts/notebooks/  -> go up TWO levels
#   <root>/notebooks/          -> go up ONE level
# The immediate parent's *name* is compared instead of substring-searching the
# whole path: `'scripts' in str(path)` also matches unrelated ancestors
# (e.g. C:\Users\me\my_scripts\Payflow\notebooks) and would then pick a
# directory one level too high, creating data/ and output/ in the wrong place.
if NOTEBOOK_DIR.parent.name == 'scripts':
    PROJECT_ROOT = NOTEBOOK_DIR.parent.parent  # scripts/notebooks/ -> project root
    print(f"üìç Detected notebook in scripts/ subdirectory")
else:
    PROJECT_ROOT = NOTEBOOK_DIR.parent  # notebooks/ -> project root
    print(f"üìç Detected notebook at root level")

print("="*70)
print("PATH CONFIGURATION")
print("="*70)
print(f"Notebook directory: {NOTEBOOK_DIR}")
print(f"Project root:       {PROJECT_ROOT}")

# Data directories live directly under PROJECT_ROOT (not under scripts/).
DATA_DIR = PROJECT_ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'
OUTPUT_DIR = PROJECT_ROOT / 'output'

# Create the directories on first run; exist_ok keeps re-runs harmless.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# File paths defined once here for use by the rest of the notebook.
TRANSACTIONS_CLEAN = PROCESSED_DIR / 'transactions_clean.csv'
CUSTOMERS_CLEAN = PROCESSED_DIR / 'customers_clean.csv'
FRAUD_FLAGGED = PROCESSED_DIR / 'fraud_flagged.csv'
FRAUD_MODEL = OUTPUT_DIR / 'fraud_model.pkl'
SCALER = OUTPUT_DIR / 'scaler.pkl'

print(f"\nüìÇ Directories:")
print(f"   Data:       {DATA_DIR}")
print(f"   Processed:  {PROCESSED_DIR}")
print(f"   Output:     {OUTPUT_DIR}")

print(f"\nüìÑ File paths:")
print(f"   transactions_clean.csv ‚Üí {TRANSACTIONS_CLEAN}")
print(f"   customers_clean.csv    ‚Üí {CUSTOMERS_CLEAN}")
print(f"   fraud_flagged.csv      ‚Üí {FRAUD_FLAGGED}")
print(f"   fraud_model.pkl        ‚Üí {FRAUD_MODEL}")
print(f"   scaler.pkl             ‚Üí {SCALER}")
print("="*70 + "\n")

# Database connection settings come from the environment so that no secret is
# stored in the notebook itself.
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT', '5432')

# Build the credential part of the URL.
# - The password is percent-encoded: characters such as '@', ':', '/' or '#'
#   would otherwise be read as URL delimiters and corrupt the host/port/db.
# - When DB_PASSWORD is unset the password is omitted, instead of sending the
#   literal text "None" that plain f-string interpolation would produce.
if DB_PASSWORD is None:
    _db_credentials = 'postgres'
else:
    _db_credentials = f"postgres:{quote_plus(DB_PASSWORD)}"
DATABASE_URL = f"postgresql://{_db_credentials}@localhost:{DB_PORT}/payflow_commerce"

# Connect to PostgreSQL
engine = create_engine(DATABASE_URL)

print("Loading data from PostgreSQL...")

# Pull each table in full into its own DataFrame.
transactions = pd.read_sql("SELECT * FROM transactions", engine)
customers = pd.read_sql("SELECT * FROM customers", engine)
products = pd.read_sql("SELECT * FROM products", engine)

print(f"Loaded {len(transactions)} transactions")
print(f"Loaded {len(customers)} customers")
print(f"Loaded {len(products)} products")

üìç Detected notebook in scripts/ subdirectory
PATH CONFIGURATION
Notebook directory: c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\scripts\notebooks
Project root:       c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow

üìÇ Directories:
   Data:       c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\data
   Processed:  c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\data\processed
   Output:     c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\output

üìÑ File paths:
   transactions_clean.csv ‚Üí c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\data\processed\transactions_clean.csv
   customers_clean.csv    ‚Üí c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\data\processed\customers_clean.csv
   fraud_flagged.csv      ‚Üí c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\data\processed\fraud_flagged.csv
   fraud_model.pkl        ‚Üí c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\output\fraud_model.pkl
   scaler.pkl

In [28]:
# Read-only quality report on the loaded frames: null counts, duplicate
# customer e-mails, column dtypes and obviously invalid values.
rule = "=" * 50
print(rule, "DATA QUALITY ASSESSMENT", rule, sep="\n")

# Per-column count of missing values in the transactions table
print("\n--- Missing Values ---")
print("\nTransactions:")
null_counts = transactions.isna().sum()
print(null_counts)

# keep=False marks every member of a duplicate group, not only the repeats
print("\n--- Duplicate Customers (by email) ---")
shares_email = customers.duplicated(subset=['email'], keep=False)
duplicate_emails = customers[shares_email]
print(f"Duplicate customer records: {len(duplicate_emails)}")

print("\n--- Data Types ---")
print(transactions.dtypes)

# Sanity checks: amounts below zero and order dates later than "now"
print("\n--- Invalid Values Check ---")
negative_amount_count = transactions['amount'].lt(0).sum()
print(f"Negative amounts: {negative_amount_count}")
parsed_order_dates = pd.to_datetime(transactions['order_date'])
future_date_count = parsed_order_dates.gt(pd.Timestamp.now()).sum()
print(f"Future dates: {future_date_count}")

DATA QUALITY ASSESSMENT

--- Missing Values ---

Transactions:
transaction_id             0
customer_id                0
product_id                 0
order_date                 0
order_time               662
amount                     0
quantity                   0
payment_method             0
shipping_address         423
billing_address            0
acquisition_channel        0
is_fraud                   0
chargeback_date        13407
device_type                0
ip_address                 0
dtype: int64

--- Duplicate Customers (by email) ---
Duplicate customer records: 770

--- Data Types ---
transaction_id          object
customer_id             object
product_id              object
order_date              object
order_time              object
amount                 float64
quantity                 int64
payment_method          object
shipping_address        object
billing_address         object
acquisition_channel     object
is_fraud                  bool
chargeback_date         o

In [29]:
print("\n" + "="*50)
print("CLEANING DATA")
print("="*50)

# Thresholds behind the engineered flags below.
HIGH_VALUE_AMOUNT = 500        # amounts strictly above this count as high value
NEW_CUSTOMER_MAX_DAYS = 30     # fewer days than this between signup and order = new customer

# Work on a copy so the raw `transactions` frame is never modified and this
# cell gives the same result every time it is re-run.
transactions_clean = transactions.copy()

# Fill missing order_time with the median *hour* of the parseable times,
# written as HH:00:00. Values that do not match %H:%M:%S are coerced to NaT
# and therefore ignored by the median.
order_hours = pd.to_datetime(
    transactions_clean['order_time'], format='%H:%M:%S', errors='coerce'
).dt.hour
median_hour = order_hours.median()
if pd.notna(median_hour):
    transactions_clean['order_time'] = transactions_clean['order_time'].fillna(
        f"{int(median_hour):02d}:00:00"
    )
else:
    # Without any parseable time the median is NaN and int(NaN) would raise;
    # leave the column as it is rather than inventing a value.
    print("Warning: no parseable order_time values; missing times left unfilled")

# Drop rows with missing critical data
print(f"\nRows before cleaning: {len(transactions_clean)}")
transactions_clean = transactions_clean.dropna(subset=['customer_id', 'amount'])
print(f"Rows after cleaning: {len(transactions_clean)}")

# dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
transactions_clean['is_weekend'] = (
    pd.to_datetime(transactions_clean['order_date']).dt.dayofweek.isin([5, 6]).astype(int)
)
transactions_clean['is_high_value'] = (transactions_clean['amount'] > HIGH_VALUE_AMOUNT).astype(int)

# Attach each customer's signup_date.
# The lookup is reduced to one row per customer_id first (first occurrence
# wins): a repeated customer_id would otherwise multiply transaction rows in
# the left join. validate='many_to_one' makes that expectation explicit.
signup_lookup = customers[['customer_id', 'signup_date']].drop_duplicates(subset='customer_id')
transactions_clean = transactions_clean.merge(
    signup_lookup, on='customer_id', how='left', validate='many_to_one'
)

# Whole days between signup and order; NaN when signup_date is missing, which
# makes the `<` comparison False and therefore is_new_customer = 0.
transactions_clean['days_since_signup'] = (
    pd.to_datetime(transactions_clean['order_date'])
    - pd.to_datetime(transactions_clean['signup_date'])
).dt.days
transactions_clean['is_new_customer'] = (
    transactions_clean['days_since_signup'] < NEW_CUSTOMER_MAX_DAYS
).astype(int)

# Shipping/billing mismatch.
# A plain `!=` evaluates to True whenever either side is missing, so rows with
# no shipping address were being flagged as mismatches. Only flag rows where
# both addresses are present and actually differ.
shipping = transactions_clean['shipping_address']
billing = transactions_clean['billing_address']
transactions_clean['shipping_billing_mismatch'] = (
    shipping.notna() & billing.notna() & (shipping != billing)
).astype(int)

print("\nFeatures engineered:")
print("  - is_weekend")
print("  - is_high_value")
print("  - is_new_customer")
print("  - days_since_signup")
print("  - shipping_billing_mismatch")


CLEANING DATA

Rows before cleaning: 14082
Rows after cleaning: 14082

Features engineered:
  - is_weekend
  - is_high_value
  - is_new_customer
  - days_since_signup
  - shipping_billing_mismatch


In [30]:
# Write the cleaned transactions and the customer table to CSV, then report
# the shape and columns of the transactions frame that was exported.
divider = "=" * 50
print("\n" + divider)
print("EXPORTING CLEANED DATA")
print(divider)

# NOTE(review): `customers` is written as loaded -- no cleaning step for it is
# visible in this notebook even though the target file is customers_clean.csv.
for frame, destination in (
    (transactions_clean, TRANSACTIONS_CLEAN),
    (customers, CUSTOMERS_CLEAN),
):
    frame.to_csv(destination, index=False)

exported_columns = list(transactions_clean.columns)
print(f"\nFinal dataset shape: {transactions_clean.shape}")
print(f"Columns: {exported_columns}")

# Numbered listing of every exported column
print("\nColumns in exported file:")
for position, column_name in enumerate(exported_columns, start=1):
    print(f"  {position:2d}. {column_name}")

# Columns added during cleaning that must be present in the export
engineered_features = [
    'is_weekend',
    'is_high_value',
    'is_new_customer',
    'days_since_signup',
    'shipping_billing_mismatch',
]

print("\n--- Engineered Features Check ---")
for feature_name in engineered_features:
    if feature_name not in transactions_clean.columns:
        print(f"  ‚úó {feature_name} MISSING!")
        continue
    print(f"  ‚úì {feature_name}")


EXPORTING CLEANED DATA

Final dataset shape: (14082, 21)
Columns: ['transaction_id', 'customer_id', 'product_id', 'order_date', 'order_time', 'amount', 'quantity', 'payment_method', 'shipping_address', 'billing_address', 'acquisition_channel', 'is_fraud', 'chargeback_date', 'device_type', 'ip_address', 'is_weekend', 'is_high_value', 'signup_date', 'days_since_signup', 'is_new_customer', 'shipping_billing_mismatch']

Columns in exported file:
   1. transaction_id
   2. customer_id
   3. product_id
   4. order_date
   5. order_time
   6. amount
   7. quantity
   8. payment_method
   9. shipping_address
  10. billing_address
  11. acquisition_channel
  12. is_fraud
  13. chargeback_date
  14. device_type
  15. ip_address
  16. is_weekend
  17. is_high_value
  18. signup_date
  19. days_since_signup
  20. is_new_customer
  21. shipping_billing_mismatch

--- Engineered Features Check ---
  ‚úì is_weekend
  ‚úì is_high_value
  ‚úì is_new_customer
  ‚úì days_since_signup
  ‚úì shipping_billi

In [31]:
# ============================================================
# VERIFICATION CELL - keep this as the LAST cell in the notebook
# ============================================================
# Reads the exported transactions CSV back from disk and reports whether every
# engineered feature column is present in the saved file.

banner = "=" * 70
print("\n" + banner)
print("VERIFICATION - Checking what was actually saved")
print(banner)

# Where the export was written
print("\nüìç File saved to:")
print(f"   {TRANSACTIONS_CLEAN.absolute()}")

# Read the file back; `pd` comes from the import cell at the top of the notebook
df_verify = pd.read_csv(TRANSACTIONS_CLEAN)
saved_columns = df_verify.columns

print("\nüìä File on disk has:")
print(f"   Rows: {len(df_verify):,}")
print(f"   Columns: {len(saved_columns)}")

print("\nüìã Columns in saved file:")
for position, column_name in enumerate(saved_columns, start=1):
    print(f"   {position:2d}. {column_name}")

# Engineered features expected in the saved file
engineered = [
    'is_weekend',
    'is_high_value',
    'is_new_customer',
    'days_since_signup',
    'shipping_billing_mismatch',
]

print("\n‚úÖ Engineered Features in SAVED file:")
missing = [name for name in engineered if name not in saved_columns]
for name in engineered:
    print(f"   ‚úó {name} MISSING!" if name in missing else f"   ‚úì {name}")

if not missing:
    print("\n‚úÖ SUCCESS: All features saved correctly!")
else:
    print("\n‚ùå ERROR: Features created but not saved!")
    print(f"   Features missing: {missing}")
    # Compare in-memory and on-disk column counts to help locate the gap
    print("\nüîç Debugging info:")
    print(f"   transactions_clean in memory has {len(transactions_clean.columns)} columns")
    print(f"   File on disk has {len(saved_columns)} columns")
    print("   ‚Üí These should match!")


VERIFICATION - Checking what was actually saved

üìç File saved to:
   c:\Users\grant\OneDrive\Desktop\Projects\vscode\Payflow\data\processed\transactions_clean.csv

üìä File on disk has:
   Rows: 14,082
   Columns: 21

üìã Columns in saved file:
    1. transaction_id
    2. customer_id
    3. product_id
    4. order_date
    5. order_time
    6. amount
    7. quantity
    8. payment_method
    9. shipping_address
   10. billing_address
   11. acquisition_channel
   12. is_fraud
   13. chargeback_date
   14. device_type
   15. ip_address
   16. is_weekend
   17. is_high_value
   18. signup_date
   19. days_since_signup
   20. is_new_customer
   21. shipping_billing_mismatch

‚úÖ Engineered Features in SAVED file:
   ‚úì is_weekend
   ‚úì is_high_value
   ‚úì is_new_customer
   ‚úì days_since_signup
   ‚úì shipping_billing_mismatch

‚úÖ SUCCESS: All features saved correctly!
