In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

print("🚀 Starting Synthetic Salary Feature Generation...")

# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
print("\n📂 Loading data...")

# Load previous features
df = pd.read_csv('../DATA/processed/customer_complete_features.csv')

print(f"✅ Loaded {len(df):,} customers")
print(f"   Current columns: {df.shape[1]}")

# Fail fast with a clear message if an input column is absent, instead of
# hitting a KeyError part-way through the slow row-wise passes below.
required_columns = [
    'AMT_INCOME_TOTAL',
    'bureau_risk_flag',
    'payment_discipline_flag',
    'liquidity_flag',
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input file is missing required columns: {missing_columns}")

# ============================================================================
# STEP 2: BASE SALARY CALCULATION
# ============================================================================
print("\n💵 Calculating base salary metrics...")

# Use AMT_INCOME_TOTAL as base salary
# NOTE(review): the unit of AMT_INCOME_TOTAL (monthly vs. annual) is not
# visible here -- confirm it matches the meaning of a 6-month average salary.
df['avg_salary_6m'] = df['AMT_INCOME_TOTAL']

# Salary transaction count (assume monthly salary = 6 transactions in 6 months)
# This is a placeholder: the column is recomputed in STEP 6 as
# 6 - salary_missing_months.
df['salary_txn_count_6m'] = 6

# ============================================================================
# STEP 3: SALARY AMOUNT VARIANCE (Realistic Simulation)
# ============================================================================
print("\n📊 Generating salary variance patterns...")

def calculate_salary_cv(row):
    """Return a synthetic coefficient of variation (CV) for a salary.

    Starts from a 5% base and adds a fixed penalty for each risk signal
    present on the row, then caps the total at 0.35.

    Args:
        row: Mapping-like record (e.g. a DataFrame row passed by
            ``df.apply(..., axis=1)``) providing ``bureau_risk_flag``,
            ``payment_discipline_flag`` and ``liquidity_flag``.

    Returns:
        float: CV between 0.05 and 0.35, rounded to 4 decimal places.
    """
    cv_base = 0.05  # 5% base variance

    # Bureau risk: HIGH adds 15 points, MEDIUM adds 8, anything else adds 0.
    if row['bureau_risk_flag'] == 'HIGH':
        cv_base += 0.15
    elif row['bureau_risk_flag'] == 'MEDIUM':
        cv_base += 0.08

    # Payment discipline: POOR adds 10 points, MODERATE adds 5.
    if row['payment_discipline_flag'] == 'POOR':
        cv_base += 0.10
    elif row['payment_discipline_flag'] == 'MODERATE':
        cv_base += 0.05

    # Low liquidity adds 8 points.
    if row['liquidity_flag'] == 'LOW':
        cv_base += 0.08

    # Summing decimal penalties in binary floating point leaves round-off
    # (0.05 + 0.10 evaluates to 0.15000000000000002). Round it away so strict
    # threshold checks such as ``cv > 0.15`` see the intended value.
    return round(min(cv_base, 0.35), 4)

# Row-wise pass: calculate_salary_cv receives each customer row in turn.
df['salary_amount_cv'] = df.apply(calculate_salary_cv, axis=1)

print(f"   Salary CV range: {df['salary_amount_cv'].min():.3f} to {df['salary_amount_cv'].max():.3f}")

# ============================================================================
# STEP 4: SALARY DATE VARIANCE
# ============================================================================
print("\n📅 Generating salary date variance...")

def calculate_salary_date_std(row):
    """Return a synthetic standard deviation (in days) of salary credit dates.

    A 2-day baseline is widened by fixed amounts for bureau risk, poor
    payment discipline and a salary CV above 0.20; the result is capped at
    10 days.
    """
    bureau_flag = row['bureau_risk_flag']
    if bureau_flag == 'HIGH':
        bureau_days = 5.0
    else:
        bureau_days = 2.5 if bureau_flag == 'MEDIUM' else 0.0

    discipline_days = 3.0 if row['payment_discipline_flag'] == 'POOR' else 0.0
    volatility_days = 2.0 if row['salary_amount_cv'] > 0.20 else 0.0

    # Same left-to-right accumulation as a running total starting at 2.0.
    spread_days = 2.0 + bureau_days + discipline_days + volatility_days
    return min(spread_days, 10.0)

# Row-wise pass; calculate_salary_date_std reads the salary_amount_cv column
# created in STEP 3, so this must run after it.
df['salary_date_std'] = df.apply(calculate_salary_date_std, axis=1)

print(f"   Date std range: {df['salary_date_std'].min():.1f} to {df['salary_date_std'].max():.1f} days")

# ============================================================================
# STEP 5: EMPLOYER CONSISTENCY
# ============================================================================
print("\n🏢 Generating employer consistency flags...")

def calculate_employer_consistency(row):
    """Draw a 0/1 flag for whether the salary creditor stays consistent.

    Rows with risk signals get a random draw from NumPy's global RNG, with
    the chance of 0 (inconsistent) growing with the number of signals; rows
    with no signal always return 1.
    """
    bureau_is_high = row['bureau_risk_flag'] == 'HIGH'
    discipline_is_poor = row['payment_discipline_flag'] == 'POOR'

    # Pick the [P(0), P(1)] weights for this row; no weights means no draw.
    if bureau_is_high and discipline_is_poor:
        weights = [0.3, 0.7]
    elif bureau_is_high or discipline_is_poor:
        weights = [0.15, 0.85]
    elif row['salary_amount_cv'] > 0.20:
        weights = [0.10, 0.90]
    else:
        return 1

    return np.random.choice([0, 1], p=weights)

# Seed NumPy's global RNG so the random draws made from here on in this run
# (employer consistency, then missing months in STEP 6) are reproducible.
np.random.seed(42)
df['salary_creditor_consistent'] = df.apply(calculate_employer_consistency, axis=1)

print(f"   Consistent: {df['salary_creditor_consistent'].sum():,} ({df['salary_creditor_consistent'].mean()*100:.1f}%)")

# ============================================================================
# STEP 6: MISSING SALARY MONTHS
# ============================================================================
print("\n❌ Calculating missing salary months...")

def calculate_missing_months(row):
    """Draw how many months lacked a salary credit.

    An inconsistent creditor always yields 1 or 2 missing months. Otherwise
    the draw depends on bureau risk, and rows that are neither HIGH nor
    MEDIUM return 0 without consuming a random number.
    """
    if row['salary_creditor_consistent'] == 0:
        outcomes, weights = [1, 2], [0.7, 0.3]
    else:
        bureau_flag = row['bureau_risk_flag']
        if bureau_flag == 'HIGH':
            outcomes, weights = [0, 1, 2], [0.6, 0.3, 0.1]
        elif bureau_flag == 'MEDIUM':
            outcomes, weights = [0, 1], [0.85, 0.15]
        else:
            return 0

    return np.random.choice(outcomes, p=weights)

# Row-wise pass; continues the random stream seeded in STEP 5.
df['salary_missing_months'] = df.apply(calculate_missing_months, axis=1)
# Replace the placeholder count of 6 with the months that actually had a credit.
df['salary_txn_count_6m'] = 6 - df['salary_missing_months']

print(f"   Customers with missing months: {(df['salary_missing_months'] > 0).sum():,}")

# ============================================================================
# STEP 7: SALARY STABILITY FLAG
# ============================================================================
print("\n🚦 Creating salary stability flag...")

def assign_salary_stability(row):
    """Classify a row as 'UNSTABLE', 'MODERATE' or 'STABLE'.

    Any single strong signal makes the salary UNSTABLE; failing that, any
    single milder signal makes it MODERATE; otherwise it is STABLE.
    """
    # Strong signals, checked in turn with an early exit on the first hit.
    amount_cv = row['salary_amount_cv']
    if amount_cv > 0.15:
        return 'UNSTABLE'

    date_std = row['salary_date_std']
    if date_std > 5.0:
        return 'UNSTABLE'

    if row['salary_creditor_consistent'] == 0:
        return 'UNSTABLE'

    missing_months = row['salary_missing_months']
    if missing_months > 1:
        return 'UNSTABLE'

    # Milder signals.
    if amount_cv > 0.10 or date_std > 3.0 or missing_months == 1:
        return 'MODERATE'

    return 'STABLE'

# Row-wise pass combining the four salary signals into a single label.
df['salary_stability_flag'] = df.apply(assign_salary_stability, axis=1)

# ============================================================================
# STEP 8: STATISTICS
# ============================================================================
print("\n" + "="*60)
print("✅ SALARY FEATURES COMPLETE!")
print("="*60)

print(f"\n📊 Dataset: {df.shape}")
print("\n💵 Salary Statistics:")
print(df[['avg_salary_6m', 'salary_amount_cv', 'salary_date_std']].describe())

# Count and percentage share of each stability label.
print("\n📈 Salary Stability Distribution:")
for stab, count in df['salary_stability_flag'].value_counts().items():
    print(f"   {stab:10s}: {count:6,} ({count/len(df)*100:5.1f}%)")

# The flag is 0/1, so its sum counts consistent rows and the sum of
# (1 - flag) counts inconsistent rows.
print("\n🏢 Employer Consistency:")
print(f"   Consistent: {df['salary_creditor_consistent'].sum():,}")
print(f"   Inconsistent: {(1-df['salary_creditor_consistent']).sum():,}")

print("\n❌ Missing Months:")
print(df['salary_missing_months'].value_counts().sort_index())

# ============================================================================
# STEP 9: CORRELATION VALIDATION
# ============================================================================
print("\n🔍 Validating correlations...")

# Group means let a reader eyeball that the synthetic features move with the
# risk flags they were derived from.
print("\n   Salary CV by Bureau Risk:")
for risk, cv in df.groupby('bureau_risk_flag')['salary_amount_cv'].mean().items():
    print(f"      {risk:8s}: {cv:.3f}")

print("\n   Date Std by Payment Discipline:")
for disc, std in df.groupby('payment_discipline_flag')['salary_date_std'].mean().items():
    print(f"      {disc:10s}: {std:.2f} days")

# ============================================================================
# STEP 10: SAVE
# ============================================================================
# Write the input columns plus the new salary features; index=False keeps the
# DataFrame index out of the CSV.
output_file = Path('../DATA/processed/customer_final_features.csv')
df.to_csv(output_file, index=False)

print(f"\n💾 Saved: {output_file}")
print(f"   Total columns: {df.shape[1]}")
print(f"   Total rows: {df.shape[0]:,}")

print("\n" + "="*60)
print("🎉 FEATURE ENGINEERING COMPLETE!")
print("="*60)
print("\n✅ Ready for Step 7: Risk Scoring Engine")
# ```

# ---

# ## 🚀 What to Do Now

# 1. **Create new notebook**: `notebooks/04_salary_features.ipynb`
# 2. **Paste the code above** (clean version, no markdown)
# 3. **Run all cells**
# 4. **Wait ~30 seconds**

# ---

# ## ✅ Expected Output

# You should see:
# ```
# 🎉 FEATURE ENGINEERING COMPLETE!
# ✅ Ready for Step 7: Risk Scoring Engine


🚀 Starting Synthetic Salary Feature Generation...

📂 Loading data...
✅ Loaded 307,511 customers
   Current columns: 41

💵 Calculating base salary metrics...

📊 Generating salary variance patterns...
   Salary CV range: 0.050 to 0.350

📅 Generating salary date variance...
   Date std range: 2.0 to 10.0 days

🏢 Generating employer consistency flags...
   Consistent: 307,151 (99.9%)

❌ Calculating missing salary months...
   Customers with missing months: 925

🚦 Creating salary stability flag...

✅ SALARY FEATURES COMPLETE!

📊 Dataset: (307511, 48)

💵 Salary Statistics:
       avg_salary_6m  salary_amount_cv  salary_date_std
count   3.075110e+05     307511.000000    307511.000000
mean    1.687979e+05          0.123362         2.050442
std     2.371231e+05          0.025864         0.524267
min     2.565000e+04          0.050000         2.000000
25%     1.125000e+05          0.130000         2.000000
50%     1.471500e+05          0.130000         2.000000
7