In [None]:
"""
================================================================================
Main: HDB Resale Price – Exploratory Data Analysis (EDA) Workflow
================================================================================

This script uses Snowpark and pandas to conduct an in-depth EDA on HDB resale
flat data (from the HDB_SILVER table) to support fair resale price estimation.

--------------------------------------------------------------------------------
Step-by-Step Workflow
--------------------------------------------------------------------------------
| Step | Title                               | Description                                                                 |
|------|-------------------------------------|-----------------------------------------------------------------------------|
| 0    | Environment Setup                    | Imports Snowpark, pandas, numpy, matplotlib, seaborn; sets plotting styles   |
| 1    | Data Acquisition & Overview          | Loads HDB_SILVER into Snowpark DataFrame; prints record count and features |
| 2    | Missing Values Analysis              | Checks and reports null values for key numerical & categorical columns      |
| 3    | Numerical Variable Analysis          | Computes count, mean, median, min, max, stddev for numeric attributes       |
| 4    | Resale Price Statistics              | Calculates detailed stats and percentiles (25th–99th) for resale price      |
| 5    | Categorical Variable Analysis         | Reports unique values, frequency, average and median resale price per category |
| 6    | Visualizations – Numeric             | Generates scatterplots, histograms, distributions, correlation heatmap, pairplot |
| 7    | Visualizations – Categorical         | Creates boxplots and count plots for categorical features vs resale price   |
| 8    | Correlation & Relationship Analysis  | Correlation matrix, correlation strength, and pair relationships            |
| 9    | Advanced Visualizations              | Distribution plots with KDE, log transforms, Q–Q plots, and special resale price views |
| 10   | Price Analysis by Category           | Aggregates price stats (min/avg/median/max) per category                     |
| 11   | Outlier Detection & Analysis         | IQR-based outlier counts, thresholds, extreme value stats, outlier visualizations |
| 12   | Data Quality Checks                  | Validates AGE, MONTH_NUM, FLOOR_AREA_SQM, RESALE_PRICE within realistic ranges |
| 13   | Feature Engineering Suggestions      | Proposes engineered features like PRICE_PER_SQM, AGE_GROUP, SEASON, PRICE_TIER |
| 14   | Outlier Treatment Strategy           | Strategy guidelines for handling, investigating, transforming outliers      |
| 15   | Modeling Recommendations             | Model pipeline suggestions: Feature selection, model types, validation strategy |
| 16   | Final Summary & Next Steps           | Prints dataset summary, strongest predictor, and actionable next steps       |
--------------------------------------------------------------------------------

Usage:
    main(session)

Inputs:
    - session: snowpark.Session (active Snowpark session connected to Snowflake)

Visualization Tools:
    - pandas, matplotlib, seaborn, numpy, plus scipy (optional for the KDE overlay; required for the Q–Q plot)

Assumptions:
    - Table HDB_SILVER exists and includes numeric (e.g., AGE, FLOOR_AREA_SQM, RESALE_PRICE) and categorical columns (e.g., TOWN, FLAT_TYPE)
    - Snowpark session is authenticated and operational

Outcomes:
    - Detailed descriptive stats and visual insights
    - Data quality and outlier analysis
    - Feature and modeling recommendations for resale price prediction

"""

In [None]:
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col, count, avg, min as min_, max as max_, stddev, median, percentile_cont, sum as sum_
from snowflake.snowpark.functions import when, lit, abs as abs_, sqrt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session, presumably to
# keep the printed EDA report free of library warning noise.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised during the analysis — consider narrowing it when debugging.
warnings.filterwarnings('ignore')

# Set plotting style for better visualizations
# These are process-wide defaults: they affect every figure created after
# this cell runs, not only the figures produced by main().
# NOTE(review): the 'seaborn-v0_8' style-sheet name presumably depends on the
# installed matplotlib version — TODO confirm it is available in the runtime.
plt.style.use('seaborn-v0_8')
# Use the "husl" palette as the default seaborn colour palette.
sns.set_palette("husl")
# Default figure size; assigned after the style sheet is applied so this
# value takes precedence over whatever figure size the style sheet sets.
plt.rcParams['figure.figsize'] = (12, 8)

def main(session: snowpark.Session):
    print("="*80)
    print("HDB RESALE PRICE - COMPREHENSIVE EDA ANALYSIS")
    print("Singapore HDB Data Analysis for Fair Resale Price Estimation")
    print("="*80)
    
    # Reference your HDB_SILVER table
    tableName = 'HDB_SILVER'
    hdb_silver_df = session.table(tableName)
    
    # Define column categories
    num_attribs = ["AGE", "FLOOR_AREA_SQM", "YEAR", "MONTH_NUM", "RESALE_PRICE"]
    cat_attribs = ["TOWN", "FLAT_TYPE", "STOREY_RANGE", "FLAT_MODEL"]
    
    print("\n" + "="*60)
    print("1. BASIC DATA OVERVIEW")
    print("="*60)
    
    # Get basic dataset info
    total_rows = hdb_silver_df.count()
    print("üìä Dataset Overview:")
    print(f"   Total Records: {total_rows:,}")
    print(f"   Total Features: {len(num_attribs + cat_attribs)}")
    print(f"   Numerical Features: {len(num_attribs)}")
    print(f"   Categorical Features: {len(cat_attribs)}")
    
    # Show sample data
    print("\nüìã Sample Data (First 5 rows):")
    hdb_silver_df.show(5)
    
    # Check for missing values
    print("\n‚ùå Missing Values Analysis:")
    for col_name in num_attribs + cat_attribs:
        null_count = hdb_silver_df.filter(col(col_name).is_null()).count()
        if null_count > 0:
            print(f"   {col_name}: {null_count:,} ({null_count/total_rows*100:.2f}%)")
        else:
            print(f"   {col_name}: 0 (0.00%)")
    
    print("\n" + "="*60)
    print("2. NUMERICAL VARIABLES ANALYSIS")
    print("="*60)
    
    # Comprehensive statistics for numerical variables
    print("\nüìà NUMERICAL STATISTICS:")
    print("-" * 70)
    print(f"{'Variable':<15} {'Count':<10} {'Mean':<12} {'Median':<10} {'Min':<10} {'Max':<12} {'StdDev':<10}")
    print("-" * 70)
    
    num_stats = {}
    for col_name in num_attribs:
        stats = hdb_silver_df.select(
            count(col(col_name)).alias("count"),
            avg(col(col_name)).alias("mean"),
            median(col(col_name)).alias("median"),
            min_(col(col_name)).alias("min"),
            max_(col(col_name)).alias("max"),
            stddev(col(col_name)).alias("stddev")
        ).collect()[0]
        
        num_stats[col_name] = {
            'count': stats['COUNT'],
            'mean': stats['MEAN'],
            'median': stats['MEDIAN'],
            'min': stats['MIN'],
            'max': stats['MAX'],
            'stddev': stats['STDDEV']
        }
        
        print(f"{col_name:<15} {stats['COUNT']:<10,} {stats['MEAN']:<12.2f} {stats['MEDIAN']:<10.2f} {stats['MIN']:<10.2f} {stats['MAX']:<12.2f} {stats['STDDEV']:<10.2f}")
    
    # Detailed RESALE_PRICE analysis
    print("\nüè† RESALE PRICE DETAILED ANALYSIS:")
    print("-" * 50)
    price_stats = num_stats['RESALE_PRICE']
    print(f"   Mean Price: SGD {price_stats['mean']:,.2f}")
    print(f"   Median Price: SGD {price_stats['median']:,.2f}")
    print(f"   Price Range: SGD {price_stats['min']:,.2f} - SGD {price_stats['max']:,.2f}")
    print(f"   Standard Deviation: SGD {price_stats['stddev']:,.2f}")
    
    # Price percentiles
    percentiles = hdb_silver_df.select(
        percentile_cont(0.25).within_group(col("RESALE_PRICE")).alias("p25"),
        percentile_cont(0.75).within_group(col("RESALE_PRICE")).alias("p75"),
        percentile_cont(0.90).within_group(col("RESALE_PRICE")).alias("p90"),
        percentile_cont(0.95).within_group(col("RESALE_PRICE")).alias("p95"),
        percentile_cont(0.99).within_group(col("RESALE_PRICE")).alias("p99")
    ).collect()[0]
    
    print(f"   25th Percentile: SGD {percentiles['P25']:,.2f}")
    print(f"   75th Percentile: SGD {percentiles['P75']:,.2f}")
    print(f"   90th Percentile: SGD {percentiles['P90']:,.2f}")
    print(f"   95th Percentile: SGD {percentiles['P95']:,.2f}")
    print(f"   99th Percentile: SGD {percentiles['P99']:,.2f}")
    
    print("\n" + "="*60)
    print("3. CATEGORICAL VARIABLES ANALYSIS")
    print("="*60)
    
    # Analyze categorical variables
    cat_stats = {}
    for col_name in cat_attribs:
        print(f"\nüìä {col_name.upper()} ANALYSIS:")
        print("-" * 40)
        
        # Get value counts and statistics
        value_counts = hdb_silver_df.group_by(col(col_name)).agg(
            count(lit(1)).alias("count"),
            avg(col("RESALE_PRICE")).alias("avg_price"),
            median(col("RESALE_PRICE")).alias("median_price")
        ).sort(col("count").desc())
        
        unique_count = hdb_silver_df.select(col(col_name)).distinct().count()
        print(f"   Unique Values: {unique_count}")
        print("   Top Categories by Frequency:")
        
        top_categories = value_counts.limit(10).collect()
        for i, row in enumerate(top_categories, 1):
            pct = (row['COUNT'] / total_rows) * 100
            print(f"      {i:2d}. {row[col_name]:<20} {row['COUNT']:>7,} ({pct:>5.1f}%) - Avg Price: SGD {row['AVG_PRICE']:>10,.0f}")
        
        cat_stats[col_name] = {
            'unique_count': unique_count,
            'top_categories': top_categories
        }
    
    print("\n" + "="*60)
    print("4. VISUALIZATIONS - NUMERICAL VARIABLES")

    print("="*60)
    print("4. VISUALIZATIONS - NUMERICAL VARIABLES")
    print("="*60)
    print("4. VISUALIZATIONS - NUMERICAL VARIABLES")
    print("="*60)
    
    # Convert to Pandas for visualizations (sample for performance if dataset is very large)
    print("Converting data to Pandas for visualizations...")
    
    total_rows = hdb_silver_df.count()
    sample_size = min(50000, total_rows)
    
    if total_rows > 50000:
        print(f"Using random sample of {sample_size:,} records for visualizations...")
        # Corrected line: Pass the fraction as a positional argument
        viz_df = hdb_silver_df.sample(sample_size / total_rows).to_pandas()
    else:
        viz_df = hdb_silver_df.to_pandas()

    print(f"Visualization dataset shape: {viz_df.shape}")
    
    # SCATTERPLOTS: All numerical variables vs RESALE_PRICE
    print("\nüìä Creating Scatterplots: Numerical Variables vs Resale Price...")
    
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))
    fig.suptitle('Scatterplots: Numerical Variables vs Resale Price', fontsize=16, fontweight='bold')
    
    numerical_features = [col for col in num_attribs if col != 'RESALE_PRICE']
    
    for i, feature in enumerate(numerical_features):
        row = i // 2
        col_idx = i % 2
        
        # Create scatterplot
        axes[row, col_idx].scatter(viz_df[feature], viz_df['RESALE_PRICE'], 
                                  alpha=0.6, s=20, color=sns.color_palette("husl")[i])
        axes[row, col_idx].set_xlabel(feature, fontsize=12, fontweight='bold')
        axes[row, col_idx].set_ylabel('Resale Price (SGD)', fontsize=12, fontweight='bold')
        axes[row, col_idx].set_title(f'{feature} vs Resale Price', fontsize=14, fontweight='bold')
        axes[row, col_idx].grid(True, alpha=0.3)
        axes[row, col_idx].ticklabel_format(style='plain', axis='y')
        
        # Add trend line
        try:
            z = np.polyfit(viz_df[feature], viz_df['RESALE_PRICE'], 1)
            p = np.poly1d(z)
            axes[row, col_idx].plot(viz_df[feature], p(viz_df[feature]), "r--", alpha=0.8, linewidth=2)
        except:
            pass
        
        # Add correlation coefficient
        corr = viz_df[feature].corr(viz_df['RESALE_PRICE'])
        axes[row, col_idx].text(0.05, 0.95, f'r = {corr:.3f}', transform=axes[row, col_idx].transAxes,
                               bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
                               fontsize=11, fontweight='bold')
    
    plt.tight_layout()
    plt.show()
    
    print("\n" + "="*60)
    print("5. VISUALIZATIONS - CATEGORICAL VARIABLES") 
    print("="*60)
    
    # BOX PLOTS: All categorical variables vs RESALE_PRICE
    print("\nüìä Creating Box Plots: Categorical Variables vs Resale Price...")
    
    fig, axes = plt.subplots(2, 2, figsize=(24, 18))
    fig.suptitle('Box Plots: Categorical Variables vs Resale Price', fontsize=16, fontweight='bold')
    
    for i, feature in enumerate(cat_attribs):
        row = i // 2
        col_idx = i % 2
        
        # Create box plot
        viz_df.boxplot(column='RESALE_PRICE', by=feature, ax=axes[row, col_idx])
        axes[row, col_idx].set_xlabel(feature, fontsize=12, fontweight='bold')
        axes[row, col_idx].set_ylabel('Resale Price (SGD)', fontsize=12, fontweight='bold')
        axes[row, col_idx].set_title(f'Resale Price Distribution by {feature}', fontsize=14, fontweight='bold')
        axes[row, col_idx].tick_params(axis='x', rotation=45, labelsize=10)
        axes[row, col_idx].ticklabel_format(style='plain', axis='y')
        axes[row, col_idx].grid(True, alpha=0.3)
        
        # Calculate and display statistics
        category_stats = viz_df.groupby(feature)['RESALE_PRICE'].agg(['mean', 'median', 'count'])
        print(f"\n{feature} - Price Statistics:")
        print(category_stats.sort_values('mean', ascending=False).head(10))
    
    plt.tight_layout()
    plt.show()
    
    # Additional detailed box plots for better readability
    print("\nüìä Creating Detailed Individual Box Plots...")
    
    for feature in cat_attribs:
        plt.figure(figsize=(16, 8))
        
        # Create enhanced box plot
        box_plot = viz_df.boxplot(column='RESALE_PRICE', by=feature, figsize=(16, 8))
        plt.suptitle(f'Resale Price Distribution by {feature}', fontsize=16, fontweight='bold')
        plt.xlabel(feature, fontsize=14, fontweight='bold')
        plt.ylabel('Resale Price (SGD)', fontsize=14, fontweight='bold')
        plt.xticks(rotation=45, ha='right')
        plt.ticklabel_format(style='plain', axis='y')
        plt.grid(True, alpha=0.3)
        
        # Add mean values as text
        category_means = viz_df.groupby(feature)['RESALE_PRICE'].mean()
        for i, (category, mean_price) in enumerate(category_means.items(), 1):
            plt.text(i, mean_price, f'${mean_price:,.0f}', ha='center', va='bottom', 
                    fontweight='bold', fontsize=9, 
                    bbox=dict(boxstyle="round,pad=0.2", facecolor="yellow", alpha=0.7))
        
        plt.tight_layout()
        plt.show()
    
    print("\n" + "="*60)
    print("6. CORRELATION ANALYSIS")
    print("="*60)
    
    # Correlation analysis
    print("\nüîó CORRELATION MATRIX AND ANALYSIS:")
    
    # Create correlation matrix
    correlation_df = viz_df[num_attribs]
    correlations = correlation_df.corr()
    
    # Plot correlation heatmap
    plt.figure(figsize=(12, 10))
    mask = np.triu(np.ones_like(correlations, dtype=bool))
    sns.heatmap(correlations, mask=mask, annot=True, cmap='coolwarm', center=0, 
                square=True, fmt='.3f', cbar_kws={'label': 'Correlation Coefficient'})
    plt.title('Correlation Matrix - Numerical Variables', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()
    
    print("\nüîó CORRELATION WITH RESALE_PRICE:")
    print("-" * 50)
    price_correlations = correlations['RESALE_PRICE'].sort_values(ascending=False)
    
    for var, corr in price_correlations.items():
        if var != 'RESALE_PRICE':
            if abs(corr) > 0.7:
                strength = "Very Strong"
            elif abs(corr) > 0.5:
                strength = "Strong"
            elif abs(corr) > 0.3:
                strength = "Moderate"
            else:
                strength = "Weak"
            print(f"   {var:<20}: {corr:>7.3f} ({strength})")
    
    # Create pairplot for numerical variables
    print("\nüìä Creating Pairplot for Numerical Variables...")
    if len(num_attribs) <= 5:  # Only create pairplot if manageable number of variables
        plt.figure(figsize=(16, 12))
        sns.pairplot(correlation_df, diag_kind='hist', plot_kws={'alpha': 0.6})
        plt.suptitle('Pairplot - Numerical Variables Relationships', fontsize=16, fontweight='bold', y=1.02)
        plt.tight_layout()
        plt.show()
    
    print("\n" + "="*60)
    print("7. ADVANCED VISUALIZATIONS")
    print("="*60)
    
    # Distribution plots for numerical variables
    print("\nüìä Creating Distribution Plots for Numerical Variables...")
    
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    fig.suptitle('Distribution of Numerical Variables', fontsize=16, fontweight='bold')
    
    for i, feature in enumerate(num_attribs):
        row = i // 3
        col_idx = i % 3
        
        # Histogram with KDE
        axes[row, col_idx].hist(viz_df[feature], bins=50, alpha=0.7, density=True, 
                               color=sns.color_palette("husl")[i], edgecolor='black')
        
        # Add KDE line
        try:
            from scipy import stats
            kde = stats.gaussian_kde(viz_df[feature].dropna())
            x_range = np.linspace(viz_df[feature].min(), viz_df[feature].max(), 100)
            axes[row, col_idx].plot(x_range, kde(x_range), 'r-', linewidth=2, label='KDE')
        except:
            pass
        
        axes[row, col_idx].set_title(f'Distribution of {feature}', fontweight='bold')
        axes[row, col_idx].set_xlabel(feature)
        axes[row, col_idx].set_ylabel('Density')
        axes[row, col_idx].grid(True, alpha=0.3)
        
        # Add statistics
        mean_val = viz_df[feature].mean()
        median_val = viz_df[feature].median()
        axes[row, col_idx].axvline(mean_val, color='red', linestyle='--', alpha=0.8, label=f'Mean: {mean_val:.1f}')
        axes[row, col_idx].axvline(median_val, color='green', linestyle='--', alpha=0.8, label=f'Median: {median_val:.1f}')
        axes[row, col_idx].legend(fontsize=9)
    
    # Remove empty subplot if exists
    if len(num_attribs) < 6:
        fig.delaxes(axes[1, 2])
    
    plt.tight_layout()
    plt.show()
    
    # Special analysis for RESALE_PRICE
    print("\nüìä Special Analysis for Resale Price Distribution...")
    
    fig, axes = plt.subplots(2, 2, figsize=(20, 12))
    fig.suptitle('Resale Price Analysis', fontsize=16, fontweight='bold')
    
    # Original distribution
    axes[0, 0].hist(viz_df['RESALE_PRICE'], bins=100, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 0].set_title('Resale Price Distribution', fontweight='bold')
    axes[0, 0].set_xlabel('Resale Price (SGD)')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].ticklabel_format(style='plain', axis='x')
    axes[0, 0].grid(True, alpha=0.3)
    
    # Log distribution
    axes[0, 1].hist(np.log(viz_df['RESALE_PRICE']), bins=100, alpha=0.7, color='lightgreen', edgecolor='black')
    axes[0, 1].set_title('Log(Resale Price) Distribution', fontweight='bold')
    axes[0, 1].set_xlabel('Log(Resale Price)')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].grid(True, alpha=0.3)
    
    # Box plot
    axes[1, 0].boxplot(viz_df['RESALE_PRICE'])
    axes[1, 0].set_title('Resale Price Box Plot', fontweight='bold')
    axes[1, 0].set_ylabel('Resale Price (SGD)')
    axes[1, 0].ticklabel_format(style='plain', axis='y')
    axes[1, 0].grid(True, alpha=0.3)
    
    # Q-Q plot for normality check
    from scipy import stats
    stats.probplot(viz_df['RESALE_PRICE'], dist="norm", plot=axes[1, 1])
    axes[1, 1].set_title('Q-Q Plot (Normality Check)', fontweight='bold')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Categorical variable distributions
    print("\nüìä Creating Distribution Plots for Categorical Variables...")
    
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))
    fig.suptitle('Distribution of Categorical Variables', fontsize=16, fontweight='bold')
    
    for i, feature in enumerate(cat_attribs):
        row = i // 2
        col_idx = i % 2
        
        # Count plot
        value_counts = viz_df[feature].value_counts()
        colors = sns.color_palette("husl", len(value_counts))
        
        bars = axes[row, col_idx].bar(range(len(value_counts)), value_counts.values, color=colors)
        axes[row, col_idx].set_title(f'Distribution of {feature}', fontweight='bold')
        axes[row, col_idx].set_xlabel(feature)
        axes[row, col_idx].set_ylabel('Count')
        axes[row, col_idx].set_xticks(range(len(value_counts)))
        axes[row, col_idx].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[row, col_idx].grid(True, alpha=0.3, axis='y')
        
        # Add count labels on bars
        for j, bar in enumerate(bars):
            height = bar.get_height()
            axes[row, col_idx].text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                                   f'{int(height):,}', ha='center', va='bottom', fontsize=9)
    
    plt.tight_layout()
    plt.show()
    
    print("\n" + "="*60)
    print("8. PRICE ANALYSIS BY CATEGORIES")
    print("="*60)
    
    # Detailed price analysis by each categorical variable
    for col_name in cat_attribs:
        print(f"\nüí∞ RESALE PRICE BY {col_name.upper()}:")
        print("-" * 60)
        
        price_by_category = hdb_silver_df.group_by(col(col_name)).agg(
            count(lit(1)).alias("count"),
            avg(col("RESALE_PRICE")).alias("avg_price"),
            median(col("RESALE_PRICE")).alias("median_price"),
            min_(col("RESALE_PRICE")).alias("min_price"),
            max_(col("RESALE_PRICE")).alias("max_price")
        ).sort(col("avg_price").desc())
        
        print(f"{'Category':<20} {'Count':<8} {'Avg_Price':<12} {'Median_Price':<12} {'Min_Price':<10} {'Max_Price':<12}")
        print("-" * 80)
        
        results = price_by_category.limit(15).collect()
        for row in results:
            print(f"{str(row[col_name]):<20} {row['COUNT']:<8,} {row['AVG_PRICE']:<12,.0f} {row['MEDIAN_PRICE']:<12,.0f} {row['MIN_PRICE']:<10,.0f} {row['MAX_PRICE']:<12,.0f}")
    
    print("\n" + "="*60)
    print("9. OUTLIER DETECTION AND ANALYSIS")
    print("="*60)
    
    # Outlier detection using IQR method
    print("\nüéØ OUTLIER ANALYSIS (IQR Method):")
    print("-" * 70)
    print(f"{'Variable':<20} {'Q1':<12} {'Q3':<12} {'IQR':<10} {'Lower_Bound':<12} {'Upper_Bound':<12} {'Outliers':<10}")
    print("-" * 70)
    
    outlier_summary = {}
    
    for col_name in num_attribs:
        # Calculate quartiles using Snowpark
        quartiles = hdb_silver_df.select(
            percentile_cont(0.25).within_group(col(col_name)).alias("q1"),
            percentile_cont(0.75).within_group(col(col_name)).alias("q3")
        ).collect()[0]
        
        q1 = float(quartiles['Q1'])
        q3 = float(quartiles['Q3'])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        
        # Count outliers
        outlier_count = hdb_silver_df.filter(
            (col(col_name) < lower_bound) | (col(col_name) > upper_bound)
        ).count()
        
        outlier_pct = (outlier_count / total_rows) * 100
        
        print(f"{col_name:<20} {q1:<12.2f} {q3:<12.2f} {iqr:<10.2f} {lower_bound:<12.2f} {upper_bound:<12.2f} {outlier_count:<10,} ({outlier_pct:.1f}%)")
        
        outlier_summary[col_name] = {
            'q1': q1, 'q3': q3, 'iqr': iqr,
            'lower_bound': lower_bound, 'upper_bound': upper_bound,
            'outlier_count': outlier_count, 'outlier_percentage': outlier_pct
        }
    
    # Special analysis for RESALE_PRICE outliers
    print("\nüè† RESALE PRICE OUTLIER DETAILS:")
    print("-" * 50)
    price_outliers = outlier_summary['RESALE_PRICE']
    
    # Get extreme outliers
    extreme_outliers = hdb_silver_df.filter(
        (col("RESALE_PRICE") < price_outliers['lower_bound']) | 
        (col("RESALE_PRICE") > price_outliers['upper_bound'])
    ).select(
        min_(col("RESALE_PRICE")).alias("min_outlier"),
        max_(col("RESALE_PRICE")).alias("max_outlier")
    ).collect()[0]
    
    print(f"   Normal Price Range: SGD {price_outliers['lower_bound']:,.0f} - SGD {price_outliers['upper_bound']:,.0f}")
    print(f"   Total Outliers: {price_outliers['outlier_count']:,} ({price_outliers['outlier_percentage']:.1f}%)")
    
    if extreme_outliers['MIN_OUTLIER'] is not None:
        print(f"   Extreme Low Price: SGD {extreme_outliers['MIN_OUTLIER']:,.0f}")
        print(f"   Extreme High Price: SGD {extreme_outliers['MAX_OUTLIER']:,.0f}")
    
    # Analyze outliers by categories
    print("\nüìä HIGH-PRICE OUTLIERS BY CATEGORY:")
    print("-" * 50)
    
    high_price_threshold = price_outliers['upper_bound']
    for col_name in cat_attribs:
        high_price_outliers = hdb_silver_df.filter(
            col("RESALE_PRICE") > high_price_threshold
        ).group_by(col(col_name)).agg(
            count(lit(1)).alias("outlier_count"),
            avg(col("RESALE_PRICE")).alias("avg_outlier_price")
        ).sort(col("outlier_count").desc())
        
        print(f"\n   Top {col_name} with High-Price Outliers:")
        results = high_price_outliers.limit(5).collect()
        for row in results:
            print(f"      {row[col_name]}: {row['OUTLIER_COUNT']} outliers (Avg: SGD {row['AVG_OUTLIER_PRICE']:,.0f})")
    
    print("\nüìä Creating Outlier Visualizations...")
    
    # Box plots for outlier detection
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    fig.suptitle('Outlier Detection - Box Plots for Numerical Variables', fontsize=16, fontweight='bold')
    
    for i, feature in enumerate(num_attribs):
        row = i // 3
        col_idx = i % 3
        
        # Create box plot
        box_plot = axes[row, col_idx].boxplot(viz_df[feature], patch_artist=True)
        
        # Customize box plot
        box_plot['boxes'][0].set_facecolor(sns.color_palette("husl")[i])
        box_plot['boxes'][0].set_alpha(0.7)
        
        axes[row, col_idx].set_title(f'Outliers in {feature}', fontweight='bold')
        axes[row, col_idx].set_ylabel(feature)
        axes[row, col_idx].grid(True, alpha=0.3)
        
        # Add outlier statistics
        Q1 = viz_df[feature].quantile(0.25)
        Q3 = viz_df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = viz_df[(viz_df[feature] < lower_bound) | (viz_df[feature] > upper_bound)]
        
        axes[row, col_idx].text(0.05, 0.95, f'Outliers: {len(outliers)}', 
                               transform=axes[row, col_idx].transAxes,
                               bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
                               fontsize=10, fontweight='bold', verticalalignment='top')
    
    # Remove empty subplot if exists
    if len(num_attribs) < 6:
        fig.delaxes(axes[1, 2])
    
    plt.tight_layout()
    plt.show()
    
    # Outlier scatter plots
    print("\nüìä Creating Outlier Scatter Plots...")
    
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))
    fig.suptitle('Outlier Analysis: Feature Values vs Resale Price', fontsize=16, fontweight='bold')
    
    numerical_features = [col for col in num_attribs if col != 'RESALE_PRICE']
    
    for i, feature in enumerate(numerical_features):
        row = i // 2
        col_idx = i % 2
        
        # Calculate outliers for this feature
        Q1 = viz_df[feature].quantile(0.25)
        Q3 = viz_df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Separate normal and outlier data
        normal_data = viz_df[(viz_df[feature] >= lower_bound) & (viz_df[feature] <= upper_bound)]
        outlier_data = viz_df[(viz_df[feature] < lower_bound) | (viz_df[feature] > upper_bound)]
        
        # Plot normal data
        axes[row, col_idx].scatter(normal_data[feature], normal_data['RESALE_PRICE'], 
                                  alpha=0.6, s=20, color='blue', label=f'Normal ({len(normal_data)})')
        
        # Plot outliers
        if len(outlier_data) > 0:
            axes[row, col_idx].scatter(outlier_data[feature], outlier_data['RESALE_PRICE'], 
                                      alpha=0.8, s=30, color='red', label=f'Outliers ({len(outlier_data)})')
        
        axes[row, col_idx].set_xlabel(feature, fontsize=12, fontweight='bold')
        axes[row, col_idx].set_ylabel('Resale Price (SGD)', fontsize=12, fontweight='bold')
        axes[row, col_idx].set_title(f'Outlier Analysis: {feature}', fontsize=14, fontweight='bold')
        axes[row, col_idx].grid(True, alpha=0.3)
        axes[row, col_idx].legend()
        axes[row, col_idx].ticklabel_format(style='plain', axis='y')
    
    plt.tight_layout()
    plt.show()
    
    print("\n" + "="*60)
    print("10. DATA QUALITY ASSESSMENT")
    print("="*60)
    
    print("\n‚úÖ DATA QUALITY CHECKS:")
    print("-" * 40)
    
    # Check for reasonable value ranges
    quality_issues = []
    
    # Check AGE (should be 0-50 years for HDB)
    invalid_age = hdb_silver_df.filter((col("AGE") < 0) | (col("AGE") > 50)).count()
    if invalid_age > 0:
        quality_issues.append(f"AGE: {invalid_age} records with unrealistic ages")
    else:
        print("   ‚úÖ AGE: All values within reasonable range (0-50 years)")
    
    # Check MONTH_NUM (should be 1-12)
    invalid_month = hdb_silver_df.filter((col("MONTH_NUM") < 1) | (col("MONTH_NUM") > 12)).count()
    if invalid_month > 0:
        quality_issues.append(f"MONTH_NUM: {invalid_month} records with invalid months")
    else:
        print("   ‚úÖ MONTH_NUM: All values within valid range (1-12)")
    
    # Check FLOOR_AREA_SQM (reasonable for HDB: 30-200 sqm)
    invalid_area = hdb_silver_df.filter((col("FLOOR_AREA_SQM") < 30) | (col("FLOOR_AREA_SQM") > 200)).count()
    if invalid_area > 0:
        quality_issues.append(f"FLOOR_AREA_SQM: {invalid_area} records with unusual areas")
    else:
        print("   ‚úÖ FLOOR_AREA_SQM: All values within reasonable range")
    
    # Check RESALE_PRICE (reasonable for HDB: 100k-2M SGD)
    invalid_price = hdb_silver_df.filter((col("RESALE_PRICE") < 100000) | (col("RESALE_PRICE") > 2000000)).count()
    if invalid_price > 0:
        print(f"   ‚ö†Ô∏è  RESALE_PRICE: {invalid_price} records with extreme prices (review needed)")
    else:
        print("   ‚úÖ RESALE_PRICE: All values within reasonable range")
    
    if quality_issues:
        print("\n   ‚ö†Ô∏è  DATA QUALITY ISSUES FOUND:")
        for issue in quality_issues:
            print(f"      - {issue}")
    else:
        print("\n   ‚úÖ No major data quality issues detected!")
    
    print("\n" + "="*60)
    print("11. FEATURE ENGINEERING RECOMMENDATIONS")
    print("="*60)
    
    print("""
üîß SUGGESTED NEW FEATURES:

1. PRICE_PER_SQM:
   - Formula: RESALE_PRICE / FLOOR_AREA_SQM
   - Normalizes price by size for better comparison

2. AGE_GROUP:
   - 0-5 years: New
   - 6-15 years: Moderate  
   - 16-30 years: Old
   - 30+ years: Very Old

3. PRICE_TIER:
   - Based on percentiles of RESALE_PRICE
   - Budget: 0-25th percentile
   - Mid-range: 25-75th percentile  
   - Premium: 75-95th percentile
   - Luxury: 95th+ percentile

4. SEASON:
   - Q1: Jan-Mar, Q2: Apr-Jun, Q3: Jul-Sep, Q4: Oct-Dec

5. STOREY_NUMERIC:
   - Extract middle value from STOREY_RANGE
   - "10 TO 12" ‚Üí 11

6. IS_CORNER_UNIT:
   - Based on FLAT_MODEL patterns
   - Premium models often indicate corner units
""")
    
    print("\n" + "="*60)
    print("12. OUTLIER RECOMMENDATIONS")
    print("="*60)
    
    print("""
üéØ OUTLIER TREATMENT STRATEGY:

üìã KEEP OUTLIERS:
   ‚úÖ AGE: Represent genuine old/new buildings
   ‚úÖ YEAR: Historical data is valuable
   ‚úÖ MONTH_NUM: Seasonal patterns are real

‚ö†Ô∏è  INVESTIGATE OUTLIERS:
   üîç RESALE_PRICE: Manual review needed
      - High outliers: Luxury units or data errors?
      - Low outliers: Subsidized sales or mistakes?
   
   üîç FLOOR_AREA_SQM: Validate extreme sizes
      - Very large: Jumbo flats or measurement errors?
      - Very small: Studio units or data issues?

üí° TREATMENT OPTIONS:
   1. WINSORIZATION: Cap at 95th/99th percentiles
   2. LOG TRANSFORMATION: For right-skewed RESALE_PRICE
   3. ROBUST MODELS: Random Forest handles outliers well
   4. STRATIFICATION: Separate models by FLAT_TYPE
   5. FEATURE FLAGS: Create outlier indicators

üö´ DO NOT:
   ‚ùå Remove outliers without investigation
   ‚ùå Apply same treatment to all variables
   ‚ùå Ignore domain knowledge of Singapore HDB market
""")
    
    print("\n" + "="*60)
    print("13. MODELING RECOMMENDATIONS")
    print("="*60)
    
    print("""
ü§ñ ML PIPELINE SUGGESTIONS:

üìä FEATURE SELECTION:
   ‚Ä¢ High correlation with price: Check correlation results above
   ‚Ä¢ Categorical encoding needed for: TOWN, FLAT_TYPE, STOREY_RANGE, FLAT_MODEL
   ‚Ä¢ Consider interaction terms: FLAT_TYPE √ó TOWN

üèóÔ∏è  MODEL ARCHITECTURE:
   1. BASELINE: Linear Regression with engineered features
   2. ENSEMBLE: Random Forest (handles outliers well)
   3. BOOSTING: XGBoost/LightGBM for accuracy
   4. ADVANCED: Neural Networks for complex patterns

üîÑ VALIDATION STRATEGY:
   ‚Ä¢ Time-based split: Train on older years, test on recent
   ‚Ä¢ Stratified CV: Ensure balanced FLAT_TYPE distribution
   ‚Ä¢ Geographic CV: Test generalization across TOWNs

üìà PERFORMANCE METRICS:
   ‚Ä¢ Primary: RMSE, MAE (in SGD)
   ‚Ä¢ Secondary: MAPE (percentage error)  
   ‚Ä¢ Business: Accuracy within ¬±10% of true price
""")
    
    print("\n" + "="*60)
    print("14. SUMMARY STATISTICS")
    print("="*60)
    
    # Final summary
    print("\nüìä ANALYSIS SUMMARY:")
    print(f"   Dataset Size: {total_rows:,} records")
    print(f"   Price Range: SGD {price_stats['min']:,.0f} - SGD {price_stats['max']:,.0f}")
    print(f"   Average Price: SGD {price_stats['mean']:,.0f}")
    print(f"   Most Common Flat Type: {cat_stats['FLAT_TYPE']['top_categories'][0]['FLAT_TYPE']}")
    print(f"   Most Expensive Town (avg): {cat_stats['TOWN']['top_categories'][0]['TOWN']} (SGD {cat_stats['TOWN']['top_categories'][0]['AVG_PRICE']:,.0f})")
    print(f"   Total Outliers (Price): {price_outliers['outlier_count']:,} ({price_outliers['outlier_percentage']:.1f}%)")
    
    strongest_predictor = max([(var, abs(corr)) for var, corr in price_correlations.items() if var != 'RESALE_PRICE'], key=lambda x: x[1])
    print(f"   Strongest Predictor: {strongest_predictor[0]} (r = {price_correlations[strongest_predictor[0]]:.3f})")
    
    print("\nüéØ NEXT STEPS:")
    print("   1. Review high-price outliers manually")
    print("   2. Engineer new features (price per sqm, age groups)")
    print("   3. Implement robust preprocessing pipeline")
    print("   4. Start with Random Forest baseline model")
    print("   5. Validate on holdout test set")
    
    print("\n" + "="*60)
    print("‚úÖ EDA ANALYSIS COMPLETED SUCCESSFULLY!")
    print("="*60)
    
    return hdb_silver_df

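# Entry point: uncomment the call below to run the full EDA workflow
# (requires an active Snowpark `session` object in scope).
# main(session)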

In [None]:
"""
==================
       OUTPUT
==================
============================================================
1. BASIC DATA OVERVIEW
============================================================
📊 Dataset Overview:
   Total Records: 300,402
   Total Features: 9
   Numerical Features: 5
   Categorical Features: 4
📋 Sample Data (First 5 rows):
---------------------------------------------------------------------------------------------------------------------------
|"TOWN"  |"FLAT_TYPE"  |"STOREY_RANGE"  |"FLOOR_AREA_SQM"  |"FLAT_MODEL"  |"RESALE_PRICE"  |"AGE"  |"YEAR"  |"MONTH_NUM"  |
---------------------------------------------------------------------------------------------------------------------------
|YISHUN  |4 ROOM       |10 TO 12        |92                |MODEL A       |580000          |0      |2025    |4            |
|YISHUN  |4 ROOM       |13 TO 15        |92                |MODEL A       |618888          |0      |2025    |5            |
|YISHUN  |4 ROOM       |01 TO 03        |92                |MODEL A       |560000          |0      |2025    |5            |
|YISHUN  |4 ROOM       |13 TO 15        |92                |MODEL A       |610000          |0      |2025    |5            |
|YISHUN  |4 ROOM       |07 TO 09        |92                |MODEL A       |580000          |0      |2025    |6            |
---------------------------------------------------------------------------------------------------------------------------

🏠 RESALE PRICE DETAILED ANALYSIS:
--------------------------------------------------
   Mean Price: SGD 500,265.17
   Median Price: SGD 465,000.00
   Price Range: SGD 140,000.00 - SGD 1,658,888.00
   Standard Deviation: SGD 172,934.20
   25th Percentile: SGD 375,000.00
   75th Percentile: SGD 595,000.00
   90th Percentile: SGD 738,000.00
   95th Percentile: SGD 835,000.00
   99th Percentile: SGD 1,010,099.99
============================================================
3. CATEGORICAL VARIABLES ANALYSIS
============================================================

       3. NEW GENERATION        41,607 ( 13.9%) - Avg Price: SGD    377,769
       4. PREMIUM APARTMENT     31,495 ( 10.5%) - Avg Price: SGD    541,256
       5. SIMPLIFIED            12,960 (  4.3%) - Avg Price: SGD    390,292
       6. APARTMENT             11,323 (  3.8%) - Avg Price: SGD    681,360
       7. STANDARD               9,036 (  3.0%) - Avg Price: SGD    430,542
       8. MAISONETTE             8,402 (  2.8%) - Avg Price: SGD    743,656
       9. MODEL A2               3,787 (  1.3%) - Avg Price: SGD    406,018
      10. DBSS                   3,720 (  1.2%) - Avg Price: SGD    788,096
============================================================
4. VISUALIZATIONS - NUMERICAL VARIABLES
============================================================
4. VISUALIZATIONS - NUMERICAL VARIABLES
============================================================
4. VISUALIZATIONS - NUMERICAL VARIABLES
============================================================
Converting data to Pandas for visualizations...
Using random sample of 50,000 records for visualizations...
Visualization dataset shape: (49942, 9)


📊 Creating Scatterplots: Numerical Variables vs Resale Price...
============================================================
5. VISUALIZATIONS - CATEGORICAL VARIABLES
============================================================
📊 Creating Box Plots: Categorical Variables vs Resale Price...
TOWN - Price Statistics:
                          mean    median  count
TOWN                                           
BUKIT TIMAH      745345.252174  725000.0    115
CENTRAL AREA     668468.939394  541000.0    429
BISHAN           656751.772422  632000.0    892
BUKIT MERAH      607408.995300  625000.0   1915
QUEENSTOWN       591144.016511  555000.0   1393
KALLANG/WHAMPOA  557694.356146  520000.0   1505
MARINE PARADE    552254.417867  492000.0    347
PASIR RIS        549277.353261  525000.0   1472
SERANGOON        531058.084299  500000.0    949
PUNGGOL          528210.485990  512000.0   3105

FLAT_TYPE - Price Statistics:
                           mean    median  count
FLAT_TYPE                                       
MULTI-GENERATION  885882.352941  865000.0     17
EXECUTIVE         698637.433197  668000.0   3675
5 ROOM            594006.143968  560000.0  12218
4 ROOM            500523.782857  465000.0  20673
3 ROOM            360805.989971  343000.0  12464
2 ROOM            289969.299656  280000.0    871
1 ROOM            227729.166667  230000.0     24

STOREY_RANGE - Price Statistics:
                      mean     median  count
STOREY_RANGE                                
49 TO 51      1.344667e+06  1408000.0      3
46 TO 48      1.080086e+06  1028000.0      9
43 TO 45      1.068228e+06  1060000.0     17
40 TO 42      9.416944e+05   905000.0     38
37 TO 39      9.092503e+05   886500.0     96
31 TO 33      8.769729e+05   871000.0    135
34 TO 36      8.734519e+05   865000.0    102
28 TO 30      8.107704e+05   810000.0    227
25 TO 27      7.447669e+05   753500.0    364
26 TO 30      7.106250e+05   717500.0      8

FLAT_MODEL - Price Statistics:
                                mean     median  count
FLAT_MODEL                                            
TYPE S2                 1.141250e+06  1120000.0     46
TYPE S1                 1.004192e+06   945000.0     85
PREMIUM APARTMENT LOFT  9.859302e+05   915000.0     22
TERRACE                 9.049110e+05   870000.0     25
MULTI GENERATION        8.858824e+05   865000.0     17
IMPROVED-MAISONETTE     8.193333e+05   808000.0      3
DBSS                    7.796542e+05   763000.0    615
PREMIUM MAISONETTE      7.640000e+05   756500.0      4
MAISONETTE              7.444269e+05   715000.0   1357
ADJOINED FLAT           7.329775e+05   718000.0    109

📊 Creating Detailed Individual Box Plots...

============================================================
6. CORRELATION ANALYSIS
============================================================
🔗 CORRELATION MATRIX AND ANALYSIS:
🔗 CORRELATION WITH RESALE_PRICE:
--------------------------------------------------
   FLOOR_AREA_SQM      :   0.588 (Strong)
   YEAR                :   0.319 (Moderate)
   MONTH_NUM           :  -0.016 (Weak)
   AGE                 :  -0.215 (Weak)
📊 Creating Pairplot for Numerical Variables...

============================================================
7. ADVANCED VISUALIZATIONS
============================================================
📊 Creating Distribution Plots for Numerical Variables...
📊 Special Analysis for Resale Price Distribution...
📊 Creating Distribution Plots for Categorical Variables...

============================================================
8. PRICE ANALYSIS BY CATEGORIES
============================================================
BUKIT TIMAH          709      746,487      725,000      300,000    1,501,555   
BISHAN               5,362    669,836      648,000      210,000    1,588,000   
CENTRAL AREA         2,401    655,196      530,000      200,000    1,600,000   
BUKIT MERAH          11,673   607,194      618,000      157,000    1,588,000   
QUEENSTOWN           8,233    597,887      590,000      182,000    1,658,888   
KALLANG/WHAMPOA      9,199    556,170      516,000      185,000    1,568,000   
PASIR RIS            9,068    553,339      530,000      180,000    1,238,000   
MARINE PARADE        1,924    553,239      488,444      180,000    1,380,000   
SERANGOON            5,769    533,719      499,888      170,000    1,268,000   
TOA PAYOH            9,615    530,958      430,000      140,000    1,600,000   
TAMPINES             20,777   528,642      502,000      150,000    1,208,000   
PUNGGOL              18,228   528,587      510,500      225,000    1,230,000   
CLEMENTI             6,940    519,034      450,000      205,000    1,458,888   
SENGKANG             23,012   510,752      495,000      196,000    1,058,000   
HOUGANG              15,292   493,283      460,000      195,000    1,280,000   

💰 RESALE PRICE BY FLAT_TYPE:
------------------------------------------------------------
Category             Count    Avg_Price    Median_Price Min_Price  Max_Price   
--------------------------------------------------------------------------------
MULTI-GENERATION     110      836,097      819,000      600,000    1,388,888   
EXECUTIVE            22,506   697,329      670,000      390,000    1,588,000   
5 ROOM               72,699   595,881      560,000      270,000    1,658,888   
4 ROOM               124,390  502,740      468,000      218,000    1,518,000   
3 ROOM               75,395   359,577      342,000      140,000    1,568,000   
2 ROOM               5,170    290,118      280,000      150,000    585,000     
1 ROOM               132      216,304      219,000      157,000    300,000    


💰 RESALE PRICE BY STOREY_RANGE:
------------------------------------------------------------
Category             Count    Avg_Price    Median_Price Min_Price  Max_Price   
--------------------------------------------------------------------------------
49 TO 51             21       1,199,843    1,200,000    910,000    1,600,000   
46 TO 48             53       1,107,658    1,050,000    697,000    1,588,000   
43 TO 45             74       1,079,269    1,064,000    670,000    1,580,000   
40 TO 42             261      954,512      928,000      340,000    1,580,000   
37 TO 39             553      890,730      885,000      360,000    1,568,380   
34 TO 36             661      884,005      878,000      355,000    1,588,000   
31 TO 33             704      875,071      860,000      325,000    1,558,000   
28 TO 30             1,399    816,225      818,000      150,000    1,538,000   
25 TO 27             2,181    738,365      740,000      230,000    1,528,000   
36 TO 40             7        717,841      730,888      648,000    743,000     
31 TO 35             2        706,500      706,500      705,000    708,000     
22 TO 24             3,617    686,362      685,000      205,000    1,658,888   
26 TO 30             39       684,725      718,000      500,000    860,000     
19 TO 21             5,247    651,869      644,000      180,000    1,600,000   
21 TO 25             91       632,311      633,000      300,000    895,000     

💰 RESALE PRICE BY FLAT_MODEL:
------------------------------------------------------------
Category             Count    Avg_Price    Median_Price Min_Price  Max_Price   
--------------------------------------------------------------------------------
TYPE S2              236      1,118,484    1,075,000    820,000    1,600,000   
TYPE S1              487      1,005,930    950,000      650,000    1,518,000   
PREMIUM APARTMENT LOFT 120      983,846      930,000      722,800    1,658,888   
TERRACE              170      865,596      844,500      635,000    1,568,000   
MULTI GENERATION     110      836,097      819,000      600,000    1,388,888   
PREMIUM MAISONETTE   24       820,167      785,500      618,000    1,208,000   
DBSS                 3,720    788,096      770,000      300,000    1,600,000   
MODEL A-MAISONETTE   520      758,806      750,000      425,000    1,305,000   
3GEN                 64       750,793      745,000      638,000    925,000     
ADJOINED FLAT        501      748,923      738,388      375,000    1,500,000   
MAISONETTE           8,402    743,656      715,000      448,000    1,588,000   
IMPROVED-MAISONETTE  40       733,539      700,000      627,000    1,060,000   
APARTMENT            11,323   681,360      655,000      390,000    1,448,000   
PREMIUM APARTMENT    31,495   541,256      520,000      200,000    1,485,000   
IMPROVED             75,028   509,729      488,000      140,000    1,588,000   

============================================================
9. OUTLIER DETECTION AND ANALYSIS
============================================================
🎯 OUTLIER ANALYSIS (IQR Method):
   Normal Price Range: SGD 45,000 - SGD 925,000
   Total Outliers: 7,193 (2.4%)
   Extreme Low Price: SGD 925,233
   Extreme High Price: SGD 1,658,888
📊 HIGH-PRICE OUTLIERS BY CATEGORY:
--------------------------------------------------
   Top TOWN with High-Price Outliers:
      BUKIT MERAH: 939 outliers (Avg: SGD 1,031,887)
      QUEENSTOWN: 851 outliers (Avg: SGD 1,031,498)
      TOA PAYOH: 839 outliers (Avg: SGD 1,064,343)
      KALLANG/WHAMPOA: 685 outliers (Avg: SGD 1,039,869)
      BISHAN: 627 outliers (Avg: SGD 1,058,400)
  
   Top FLAT_TYPE with High-Price Outliers:
      5 ROOM: 2935 outliers (Avg: SGD 1,056,549)
      4 ROOM: 2363 outliers (Avg: SGD 1,019,954)
      EXECUTIVE: 1830 outliers (Avg: SGD 1,023,660)
      3 ROOM: 40 outliers (Avg: SGD 1,061,906)
      MULTI-GENERATION: 25 outliers (Avg: SGD 1,022,891)

   Top STOREY_RANGE with High-Price Outliers:
      10 TO 12: 976 outliers (Avg: SGD 1,027,319)
      07 TO 09: 901 outliers (Avg: SGD 1,014,932)
      04 TO 06: 872 outliers (Avg: SGD 1,014,600)
      13 TO 15: 623 outliers (Avg: SGD 1,020,319)
      16 TO 18: 590 outliers (Avg: SGD 1,028,182)     

   Top FLAT_MODEL with High-Price Outliers:
      MODEL A: 1830 outliers (Avg: SGD 1,006,476)
      IMPROVED: 1487 outliers (Avg: SGD 1,046,308)
      MAISONETTE: 1078 outliers (Avg: SGD 1,028,537)
      DBSS: 771 outliers (Avg: SGD 1,076,970)
      APARTMENT: 693 outliers (Avg: SGD 1,014,886)
📊 Creating Outlier Visualizations...
📊 Creating Outlier Scatter Plots...

============================================================
10. DATA QUALITY ASSESSMENT
============================================================
✅ DATA QUALITY CHECKS:
----------------------------------------
   ✅ MONTH_NUM: All values within valid range (1-12)
   ✅ RESALE_PRICE: All values within reasonable range
   ⚠️  DATA QUALITY ISSUES FOUND:
      - AGE: 3335 records with unrealistic ages
      - FLOOR_AREA_SQM: 18 records with unusual areas

============================================================
11. FEATURE ENGINEERING RECOMMENDATIONS
============================================================
🔧 SUGGESTED NEW FEATURES:
1. PRICE_PER_SQM:
   - Formula: RESALE_PRICE / FLOOR_AREA_SQM
   - Normalizes price by size for better comparison
2. AGE_GROUP:
   - 0-5 years: New
   - 6-15 years: Moderate  
   - 16-30 years: Old
   - 30+ years: Very Old
3. PRICE_TIER:
   - Based on percentiles of RESALE_PRICE
   - Budget: 0-25th percentile
   - Mid-range: 25-75th percentile  
   - Premium: 75-95th percentile
   - Luxury: 95th+ percentile
4. SEASON:
   - Q1: Jan-Mar, Q2: Apr-Jun, Q3: Jul-Sep, Q4: Oct-Dec
5. STOREY_NUMERIC:
   - Extract middle value from STOREY_RANGE
   - "10 TO 12" → 11
6. IS_CORNER_UNIT:
   - Based on FLAT_MODEL patterns
   - Premium models often indicate corner units

============================================================
12. OUTLIER RECOMMENDATIONS
============================================================


📋 KEEP OUTLIERS:
   ✅ AGE: Represent genuine old/new buildings
   ✅ YEAR: Historical data is valuable
   ✅ MONTH_NUM: Seasonal patterns are real
⚠️  INVESTIGATE OUTLIERS:
   🔍 RESALE_PRICE: Manual review needed
      - High outliers: Luxury units or data errors?
      - Low outliers: Subsidized sales or mistakes?
   🔍 FLOOR_AREA_SQM: Validate extreme sizes
      - Very large: Jumbo flats or measurement errors?
      - Very small: Studio units or data issues?
💡 TREATMENT OPTIONS:
   1. WINSORIZATION: Cap at 95th/99th percentiles
   2. LOG TRANSFORMATION: For right-skewed RESALE_PRICE
   3. ROBUST MODELS: Random Forest handles outliers well
   4. STRATIFICATION: Separate models by FLAT_TYPE
   5. FEATURE FLAGS: Create outlier indicators
🚫 DO NOT:
   ❌ Remove outliers without investigation
   ❌ Apply same treatment to all variables
   ❌ Ignore domain knowledge of Singapore HDB market


============================================================
13. MODELING RECOMMENDATIONS
============================================================
🤖 ML PIPELINE SUGGESTIONS:
📊 FEATURE SELECTION:
   • High correlation with price: Check correlation results above
   • Categorical encoding needed for: TOWN, FLAT_TYPE, STOREY_RANGE, FLAT_MODEL
   • Consider interaction terms: FLAT_TYPE × TOWN
🏗️  MODEL ARCHITECTURE:
   1. BASELINE: Linear Regression with engineered features
   2. ENSEMBLE: Random Forest (handles outliers well)
   3. BOOSTING: XGBoost/LightGBM for accuracy
   4. ADVANCED: Neural Networks for complex patterns
🔄 VALIDATION STRATEGY:
   • Time-based split: Train on older years, test on recent
   • Stratified CV: Ensure balanced FLAT_TYPE distribution
   • Geographic CV: Test generalization across TOWNs
📈 PERFORMANCE METRICS:
   • Primary: RMSE, MAE (in SGD)
   • Secondary: MAPE (percentage error)  
   • Business: Accuracy within ±10% of true price


============================================================
14. SUMMARY STATISTICS
============================================================
📊 ANALYSIS SUMMARY:
   Dataset Size: 300,402 records
   Price Range: SGD 140,000 - SGD 1,658,888
   Average Price: SGD 500,265
   Most Common Flat Type: 4 ROOM
   Most Expensive Town (avg): SENGKANG (SGD 510,752)
   Total Outliers (Price): 7,193 (2.4%)
   Strongest Predictor: FLOOR_AREA_SQM (r = 0.588)
🎯 NEXT STEPS:
   1. Review high-price outliers manually
   2. Engineer new features (price per sqm, age groups)
   3. Implement robust preprocessing pipeline
   4. Start with Random Forest baseline model
   5. Validate on holdout test set
============================================================
✅ EDA ANALYSIS COMPLETED SUCCESSFULLY!
============================================================

"""
