In [None]:
"""
================================================================================
Main: HDB Resale Price – Data Transformation & Feature Engineering Pipeline
================================================================================

This script executes a complete data transformation workflow on the HDB_SILVER
table using Snowpark and pandas. The transformed and enriched dataset is saved
to a new Snowflake table called HDB_GOLD for further analytics or modeling.

--------------------------------------------------------------------------------
Step-by-Step Workflow
--------------------------------------------------------------------------------
| Step | Title                               | Description                                                                                          |
|------|-------------------------------------|------------------------------------------------------------------------------------------------------|
| 1    | Load Source Data                    | Loads HDB_SILVER table via Snowpark into a pandas DataFrame                                          |
| 2    | Data Cleaning after EDA             | Removes rows with FLAT_TYPE in ['MULTI GENERATION', '1 ROOM', '2 ROOM'] and drops FLAT_MODEL column  |
| 3    | Outlier Handling (Capping)          | Caps RESALE_PRICE values at 0.5th and 99.5th percentiles                                             |
| 4    | Feature Engineering                 | Creates new derived features:                                                                        |
|      |                                     |   - AGE_GROUP: Bucketized age bins                                                                   |
|      |                                     |   - PRICE_TIER: Quantile-based price segmentation                                                    |
|      |                                     |   - SEASON: Quarter (Q1–Q4) from MONTH_NUM                                                           |
|      |                                     |   - STOREY_NUMERIC: Midpoint of STOREY_RANGE                                                         |
|      |                                     |   - PRICE_PER_SQM: Price per square meter                                                            |
| 5    | One-Hot Encoding                    | Converts categorical variables into binary indicators (currently disabled in main)                   |
| 6    | Save to HDB_GOLD                    | Writes transformed dataset to a new Snowflake table (HDB_GOLD)                                       |
| 7    | Final Preview                       | Displays sample rows and schema from the saved table                                                 |
| 8    | Export Instructions                 | Provides COPY INTO SQL command to download HDB_GOLD as CSV from Snowflake                            |
-----------------------------------------------------------------------------------------------------------------------------------------------------

Inputs:
    - session: snowflake.snowpark.Session (active session connected to Snowflake)

Key Features Engineered:
    - AGE_GROUP (bucketed age bands)
    - PRICE_TIER (quantile-based price segments)
    - SEASON (calendar quarter)
    - STOREY_NUMERIC (numerical representation of storey range)
    - PRICE_PER_SQM (price per square meter)

Target Output:
    - Table: HDB_GOLD (fully transformed and enriched version of HDB_SILVER)

Assumptions:
    - HDB_SILVER table exists and contains necessary columns:
      RESALE_PRICE, AGE, FLOOR_AREA_SQM, STOREY_RANGE, MONTH_NUM, etc.
    - Snowpark session is authenticated and operational

Outcome:
    - A modeling-ready dataset with clean pricing data, rich features, and
      encoded categorical variables stored in a new Snowflake table.

"""

In [None]:
import pandas as pd
from snowflake.snowpark import Session


def _quarter_label(month):
    """Return the calendar-quarter label ('Q1'..'Q4') for a month number.

    Returns None for any value outside the four closed month ranges
    (including NaN), so the resulting SEASON value is missing for that row.
    """
    if 1 <= month <= 3:
        return 'Q1'
    elif 4 <= month <= 6:
        return 'Q2'
    elif 7 <= month <= 9:
        return 'Q3'
    elif 10 <= month <= 12:
        return 'Q4'
    return None


def _storey_midpoint(storey_range):
    """Return the numeric midpoint of a storey range string.

    Example: "10 TO 12" -> 11.0

    Returns None when the value is not a string (e.g. None/NaN) or does not
    parse as "<int> TO <int>", instead of aborting the whole pipeline.
    """
    try:
        lower, upper = storey_range.split(' TO ')[:2]
        return (int(lower) + int(upper)) / 2
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input; ValueError: too few
        # parts to unpack or a part that is not an integer.
        return None


def _clean(df):
    """Drop unwanted FLAT_TYPE rows and the FLAT_MODEL column; return a new frame."""
    # 2.1. Remove rows with specified FLAT_TYPEs
    flat_types_to_remove = ['MULTI GENERATION', '1 ROOM', '2 ROOM']
    # Compare on a normalised spelling so that variants such as
    # 'MULTI-GENERATION' (hyphenated) are removed as well.
    normalized_flat_type = (
        df['FLAT_TYPE'].str.upper().str.replace('-', ' ', regex=False).str.strip()
    )
    initial_row_count = len(df)
    # .copy() so later column assignments never write into a slice of the
    # original frame (avoids SettingWithCopy problems).
    df = df[~normalized_flat_type.isin(flat_types_to_remove)].copy()
    rows_removed_ft = initial_row_count - len(df)
    print(f"Removed {rows_removed_ft} rows with FLAT_TYPE in {flat_types_to_remove}.")

    # 2.2. Remove the FLAT_MODEL column
    if 'FLAT_MODEL' in df.columns:
        df = df.drop(columns=['FLAT_MODEL'])
        print("Dropped 'FLAT_MODEL' column.")
    else:
        print("Warning: 'FLAT_MODEL' column not found. Skipping column removal.")

    return df


def _cap_resale_price(df):
    """Flag and cap RESALE_PRICE at its 0.5th and 99.5th percentiles."""
    lower_q = df['RESALE_PRICE'].quantile(0.005)
    upper_q = df['RESALE_PRICE'].quantile(0.995)

    # Flag outliers BEFORE clipping; afterwards no value lies outside the bounds.
    outside_bounds = (df['RESALE_PRICE'] < lower_q) | (df['RESALE_PRICE'] > upper_q)
    df['is_outlier'] = outside_bounds.astype(int)

    # Cap resale prices at thresholds and overwrite the original column
    df['RESALE_PRICE'] = df['RESALE_PRICE'].clip(lower=lower_q, upper=upper_q)
    return df


def _engineer_features(df):
    """Add AGE_GROUP, PRICE_TIER, SEASON, STOREY_NUMERIC and PRICE_PER_SQM."""
    # 4.1. 'AGE_GROUP': right=False gives the bands [0, 5), [5, 15), [15, 30), [30, inf)
    age_bins = [0, 5, 15, 30, float('inf')]
    age_labels = ['New', 'Moderate', 'Old', 'Very Old']
    df['AGE_GROUP'] = pd.cut(df['AGE'], bins=age_bins, labels=age_labels, right=False)

    # 4.2. 'PRICE_TIER': edges are quantiles of the (already capped) resale price;
    # include_lowest=True keeps the minimum price inside the first tier.
    price_edges = df['RESALE_PRICE'].quantile([0, 0.25, 0.75, 0.95, 1.0]).to_numpy()
    price_labels = ['Budget', 'Mid-range', 'Premium', 'Luxury']
    df['PRICE_TIER'] = pd.cut(
        df['RESALE_PRICE'], bins=price_edges, labels=price_labels, include_lowest=True
    )

    # 4.3. 'SEASON' from 'MONTH_NUM'
    if 'MONTH_NUM' in df.columns:
        df['SEASON'] = df['MONTH_NUM'].apply(_quarter_label)
        print("'SEASON' feature created successfully from 'MONTH_NUM'.")
    else:
        print("Warning: 'MONTH_NUM' column not found. Skipping 'SEASON' feature creation.")

    # 4.4. 'STOREY_NUMERIC': midpoint of 'STOREY_RANGE'
    df['STOREY_NUMERIC'] = df['STOREY_RANGE'].apply(_storey_midpoint)

    # 4.5. 'PRICE_PER_SQM': a zero floor area yields a missing value rather
    # than an infinite price per square metre.
    floor_area = df['FLOOR_AREA_SQM']
    df['PRICE_PER_SQM'] = df['RESALE_PRICE'] / floor_area.where(floor_area != 0)

    return df


def main(session: Session):
    """
    Orchestrate the HDB_SILVER -> HDB_GOLD transformation pipeline.

    Steps: load HDB_SILVER into pandas, drop unwanted FLAT_TYPE rows and the
    FLAT_MODEL column, cap RESALE_PRICE outliers, derive features, then
    overwrite the HDB_GOLD table with the result.

    Args:
        session: Active Snowpark session, passed in by the caller (as is
            standard for Snowflake Python Worksheets).

    Returns:
        The Snowpark DataFrame for HDB_GOLD on success. If writing the table
        fails, the error is printed and the result of
        ``session.create_dataframe(pd.DataFrame())`` is returned instead.
    """

    # --- 1. Load Data from HDB_SILVER table ---
    print("Loading data from 'HDB_SILVER' table...")
    # Convert Snowpark DataFrame to Pandas DataFrame for easier transformations
    pandas_df = session.table("HDB_SILVER").to_pandas()
    print(f"Data loaded successfully. Shape: {pandas_df.shape}")

    # --- 2. Data Cleaning ---
    print("Starting data cleaning...")
    pandas_df = _clean(pandas_df)
    print(f"Data cleaning complete. New shape: {pandas_df.shape}")

    # --- 3. Outlier Handling (Capping) on 'RESALE_PRICE' ---
    print("Handling outliers on 'RESALE_PRICE' by capping at the 0.5 and 99.5 percentiles...")
    pandas_df = _cap_resale_price(pandas_df)
    print("Outlier handling complete. Prices are now capped.")

    # --- 4. Feature Engineering ---
    print("Performing feature engineering...")
    pandas_df = _engineer_features(pandas_df)

    # --- 5. One-Hot Encoding for Categorical Variables ---
    # NOTE: one-hot encoding is currently disabled; HDB_GOLD keeps TOWN,
    # FLAT_TYPE, AGE_GROUP, PRICE_TIER and SEASON as plain categorical columns.

    # --- 6. Save the Transformed DataFrame to a New Snowflake Table ---
    # The session's current schema is used to save the table to ensure you have permissions.
    target_table_name = "HDB_GOLD"
    print(f"Saving transformed data to '{target_table_name}' using session.write_pandas()...")

    # Reset index to fix the pandas index warning
    df_to_save = pandas_df.reset_index(drop=True)

    try:
        session.write_pandas(
            df_to_save,
            target_table_name,
            auto_create_table=True,
            overwrite=True
        )
    except Exception as e:
        print(f"❌ Failed to save table {target_table_name}: {e}")
        # NOTE(review): building a Snowpark DataFrame from an empty pandas frame
        # may itself raise - confirm; re-raising could be a clearer contract.
        return session.create_dataframe(pd.DataFrame())  # Return an empty DataFrame on failure
    else:
        print(f"✅ Saved final dataset to Snowflake table: {target_table_name}")

    # --- 7. Final check and return DataFrame ---
    final_df = session.table(target_table_name)
    print("\nFinal transformed table schema and a sample of data:")
    final_df.show(3)

    # Return the final DataFrame for the Snowflake Python worksheet handler
    return final_df


In [None]:
**OUTPUT**

Loading data from 'HDB_SILVER' table...
Data loaded successfully. Shape: (300402, 9)
Starting data cleaning...
Removed 5302 rows with FLAT_TYPE in ['MULTI GENERATION', '1 ROOM', '2 ROOM'].
Dropped 'FLAT_MODEL' column.
Data cleaning complete. New shape: {pandas_df.shape}
Handling outliers on 'RESALE_PRICE' by capping at the 0.5 and 99.5 percentiles...
Outlier handling complete. Prices are now capped.
Performing feature engineering...
'SEASON' feature created successfully from 'MONTH_NUM'.
Applying one-hot encoding on: ['TOWN', 'FLAT_TYPE', 'AGE_GROUP', 'PRICE_TIER', 'SEASON']
One-hot encoding complete.
Saving transformed data to 'HDB_GOLD' using session.write_pandas()...
✅ Saved final dataset to Snowflake table: HDB_GOLD
Final transformed table schema and a sample of data:
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"STOREY_RANGE"  |"FLOOR_AREA_SQM"  |"RESALE_PRICE"  |"AGE"  |"YEAR"  |"MONTH_NUM"  |"is_outlier"  |"STOREY_NUMERIC"  |"PRICE_PER_SQM"    |"TOWN_ANG MO KIO"  |"TOWN_BEDOK"  |"TOWN_BISHAN"  |"TOWN_BUKIT BATOK"  |"TOWN_BUKIT MERAH"  |"TOWN_BUKIT PANJANG"  |"TOWN_BUKIT TIMAH"  |"TOWN_CENTRAL AREA"  |"TOWN_CHOA CHU KANG"  |"TOWN_CLEMENTI"  |"TOWN_GEYLANG"  |"TOWN_HOUGANG"  |"TOWN_JURONG EAST"  |"TOWN_JURONG WEST"  |"TOWN_KALLANG/WHAMPOA"  |"TOWN_MARINE PARADE"  |"TOWN_PASIR RIS"  |"TOWN_PUNGGOL"  |"TOWN_QUEENSTOWN"  |"TOWN_SEMBAWANG"  |"TOWN_SENGKANG"  |"TOWN_SERANGOON"  |"TOWN_TAMPINES"  |"TOWN_TOA PAYOH"  |"TOWN_WOODLANDS"  |"TOWN_YISHUN"  |"FLAT_TYPE_3 ROOM"  |"FLAT_TYPE_4 ROOM"  |"FLAT_TYPE_5 ROOM"  |"FLAT_TYPE_EXECUTIVE"  |"FLAT_TYPE_MULTI-GENERATION"  |"AGE_GROUP_New"  |"AGE_GROUP_Moderate"  |"AGE_GROUP_Old"  |"AGE_GROUP_Very Old"  |"PRICE_TIER_Budget"  |"PRICE_TIER_Mid-range"  |"PRICE_TIER_Premium"  |"PRICE_TIER_Luxury"  |"SEASON_Q1"  |"SEASON_Q2"  |"SEASON_Q3"  |"SEASON_Q4"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10 TO 12        |67                |388000          |44     |2012    |9            |0             |11.0              |5791.044776119403  |0                  |0             |0              |0                   |1                   |0                     |0                   |0                    |0                     |0                |0               |0               |0                   |0                   |0                       |0                     |0                 |0               |0                  |0                 |0                |0                 |0                |0                 |0                 |0              |1                   |0                   |0                   |0                      |0                             |0                |0                     |0                |1                     |0                    |1                       |0                     |0                    |0            |0            |1            |0            |
|07 TO 09        |70                |395000          |49     |2012    |9            |0             |8.0               |5642.857142857143  |0                  |0             |0              |0                   |1                   |0                     |0                   |0                    |0                     |0                |0               |0               |0                   |0                   |0                       |0                     |0                 |0               |0                  |0                 |0                |0                 |0                |0                 |0                 |0              |1                   |0                   |0                   |0                      |0                             |0                |0                     |0                |1                     |0                    |1                       |0                     |0                    |0            |0            |1            |0            |
|04 TO 06        |59                |396000          |54     |2012    |9            |0             |5.0               |6711.864406779661  |0                  |0             |0              |0                   |1                   |0                     |0                   |0                    |0                     |0                |0               |0               |0                   |0                   |0                       |0                     |0                 |0               |0                  |0                 |0                |0                 |0                |0                 |0                 |0              |1                   |0                   |0                   |0                      |0                             |0                |0                     |0                |1                     |0                    |1                       |0                     |0                    |0            |0            |1            |0            |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Closing a session in a stored procedure is a no-op.