# Feature Engineering
## Goals
1. Create derived features (text length, temporal, ratios, etc.)
2. Encode categorical variables appropriately
3. Scale/normalize numerical features
4. Handle feature interactions and domain-specific features

In [672]:
# Imports
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
    OrdinalEncoder,
    StandardScaler,
    TargetEncoder,
)

In [673]:
# Constants
PROCESSED_DATA_DIR = Path("../../data/cleaned/classification/")
FIGURES_DIR = Path("../../figures/classification/")

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [674]:
# Load metadata
with open(PROCESSED_DATA_DIR / "metadata.json", "r") as f:
    metadata = json.load(f)

X_train = pl.read_parquet(PROCESSED_DATA_DIR / "X_train.parquet")
y_train = pl.read_parquet(PROCESSED_DATA_DIR / "y_train.parquet")

X_val = pl.read_parquet(PROCESSED_DATA_DIR / "X_val.parquet")
y_val = pl.read_parquet(PROCESSED_DATA_DIR / "y_val.parquet")

X_test = pl.read_parquet(PROCESSED_DATA_DIR / "X_test.parquet")
y_test = pl.read_parquet(PROCESSED_DATA_DIR / "y_test.parquet")

print(f"\033[1mDatasets loaded:\033[0m")
print(f"  Train: X={X_train.shape}, y={y_train.shape}")
print(f"  Val:   X={X_val.shape}, y={y_val.shape}")
print(f"  Test:  X={X_test.shape}, y={y_test.shape}")

print(f"\n\033[1mMetadata:\033[0m")
print(f"  Total features: {metadata['total_features']}")
print(f"  Categorical features: {len(metadata['categorical_features'])}")
print(f"  Numeric features: {len(metadata['numeric_features'])}")

[1mDatasets loaded:[0m
  Train: X=(9440, 70), y=(9440, 1)
  Val:   X=(1977, 70), y=(1977, 1)
  Test:  X=(2025, 70), y=(2025, 1)

[1mMetadata:[0m
  Total features: 70
  Categorical features: 18
  Numeric features: 52


In [675]:
# Feature Groups
numeric_features = metadata["numeric_features"]
categorical_features = metadata["categorical_features"]

print(f"\033[1mNumerical Features ({len(numeric_features)}):\033[0m")
for feat in numeric_features:
    print(f"  • {feat}")

print(f"\n\033[1mCategorical Features ({len(categorical_features)}):\033[0m")
for feat in categorical_features:
    print(f"  • {feat}")

[1mNumerical Features (52):[0m
  • host_response_rate
  • host_acceptance_rate
  • host_total_listings_count
  • host_has_profile_pic
  • host_identity_verified
  • latitude
  • longitude
  • accommodates
  • bathrooms
  • bedrooms
  • beds
  • price
  • minimum_nights
  • maximum_nights
  • minimum_minimum_nights
  • maximum_minimum_nights
  • minimum_maximum_nights
  • maximum_maximum_nights
  • minimum_nights_avg_ntm
  • maximum_nights_avg_ntm
  • has_availability
  • availability_30
  • availability_60
  • availability_90
  • availability_365
  • number_of_reviews
  • number_of_reviews_ltm
  • number_of_reviews_l30d
  • review_scores_rating
  • review_scores_accuracy
  • review_scores_cleanliness
  • review_scores_checkin
  • review_scores_communication
  • review_scores_location
  • review_scores_value
  • instant_bookable
  • calculated_host_listings_count_entire_homes
  • calculated_host_listings_count_private_rooms
  • calculated_host_listings_count_shared_rooms
  • reviews_p

## Creating Derived Features
### Text Features

In [676]:
# Work with train set for fitting transformers
X_train_eng = X_train.clone()

text_features = ["amenities", "description", "name", "host_about"]


def derive_text_features(df: pl.DataFrame) -> pl.DataFrame:
    for feat in text_features:
        # Character Count
        df = df.with_columns(pl.col(feat).str.len_chars().alias(f"{feat}_length"))

        # Word count approximation (split by spaces)
        df = df.with_columns(
            pl.col(feat).str.split(" ").list.len().alias(f"{feat}_word_count")
        )

        # Drop Feature
        df.drop_in_place(feat)

        return df


X_train_eng = derive_text_features(X_train_eng)
X_val = derive_text_features(X_val)
X_test = derive_text_features(X_test)

### Temporal Features

In [677]:
# Check all the columns
print(f"{'\033[1mColumn\033[0m':68} \033[1mDtype\033[0m")
for col, dtype in zip(X_train.columns, X_train.dtypes):
    print(f"{col:60} {dtype}")

[1mColumn[0m                                                       [1mDtype[0m
last_scraped                                                 Date
name                                                         String
description                                                  String
host_since                                                   Date
host_location                                                String
host_about                                                   String
host_response_time                                           String
host_response_rate                                           Float64
host_acceptance_rate                                         Float64
host_neighbourhood                                           String
host_total_listings_count                                    Float64
host_verifications                                           String
host_has_profile_pic                                         Boolean
host_identity_verified           

In [678]:
def derive_temporal_features(df: pl.DataFrame) -> pl.DataFrame:
    # Host Tenure Days
    df = df.with_columns(
        (pl.col("last_scraped").cast(pl.Date) - pl.col("host_since").cast(pl.Date))
        .dt.total_days()
        .alias("host_tenure_days")
    )

    # Listing age (days since first_review or since created)
    df = df.with_columns(
        [
            (pl.col("last_scraped") - pl.col("first_review"))
            .dt.total_days()
            .fill_null(0)  # Never reviewed
            .alias("listing_age_days")
        ]
    )

    # Review recency (days since last_review)
    df = df.with_columns(
        [
            (pl.col("last_scraped") - pl.col("last_review"))
            .dt.total_days()
            .fill_null(0)  # Never reviewed
            .alias("days_since_last_review")
        ]
    )

    return df


X_train_eng = derive_temporal_features(X_train_eng)
X_val = derive_temporal_features(X_val)
X_test = derive_temporal_features(X_test)

### Interaction Features

In [679]:
def derive_interaction_features(df: pl.DataFrame) -> pl.DataFrame:
    df = df.with_columns(
        (pl.col("bathrooms") / (pl.col("bedrooms") + 1)).alias("bathrooms_per_bedroom"),
        (pl.col("beds") / (pl.col("accommodates") + 1)).alias("beds_per_person"),
        (pl.col("price") / (pl.col("bedrooms") + 1)).alias("price_per_bedroom"),
    )

    return df


X_train_eng = derive_interaction_features(X_train_eng)
X_val = derive_interaction_features(X_val)
X_test = derive_interaction_features(X_test)

### Aggregation Features

In [680]:
def aggregate_features(df: pl.DataFrame) -> pl.DataFrame:
    # Review score composite (mean of all review scores)
    review_score_cols = [col for col in df.columns if col.startswith("review_scores_")]

    df = df.with_columns(
        pl.concat_list(review_score_cols).list.mean().alias("review_score_mean")
    )
    return df


X_train_eng = aggregate_features(X_train_eng)
X_val = aggregate_features(X_val)
X_test = aggregate_features(X_test)

### Summary of Features Added

In [681]:
print(f"  \033[1mOriginal feature count:\033[0m {X_train.shape[1]}")
print(f"  \033[1mNew feature count:\033[0m {X_train_eng.shape[1]}")
print(f"  \033[1mFeatures added:\033[0m {X_train_eng.shape[1] - X_train.shape[1]}")

new_features = [col for col in X_train_eng.columns if col not in X_train.columns]
print(f"\n  \033[1mNew features created:\033[0m")
for feat in new_features:
    print(f"    • {feat}")

  [1mOriginal feature count:[0m 70
  [1mNew feature count:[0m 78
  [1mFeatures added:[0m 8

  [1mNew features created:[0m
    • amenities_length
    • amenities_word_count
    • host_tenure_days
    • listing_age_days
    • days_since_last_review
    • bathrooms_per_bedroom
    • beds_per_person
    • price_per_bedroom
    • review_score_mean


## Categorical Encoding

In [682]:
# Identify Features

# Numerical Columns do not need to be encoded
categorical_columns = [
    col
    for col, dtype in zip(X_train_eng.columns, X_train_eng.dtypes)
    if not dtype in [pl.Date, pl.Datetime, pl.Int64, pl.UInt32, pl.Float64, pl.Boolean]
]

for col, dtype in zip(X_train_eng.columns, X_train_eng.dtypes):
    if col in categorical_columns:
        print(f"{col:60}: {dtype}")

# Binary features (cardinality = 2)
binary_features = [
    col for col in categorical_columns if X_train_eng[col].n_unique() <= 2
]

# Low cardinality (3-10 unique values)
low_card_features = [
    col for col in categorical_columns if 3 <= X_train_eng[col].n_unique() <= 10
]

# Medium cardinality (11-50 unique values)
medium_card_features = [
    col for col in categorical_columns if 11 <= X_train_eng[col].n_unique() <= 50
]

# High cardinality (>50 unique values)
high_card_features = [
    col for col in categorical_columns if X_train_eng[col].n_unique() > 50
]

name                                                        : String
description                                                 : String
host_location                                               : String
host_about                                                  : String
host_response_time                                          : String
host_neighbourhood                                          : String
host_verifications                                          : String
neighbourhood                                               : String
neighbourhood_cleansed                                      : String
neighbourhood_group_cleansed                                : String
property_type                                               : String
room_type                                                   : String
bathrooms_text                                              : String


In [683]:
# Encode Binary Features (Label Encoding)
# All Binary Features are already in Boolean types (True/False)

In [684]:
# Encode Ordinal Features (Ordinal Encoding)
print(low_card_features)
low_card_features.remove("host_verifications")
ordinal_mappings = {
    "last_scraped": X_train_eng["last_scraped"].unique().sort().to_list(),
    "host_response_time": [
        "N/A",
        "a few days or more",
        " within a day",
        "within a few hours",
        "within an hour",
    ],
    "room_type": ["Shared room", "Private room", "Entire home/apt", "Hotel room"],
}
ordinal_encoders = {}

['host_response_time', 'host_verifications', 'neighbourhood_group_cleansed', 'room_type']


In [685]:
for col in low_card_features:
    if col in ordinal_mappings and ordinal_mappings[col] is not None:
        mapping = {val: idx for idx, val in enumerate(ordinal_mappings[col])}
    else:
        # Frequency-based ordering (more common → higher number)
        freq = X_train_eng[col].value_counts().sort("count", descending=False)
        mapping = {val[0]: idx for idx, val in enumerate(freq.rows())}

    X_train_eng = X_train_eng.with_columns(
        pl.col(col)
        .map_elements(
            lambda x: mapping.get(x, -1),
            return_dtype=pl.Int64,
        )
        .alias(col)
    )

    X_val = X_val.with_columns(
        pl.col(col)
        .map_elements(
            lambda x: mapping.get(x, -1),
            return_dtype=pl.Int64,
        )
        .alias(col)
    )

    X_test = X_test.with_columns(
        pl.col(col)
        .map_elements(
            lambda x: mapping.get(x, -1),
            return_dtype=pl.Int64,
        )
        .alias(col)
    )

    ordinal_encoders[col] = mapping

In [686]:
# host_verifications is a multi-select option
all_verifications = set()
for verif_list in X_train_eng["host_verifications"].to_list():
    verifs = verif_list.strip("[]").replace("'", "").replace('"', "").split(",")
    for v in verifs:
        all_verifications.add(v.strip())

all_verifications = sorted(list(all_verifications))
all_verifications.remove("")
all_verifications

['email', 'phone', 'work_email']

In [687]:
# Create binary columns for each verification type
def encode_verification(df: pl.DataFrame) -> pl.DataFrame:
    for verif_type in all_verifications:
        col_name = f"verified_{verif_type.lower().replace(' ', '_')}"
        df = df.with_columns(
            pl.col("host_verifications")
            .str.contains(verif_type, literal=False)
            .cast(pl.Int8)
            .alias(col_name)
        )

    # Create count feature (how many verifications)
    df = df.with_columns(
        pl.col("host_verifications")
        .str.split(",")
        .list.len()
        .alias("verification_count")
    )

    # Drop original column
    return df.drop("host_verifications")


X_train_eng = encode_verification(X_train_eng)
X_val = encode_verification(X_val)
X_test = encode_verification(X_test)

In [688]:
# One-Hot Encode Medium Cardinality Features
onehot_features = []

for col in medium_card_features:
    # Get top 10 categories by frequency
    top_cats = [
        row[0]
        for row in X_train_eng[col]
        .value_counts()
        .sort("count", descending=True)
        .head(10)
        .rows()
    ]

    for cat in top_cats:
        new_col_name = f"{col}_{str(cat).lower().replace(' ', '_')}"
        X_train_eng = X_train_eng.with_columns(
            (pl.col(col) == cat).cast(pl.Int8).alias(new_col_name)
        )
        X_val = X_val.with_columns(
            (pl.col(col) == cat).cast(pl.Int8).alias(new_col_name)
        )
        X_test = X_test.with_columns(
            (pl.col(col) == cat).cast(pl.Int8).alias(new_col_name)
        )
        onehot_features.append(new_col_name)

    # Other Categories
    X_train_eng = X_train_eng.with_columns(
        pl.when(
            pl.all_horizontal(
                [
                    pl.col(f"{col}_{str(cat).lower().replace(' ', '_')}") == 0
                    for cat in top_cats
                ]
            )
        )
        .then(1)
        .otherwise(0)
        .cast(pl.Int8)
        .alias(f"{col}_other")
    )
    X_val = X_val.with_columns(
        pl.when(
            pl.all_horizontal(
                [
                    pl.col(f"{col}_{str(cat).lower().replace(' ', '_')}") == 0
                    for cat in top_cats
                ]
            )
        )
        .then(1)
        .otherwise(0)
        .cast(pl.Int8)
        .alias(f"{col}_other")
    )
    X_test = X_test.with_columns(
        pl.when(
            pl.all_horizontal(
                [
                    pl.col(f"{col}_{str(cat).lower().replace(' ', '_')}") == 0
                    for cat in top_cats
                ]
            )
        )
        .then(1)
        .otherwise(0)
        .cast(pl.Int8)
        .alias(f"{col}_other")
    )
    onehot_features.append(new_col_name)

    # Drop original column after encoding
    X_train_eng = X_train_eng.drop(col)
    X_val = X_val.drop(col)
    X_test = X_test.drop(col)
    print(f"  {col}: Created {len(top_cats) + 1} binary features (dropped original)")

print(f"\nCreated {len(onehot_features)} one-hot features")

  host_location: Created 11 binary features (dropped original)
  neighbourhood: Created 11 binary features (dropped original)
  neighbourhood_cleansed: Created 11 binary features (dropped original)
  property_type: Created 11 binary features (dropped original)
  bathrooms_text: Created 11 binary features (dropped original)

Created 55 one-hot features


In [689]:
X_train_eng[high_card_features]

name,description,host_about,host_neighbourhood
str,str,str,str
"""Triple,shared bathroom,2 mins …","""ST Signature Jalan Besar is am…","""""","""Rochor"""
"""Lovely Room with luxury View a…","""Studio Apartment on High Floor…","""""","""Queenstown"""
"""2-bedroom luxury penthouse + B…","""This penthouse is located on t…","""Mijn naam is Kim Li Ti Oeij. I…","""Pathum Wan"""
"""Condo Common Rm opposite of La…","""The epitome of luxurious livin…","""I am living in Australia / Sin…","""Jurong West"""
"""Lavender Bugis Long term Stay …","""+ 3m+ long term stay welcomed …","""Managing 88 rooms of various b…","""Kallang"""
…,…,…,…
"""HVIr3- Holland Village Condo, …","""- Single occupancy room <br />…","""I am a true-blue Singaporean i…","""Bukit Timah"""
"""Executive Studio Apartment in …","""ST Residences Balestier is str…","""""","""Novena"""
"""Two-Bedroom Executive Pan Paci…","""Enjoy a stylish experience at …","""I have been into Hospitality s…","""Queenstown"""
"""Exquisite & cosy room for rent…","""big beautiful balinese room fo…","""""","""Ang Mo Kio"""


In [690]:
# Feature encode each high-cardinality categorical feature
feature_encoders = dict()

for col in high_card_features:
    freq_encoding = X_train_eng[col].value_counts(normalize=True).to_dict()
    feature_encoders.update({col: freq_encoding})

    X_train_eng = X_train_eng.with_columns(
        pl.col(col)
        .map_elements(lambda x: freq_encoding.get(x, 0.0), return_dtype=pl.Float64)
        .alias(f"{col}_freq")
    ).drop(col)

In [691]:
for col in feature_encoders.keys():
    freq_encoding = feature_encoders[col]

    X_val = X_val.with_columns(
        pl.col(col)
        .map_elements(lambda x: freq_encoding.get(x, 0.0), return_dtype=pl.Float64)
        .alias(f"{col}_freq")
    ).drop(col)

    X_test = X_test.with_columns(
        pl.col(col)
        .map_elements(lambda x: freq_encoding.get(x, 0.0), return_dtype=pl.Float64)
        .alias(f"{col}_freq")
    ).drop(col)

In [692]:
for col, dtype in zip(X_train_eng.columns, X_train_eng.dtypes):
    print(f"{col:60}: {dtype}")

last_scraped                                                : Date
host_since                                                  : Date
host_response_time                                          : Int64
host_response_rate                                          : Float64
host_acceptance_rate                                        : Float64
host_total_listings_count                                   : Float64
host_has_profile_pic                                        : Boolean
host_identity_verified                                      : Boolean
neighbourhood_group_cleansed                                : Int64
latitude                                                    : Float64
longitude                                                   : Float64
room_type                                                   : Int64
accommodates                                                : Float64
bathrooms                                                   : Float64
bedrooms                        

In [693]:
# Let's drop the Date and DateTime columns since we already derived interaction features from them
X_train_eng = X_train_eng.drop(
    ["last_scraped", "host_since", "first_review", "last_review"]
)
X_test = X_test.drop(["last_scraped", "host_since", "first_review", "last_review"])
X_val = X_val.drop(["last_scraped", "host_since", "first_review", "last_review"])

In [694]:
import pickle

encoders = {
    "ordinal": ordinal_encoders,
    "high_card": target_encoders,
    "onehot_features": onehot_features,
}

with open(PROCESSED_DATA_DIR / "encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

In [695]:
# Identify numerical features (exclude boolean/int8 one-hot encoded features)
numerical_features = [
    col
    for col, dtype in zip(X_train_eng.columns, X_train_eng.dtypes)
    if dtype in [pl.Float64, pl.Int64, pl.UInt32]
]

print(f"\033[1mNumerical features to scale ({len(numerical_features)}):\033[0m")
for feat in numerical_features:
    print(f"  • {feat}")

# Initialize scaler
scaler = StandardScaler()

# Fit on training data ONLY
X_train_array = X_train_eng[numerical_features].to_numpy()
scaler.fit(X_train_array)

# Transform all three sets
X_train_scaled = scaler.transform(X_train_array)
X_val_scaled = scaler.transform(X_val[numerical_features].to_numpy())
X_test_scaled = scaler.transform(X_test[numerical_features].to_numpy())

# Convert back to Polars DataFrames
X_train_eng_scaled = pl.DataFrame(
    X_train_scaled, schema=numerical_features
).with_columns(
    X_train_eng.select(
        [col for col in X_train_eng.columns if col not in numerical_features]
    )
)

X_val_scaled = pl.DataFrame(X_val_scaled, schema=numerical_features).with_columns(
    X_val.select([col for col in X_val.columns if col not in numerical_features])
)

X_test_scaled = pl.DataFrame(X_test_scaled, schema=numerical_features).with_columns(
    X_test.select([col for col in X_test.columns if col not in numerical_features])
)

print(
    f"\n\033[1mScaling complete:\033[0m\n"
    f"  Train shape: {X_train_eng_scaled.shape}\n"
    f"  Val shape: {X_val_scaled.shape}\n"
    f"  Test shape: {X_test_scaled.shape}"
)

[1mNumerical features to scale (53):[0m
  • host_response_time
  • host_response_rate
  • host_acceptance_rate
  • host_total_listings_count
  • neighbourhood_group_cleansed
  • latitude
  • longitude
  • room_type
  • accommodates
  • bathrooms
  • bedrooms
  • beds
  • price
  • minimum_nights
  • maximum_nights
  • minimum_minimum_nights
  • maximum_minimum_nights
  • minimum_maximum_nights
  • maximum_maximum_nights
  • minimum_nights_avg_ntm
  • maximum_nights_avg_ntm
  • availability_30
  • availability_60
  • availability_90
  • availability_365
  • number_of_reviews
  • number_of_reviews_ltm
  • number_of_reviews_l30d
  • review_scores_rating
  • review_scores_accuracy
  • review_scores_cleanliness
  • review_scores_checkin
  • review_scores_communication
  • review_scores_location
  • review_scores_value
  • calculated_host_listings_count_entire_homes
  • calculated_host_listings_count_private_rooms
  • calculated_host_listings_count_shared_rooms
  • reviews_per_month
  • am

In [696]:
# Save datasets
X_train_eng_scaled.write_parquet(PROCESSED_DATA_DIR / "X_train_scaled.parquet")
X_val_scaled.write_parquet(PROCESSED_DATA_DIR / "X_val_scaled.parquet")
X_test_scaled.write_parquet(PROCESSED_DATA_DIR / "X_test_scaled.parquet")

y_train.write_parquet(PROCESSED_DATA_DIR / "y_train.parquet")
y_val.write_parquet(PROCESSED_DATA_DIR / "y_val.parquet")
y_test.write_parquet(PROCESSED_DATA_DIR / "y_test.parquet")

# Save scaler
with open(PROCESSED_DATA_DIR / "scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save feature metadata
feature_metadata = {
    "numerical_features": numerical_features,
    "total_features": X_train_eng_scaled.shape[1],
    "train_shape": X_train_eng_scaled.shape,
    "val_shape": X_val_scaled.shape,
    "test_shape": X_test_scaled.shape,
}

with open(PROCESSED_DATA_DIR / "feature_metadata.pkl", "wb") as f:
    pickle.dump(feature_metadata, f)

print("All files saved.")

All files saved.


In [697]:
# Check scaled distributions
print("\033[1mScaled feature statistics:\033[0m")
for col in numerical_features[:5]:  # Show first 5
    mean = X_train_eng_scaled[col].mean()
    std = X_train_eng_scaled[col].std()
    print(f"{col}: mean={mean:.2f}, std={std:.2f}")

[1mScaled feature statistics:[0m
host_response_time: mean=-0.00, std=1.00
host_response_rate: mean=-0.00, std=1.00
host_acceptance_rate: mean=-0.00, std=1.00
host_total_listings_count: mean=-0.00, std=1.00
neighbourhood_group_cleansed: mean=0.00, std=1.00


## Summary
We created 127 features from the original 70. Here's why we kept all of them:

1. **One-hot encoded features** (55 new): Each captures a distinct category that may have different predictive power
2. **Interaction features** (3 new): Capture non-linear relationships (e.g., bathrooms per bedroom might matter more than bathrooms alone)
3. **Temporal features** (3 new): Capture host experience and listing maturity
4. **Target-encoded features** (4 new): Compress high-cardinality categoricals while preserving predictive info

We'll let the models perform feature selection (e.g., tree models ignore irrelevant features, regularized models shrink coefficients to zero).