In [None]:
import joblib
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import (RFECV, SelectKBest, mutual_info_regression, 
                                      VarianceThreshold, SelectFromModel)
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [None]:
# Load the processed land dataset, then discard every row that has at least
# one missing value. The drop happens in place on the freshly loaded frame.
dataset_path = '../../../data/processed/land_dataset_final_v2.csv'
df = pd.read_csv(dataset_path)
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Attach per-`h_id` summary statistics of `price_per_m2` to every row as
# `h_id_price_mean`, `h_id_price_max`, `h_id_price_median`, `h_id_price_min`.
#
# NOTE(review): these columns are derived from `price_per_m2`, which is also the
# prediction target, and they are computed on the full frame before the
# train/test split done later in the notebook. Each row's statistic includes its
# own target value and test rows contribute to the statistics seen in training,
# so downstream scores are likely optimistic. Consider computing them from the
# training rows only and mapping them onto the test rows.
price_by_h_id = df.groupby('h_id')['price_per_m2']
df = df.assign(**{
    f'h_id_price_{agg_name}': price_by_h_id.transform(agg_name)
    for agg_name in ('mean', 'max', 'median', 'min')
})

In [None]:
# Columns excluded from the feature matrix: the target itself plus the listed
# location, identifier, price and geometry columns. Labels that are absent from
# `df` are skipped silently (errors='ignore'); every other column stays in X.
non_feature_columns = [
    'price_per_m2', 'longitude', 'latitude', 'address_subdivision',
    'h_id', 'address_locality', 'price', 'geometry',
]
X = df.drop(columns=non_feature_columns, errors='ignore')

# Regression target.
y = df['price_per_m2']

In [None]:
# Split the feature names by dtype: object/category columns are treated as
# categorical, numeric dtypes as numerical. Columns of any other dtype fall
# into neither list.
categorical_view = X.select_dtypes(include=['object', 'category'])
numeric_view = X.select_dtypes(include=np.number)
cat_cols = list(categorical_view.columns)
num_cols = list(numeric_view.columns)

In [None]:
# Integer-code the categorical columns so the selectors and tree models below
# can consume them. Categories unseen during fitting would be encoded as -1.
# The encoder is fitted on all rows of X (before the train/test split).
encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
encoder.fit(X[cat_cols])

# Work on a copy so the original, un-encoded X stays available for the final
# one-hot encoding step.
X_encoded = X.copy()
X_encoded[cat_cols] = encoder.transform(X[cat_cols])

In [None]:
# Hold out 20% of the rows for the final validation; the seed keeps the
# partition stable across runs.
split_parts = train_test_split(X_encoded, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# 2. Initial filtering: remove low-variance features

# Fit on the training rows only; columns whose training variance does not
# exceed 0.01 are dropped.
variance_selector = VarianceThreshold(threshold=0.01)
variance_selector.fit(X_train)
X_train_filtered = variance_selector.transform(X_train)

# Track the surviving column names alongside the transformed matrix.
selected_mask = variance_selector.get_support()
remaining_features = [
    name for name, keep in zip(X_train.columns, selected_mask) if keep
]
print(f"After variance threshold: {len(remaining_features)} features")

In [None]:
# 3. Univariate feature selection (Filter method)

def seeded_mutual_info(features, target):
    """Score each feature by mutual information with the target, reproducibly.

    `mutual_info_regression` is randomized, so without a fixed `random_state`
    the selected feature set can change between runs. The seed matches the one
    used by every other step in this notebook.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
    target : array-like of shape (n_samples,)

    Returns
    -------
    ndarray of shape (n_features,)
        Estimated mutual information per feature.
    """
    return mutual_info_regression(features, target, random_state=42)


# Names of the columns in X_train_filtered, rebuilt from the fitted variance
# selector rather than from `remaining_features`, so that re-running this cell
# does not filter an already-filtered list.
variance_kept = X_train.columns[variance_selector.get_support()]

univariate_selector = SelectKBest(
    score_func=seeded_mutual_info,
    # Never ask for more features than survived the variance filter.
    k=min(25, X_train_filtered.shape[1]),
)
X_train_univariate = univariate_selector.fit_transform(X_train_filtered, y_train)
univariate_mask = univariate_selector.get_support()
remaining_features = variance_kept[univariate_mask].tolist()
print(f"After univariate selection: {len(remaining_features)} features")

In [None]:
# 4. Recursive Feature Elimination with Cross-Validation (Wrapper method)

# Base estimator whose feature importances drive the elimination.
model = GradientBoostingRegressor(random_state=42)

# Drop one feature per step, score each subset with 5-fold R², never go below
# 15 features, and use all available cores.
rfecv_options = dict(
    step=1,
    cv=5,
    scoring='r2',
    min_features_to_select=15,
    n_jobs=-1,
)
rfe_selector = RFECV(estimator=model, **rfecv_options)
rfe_selector.fit(X_train_univariate, y_train)
X_train_rfe = rfe_selector.transform(X_train_univariate)

# Keep the feature-name list in step with the columns of X_train_rfe.
rfe_mask = rfe_selector.get_support()
kept_after_rfe = []
for name, keep in zip(remaining_features, rfe_mask):
    if keep:
        kept_after_rfe.append(name)
remaining_features = kept_after_rfe
print(f"After RFECV: {len(remaining_features)} features")

In [None]:
# 5. Embedded method: feature importance from a fitted boosted model

# Hyperparameters of the model whose importances are used by the embedded
# selection and SHAP steps that follow.
gbr_params = {
    'n_estimators': 500,
    'learning_rate': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'random_state': 42,
}
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train_rfe, y_train)


In [None]:
# Keep the features whose importance in the already-fitted `model` reaches the
# median importance. `prefit=True` means the selector reuses that fitted model
# instead of training its own.
sfm_selector = SelectFromModel(estimator=model, prefit=True, threshold="median")
embedded_mask = sfm_selector.get_support()

# From here on `remaining_features` names only the columns selected by
# `embedded_mask`, i.e. a subset of the columns of X_train_rfe.
remaining_features = [
    pair[0] for pair in zip(remaining_features, embedded_mask) if pair[1]
]
print(f"After embedded selection: {len(remaining_features)} features")


In [None]:
# 6. SHAP-based feature validation (Model interpretation)

# `model` was fitted on every column of X_train_rfe, so SHAP values must be
# computed on that same matrix: one column of SHAP values per RFE feature.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train_rfe)

# Mean absolute SHAP value per RFE feature.
mean_abs_shap = np.abs(shap_values).mean(axis=0)

# `remaining_features` has already been reduced by `embedded_mask`, so the
# importance vector has to be reduced with the same mask before the two are
# paired; otherwise the DataFrame constructor fails on mismatched lengths.
assert len(remaining_features) == int(np.sum(embedded_mask)), (
    "remaining_features is out of sync with embedded_mask; "
    "re-run the selection cells from the top"
)

# Get SHAP importance scores, most important first. The row index keeps each
# feature's position among the embedded-selected columns.
shap_importance = pd.DataFrame({
    'feature': remaining_features,
    'shap_importance': mean_abs_shap[embedded_mask],
}).sort_values('shap_importance', ascending=False)


In [None]:
# Select top features by SHAP importance: keep those strictly above the median.
shap_mask = shap_importance['shap_importance'] > shap_importance['shap_importance'].quantile(0.5)
final_features = shap_importance[shap_mask]['feature'].tolist()
print(f"Final selected features: {len(final_features)}")

# `shap_importance` is sorted by importance, so `shap_mask` is in ranking order,
# not in column order. Using it directly on a NumPy matrix would pick the first
# k columns instead of the top-k features. Restoring index order yields a mask
# aligned with the columns that survive `embedded_mask`.
shap_column_mask = shap_mask.sort_index().to_numpy()
assert shap_column_mask.shape[0] == int(np.sum(embedded_mask)), (
    "shap_importance does not cover exactly the embedded-selected features"
)

# 7. Validate selection quality
X_train_final = X_train_rfe[:, embedded_mask][:, shap_column_mask]

# Push the held-out rows through the same fitted selectors, in the same order.
X_test_processed = variance_selector.transform(X_test)
X_test_processed = univariate_selector.transform(X_test_processed)
X_test_processed = rfe_selector.transform(X_test_processed)
X_test_final = sfm_selector.transform(X_test_processed)[:, shap_column_mask]

# Refit a default boosted model on the final columns and report held-out R².
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train_final, y_train)
score = model.score(X_test_final, y_test)
print(f"Validation R² with selected features: {score:.4f}")

In [None]:
# Partition the selected features into categorical and numerical names,
# preserving their order in `final_features`.
cat_selected, num_selected = [], []
for name in final_features:
    (cat_selected if name in cat_cols else num_selected).append(name)

# Start from the ORIGINAL (un-encoded) data so categorical columns still hold
# their raw labels.
X_selected = X[final_features].copy()

if not cat_selected:
    # Nothing to encode: the selected numerical columns are the final matrix.
    X_final = X_selected.copy()
else:
    # One-hot encode the selected categorical columns, dropping the first level
    # of each to avoid a redundant indicator.
    ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    X_ohe = ohe.fit_transform(X_selected[cat_selected])

    # Wrap the dense indicator matrix in a frame that shares X_selected's index
    # so the column-wise concat below aligns row by row.
    ohe_columns = ohe.get_feature_names_out(cat_selected)
    X_ohe_df = pd.DataFrame(X_ohe, columns=ohe_columns, index=X_selected.index)

    # Numerical columns first, then the indicator columns.
    X_final = pd.concat([X_selected[num_selected], X_ohe_df], axis=1)

In [None]:
# Add target and save. X_final and y share the row index of `df`, so the
# column-wise concat pairs each feature row with its own target value.
# NOTE(review): the input was read from data/processed/ but this writes to
# data/preprocessed/ — confirm the directory name is intentional and exists.
final_dataset = pd.concat([X_final, y], axis=1)
final_dataset.to_csv(
    "../../../data/preprocessed/selected_features_dataset.csv",
    index=False
)

# Save feature selection report
shap_importance.to_csv("feature_selection_report.csv", index=False)

# Save the fitted encoders for inference. `joblib` must be imported in the
# notebook's import cell; it was previously used here without an import.
# The one-hot encoder only exists when a categorical feature was selected.
encoder_artifacts = [(encoder, 'ordinal_encoder.pkl')]
if cat_selected:
    encoder_artifacts.append((ohe, 'onehot_encoder.pkl'))
for fitted_encoder, artifact_path in encoder_artifacts:
    joblib.dump(fitted_encoder, artifact_path)

print("Final dataset prepared with one-hot encoded categorical features")
print(f"Final feature count: {X_final.shape[1]}")