In [14]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, f_oneway
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [15]:
def correlation_ratio(categories, values):
    """Compute correlation ratio for categorical-continuous association"""
    categories = np.array(categories)
    values = np.array(values)
    f_cat, _ = pd.factorize(categories)
    cat_num = np.max(f_cat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    
    for i in range(cat_num):
        cat_measures = values[np.argwhere(f_cat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.mean(cat_measures)
    
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(values, y_total_avg), 2))
    
    if denominator == 0:
        return 0.0
    return np.sqrt(numerator / denominator)

In [16]:
def correlation_ratio(categories, values):
    """Compute correlation ratio for categorical-continuous association"""
    categories = np.array(categories)
    values = np.array(values)
    f_cat, _ = pd.factorize(categories)
    cat_num = np.max(f_cat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    
    for i in range(cat_num):
        cat_measures = values[np.argwhere(f_cat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.mean(cat_measures)
    
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(values, y_total_avg), 2))
    
    if denominator == 0:
        return 0.0
    return np.sqrt(numerator / denominator)

In [17]:
def multi_correlation_fs_regression(
    X,  # DataFrame of features (mixed types)
    y,  # Target Series
    metrics=['pearson', 'mi'],  # Metrics to use
    redundancy_threshold=0.8,   # Max allowed correlation between features
    top_k=None,                 # Max features to select
    random_state=None,          # Seed for reproducibility
    categorical_features=None   # List of categorical feature names
):
    """
    Feature selection for regression handling both numerical and categorical features
    Returns selected feature names and their scores
    """
    # Identify categorical features if not provided
    if categorical_features is None:
        categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = [col for col in X.columns if col not in categorical_features]
    
    # Preprocessing pipeline for numerical features
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Preprocessing pipeline for categorical features
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    # Full preprocessing
    preprocessor = ColumnTransformer([
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features)
    ])
    
    X_processed = preprocessor.fit_transform(X)
    X_processed = pd.DataFrame(X_processed, 
                               columns=(numerical_features + 
                                        list(preprocessor.named_transformers_['cat']
                                             .named_steps['encoder']
                                             .get_feature_names_out(categorical_features))))
    
    # Initialize scores DataFrame
    scores = pd.DataFrame(index=X.columns)
    abs_scores = pd.DataFrame(index=X.columns)
    
    # Compute relevance metrics for each feature
    for col in X.columns:
        if col in numerical_features:
            # Numerical feature metrics
            if 'pearson' in metrics:
                r, _ = pearsonr(X[col].fillna(X[col].median()), y)
                scores.loc[col, 'pearson'] = r
                abs_scores.loc[col, 'pearson'] = abs(r)
                
            if 'spearman' in metrics:
                r, _ = spearmanr(X[col].fillna(X[col].median()), y)
                scores.loc[col, 'spearman'] = r
                abs_scores.loc[col, 'spearman'] = abs(r)
                
        else:  # Categorical feature
            # Correlation ratio (eta)
            if 'pearson' in metrics:
                eta = correlation_ratio(X[col], y)
                scores.loc[col, 'pearson'] = eta
                abs_scores.loc[col, 'pearson'] = eta
                
            # ANOVA F-value (converted to correlation-like measure)
            if 'spearman' in metrics:
                groups = [y[X[col] == cat] for cat in X[col].unique()]
                f, _ = f_oneway(*groups)
                scores.loc[col, 'spearman'] = f
                abs_scores.loc[col, 'spearman'] = f
    
    # Mutual Information (handles both types after encoding)
    if 'mi' in metrics:
        # Mark all one-hot encoded columns as discrete
        discrete_features = [
            i for i, col in enumerate(X_processed.columns)
            if any([col.startswith(cat + '_') or col == cat for cat in categorical_features])
        ]
        mi_scores = mutual_info_regression(
            X_processed, y, 
            random_state=random_state,
            discrete_features=discrete_features
        )
        mi_df = pd.Series(mi_scores, index=X_processed.columns)
        
        # Aggregate MI for original categorical features (sum of one-hot components)
        for col in X.columns:
            if col in categorical_features:
                # Sum MI for all one-hot components of this categorical feature
                components = [c for c in mi_df.index if c.startswith(col + '_') or c == col]
                scores.loc[col, 'mi'] = mi_df[components].sum() if components else 0
            else:
                scores.loc[col, 'mi'] = mi_df[col]
                
        abs_scores['mi'] = scores['mi']
    
    # Normalize scores (0 to 1) and combine
    normalized = abs_scores.apply(lambda x: (x - x.min()) / (x.max() - x.min() + 1e-10))
    normalized['combined'] = normalized.mean(axis=1)
    
    # Sort features by combined score
    sorted_features = normalized['combined'].sort_values(ascending=False).index
    
    # Create encoded version for redundancy check
    le = LabelEncoder()
    X_encoded = X.copy()
    for col in categorical_features:
        X_encoded[col] = le.fit_transform(X[col].astype(str))
    
    # Select features with redundancy control
    selected = []
    for feature in sorted_features:
        if top_k is not None and len(selected) >= top_k:
            break
            
        # Calculate maximum correlation with already selected features
        max_corr = 0
        if selected:
            for sel in selected:
                # Handle different feature type combinations
                if feature in numerical_features and sel in numerical_features:
                    corr = abs(pearsonr(X_encoded[feature], X_encoded[sel])[0])
                elif feature in categorical_features and sel in categorical_features:
                    corr = abs(pearsonr(X_encoded[feature], X_encoded[sel])[0])
                else:  # Mixed types - use different metric
                    if feature in numerical_features:
                        corr = correlation_ratio(X_encoded[sel], X_encoded[feature])
                    else:
                        corr = correlation_ratio(X_encoded[feature], X_encoded[sel])
                max_corr = max(max_corr, corr)
        
        # Add feature if below redundancy threshold
        if max_corr < redundancy_threshold:
            selected.append(feature)
    
    return selected, normalized

In [18]:
df = pd.read_csv('../../../data/processed/land_dataset_final_v2.csv')

In [25]:
df


Unnamed: 0,address_subdivision,address_locality,address_line_2,h_id,price_per_m2,land_area,price,longitude,latitude,near_Koh_Pich_in_km,...,f_tertiary,f_track,f_trunk,f_trunk_link,f_unclassified,f_unused,h_id_price_mean,h_id_price_median,h_id_price_max,h_id_price_min
0,Phnom Penh,Mean Chey,Stueng Mean Chey,8865846a91fffff,3068.33,52,52,104.883100,11.552932,6,...,0,0,0,0,0,0,3011.727778,3047.460,3240.50,2622.25
1,Phnom Penh,Chamkar Mon,Phsar Daeum Thkov,8865846acbfffff,3632.23,178,178,104.915003,11.528833,3,...,0,0,0,0,0,0,3818.351026,3806.730,4961.91,2560.65
2,Phnom Penh,Saensokh,Phnom Penh Thmei,88658468cbfffff,3123.13,138,138,104.886163,11.586713,7,...,0,0,0,0,0,0,3702.541837,3593.130,4619.43,3059.76
3,Phnom Penh,Saensokh,Phnom Penh Thmei,8865846ab1fffff,3434.37,162,162,104.889529,11.575790,6,...,0,0,0,0,0,0,3308.530714,3242.455,4527.88,2855.43
4,Phnom Penh,Doun Penh,Chakto Mukh,8865846a39fffff,3855.90,200,200,104.958218,11.558388,1,...,0,0,0,0,0,0,3963.536364,3973.660,5619.81,1745.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,Phnom Penh,Chraoy Chongvar,Bak Kaeng,886586a699fffff,661.63,182,182,104.929226,11.701893,16,...,0,0,0,0,0,0,615.868333,636.920,699.64,496.69
9268,Phnom Penh,Chraoy Chongvar,Preaek Ta Sek,8865846995fffff,410.11,212,212,104.899855,11.667508,13,...,0,0,0,0,0,0,898.133333,895.275,1685.02,407.13
9269,Phnom Penh,Praek Pnov,Ponsang,8865846d85fffff,251.75,134,134,104.756877,11.633307,22,...,0,0,0,0,0,0,201.678889,184.580,251.75,167.49
9270,Phnom Penh,Pur SenChey,Kantaok,8865846e31fffff,1020.13,230,230,104.785133,11.523526,17,...,0,0,0,0,0,0,1007.350667,1020.120,1238.65,792.92


In [19]:
grouped = df.groupby('h_id')['price_per_m2']
df['h_id_price_mean'] = grouped.transform('mean')
df['h_id_price_median'] = grouped.transform('median')
df['h_id_price_max'] = grouped.transform('max')
df['h_id_price_min'] = grouped.transform('min')

In [20]:
# Define features and target
X = df.drop(['price_per_m2', 'longitude', 'latitude', 'address_subdivision', 
             'h_id', 'address_locality', 'price', 'geometry'], axis=1, errors='ignore')
y = df['price_per_m2']

# Identify categorical features
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Perform feature selection
selected_features, scores = multi_correlation_fs_regression(
    X, 
    y,
    metrics=['pearson', 'mi'],       # Use Pearson and MI
    redundancy_threshold=0.8,        # Moderate redundancy control
    top_k=60,                   
    random_state=42,
    categorical_features=cat_cols    # Use detected categorical columns
)

print("Selected features:", selected_features)
print("\nFeature scores:")
print(scores.sort_values('combined', ascending=False))

  r, _ = pearsonr(X[col].fillna(X[col].median()), y)


Selected features: ['address_line_2', 'nearest_cafe', 'nearest_mart', 'n_seven_eleven_in_1km', 'n_bank_in_1km', 'nearest_pre_school', 'nearest_secondary_school', 'nearest_primary_school', 'n_university_in_1km', 'nearest_seven_eleven', 'nearest_hotel', 'nearest_bank', 'Chroy_Changvar_Bridge_3_5km', 'Olympic_Stadium_1_2km', 'nearest_gas_station', 'Bassac_Lane_1_2km', 'Royal_Palace_1_2km', 'Koh_Pich_2_3km', 'Phsar_kandal_1_2km', 'Boeng_Keng_Kang_1_1_2km', 'f_footway', 'Sisowath_Riverside_Park_1_2km', 'Boeng_Keng_Kang_1_nearest', 'nearest_university', 'AEON_Mall_1_2_3km', 'Royal_Palace_nearest', 'Bassac_Lane_nearest', 'Olympic_Stadium_2_3km', 'AEON_Mall_1_1_2km', 'Phsar_Tmey_1_2km', 'Wat_Phnom_2_3km', 'Phnom_Penh_Airport_5_10km', 'Vattanac_Tower_2_3km', 'Russian_Market_2_3km', 'Phsar_Chas_2_3km', 'Phsar_Tmey_nearest', 'Vattanac_Tower_1_2km', 'f_tertiary', 'Phsar_Tmey_2_3km', 'Camko_City_3_5km', 'f_residential', 'Sisowath_Riverside_Park_2_3km', 'Wat_Phnom_3_5km', 'Phsar_Chas_nearest', 'Russ

In [27]:

# Select only the chosen features
X_selected = X[selected_features].copy()

# Identify which selected features are categorical
cat_selected = [col for col in cat_cols if col in selected_features]
num_selected = [col for col in selected_features if col not in cat_selected]

# One-hot encode categorical features if needed
if cat_selected:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_cat_encoded = ohe.fit_transform(X_selected[cat_selected])
    encoded_cat_cols = ohe.get_feature_names_out(cat_selected)
    
    # Create DataFrame for encoded features
    X_cat_encoded_df = pd.DataFrame(X_cat_encoded, 
                                   index=X_selected.index, 
                                   columns=encoded_cat_cols)
    
    # Combine with numerical features
    X_final = pd.concat([X_selected[num_selected], X_cat_encoded_df], axis=1)
else:
    X_final = X_selected.copy()

# Add target back for saving
final_df = pd.concat([X_selected, y], axis=1)

# Save to CSV
final_df.to_csv("../../../data/preprocessed/feature_selection_with_aggregates_60features_encoed.csv", index=False)
print(f"Saved selected features to CSV. Shape: {final_df.shape}")
print(f"Numerical features: {len(num_selected)}")
print(f"Categorical features: {len(cat_selected)}")
# print(f"Encoded categorical columns: {len(encoded_cat_cols) if cat_selected else 0}")
# print(f"Total columns: {len(num_selected) + (len(encoded_cat_cols) if cat_selected else 0) + 1} (including target)")

Saved selected features to CSV. Shape: (9272, 61)
Numerical features: 59
Categorical features: 1


In [23]:
final_df.head()

Unnamed: 0,address_line_2,nearest_cafe,nearest_mart,n_seven_eleven_in_1km,n_bank_in_1km,nearest_pre_school,nearest_secondary_school,nearest_primary_school,n_university_in_1km,nearest_seven_eleven,...,Wat_Phnom_1_2km,Koh_Pich_1_2km,Boeng_Keng_Kang_1_2_3km,Bassac_Lane_2_3km,Royal_Palace_2_3km,f_service,Vattanac_Tower_nearest,Wat_Phnom_nearest,n_borey_in_1km_to_2km,price_per_m2
0,Stueng Mean Chey,4,0,1,5,1,0,2,4,0,...,0,0,0,0,0,1,0,0,2,3068.33
1,Phsar Daeum Thkov,0,0,3,0,1,1,2,0,0,...,0,0,1,1,0,0,0,0,3,3632.23
2,Phnom Penh Thmei,1,1,1,0,1,0,1,2,0,...,0,0,0,0,0,1,0,0,5,3123.13
3,Phnom Penh Thmei,2,1,1,0,4,3,2,20,1,...,0,0,0,0,0,0,0,0,5,3434.37
4,Chakto Mukh,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,3855.9


In [24]:
# # Select only the chosen features
# X_selected = X[selected_features].copy()

# # Identify which selected features are categorical
# cat_selected = [col for col in cat_cols if col in selected_features]

# from sklearn.preprocessing import OneHotEncoder

# if cat_selected:
#     ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
#     ohe.fit(X_selected[cat_selected])
#     X_ohe = pd.DataFrame(
#         ohe.transform(X_selected[cat_selected]),
#         index=X_selected.index,
#         columns=ohe.get_feature_names_out(cat_selected)
#     )
#     # Drop original categorical columns and concatenate encoded columns
#     X_final = pd.concat([X_selected.drop(columns=cat_selected), X_ohe], axis=1)
# else:
#     X_final = X_selected.copy()

# # Add target back for saving
# final_df = pd.concat([X_final, y], axis=1)

# # Save to CSV
# final_df.to_csv("../../../data/preprocessed/feature_selection_by_multi_corr_final_data_60feature.csv", index=False)
# print("Saved selected and encoded features to CSV.")