In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_csv('../../../data/processed/land_dataset_final_v2.csv')

In [3]:
df.head()

Unnamed: 0,address_subdivision,address_locality,address_line_2,h_id,price_per_m2,land_area,price,longitude,latitude,near_Koh_Pich_in_km,...,f_road,f_secondary,f_service,f_steps,f_tertiary,f_track,f_trunk,f_trunk_link,f_unclassified,f_unused
0,Phnom Penh,Mean Chey,Stueng Mean Chey,8865846a91fffff,3068.33,52,52,104.8831,11.552932,6,...,0,1,1,0,0,0,0,0,0,0
1,Phnom Penh,Chamkar Mon,Phsar Daeum Thkov,8865846acbfffff,3632.23,178,178,104.915003,11.528833,3,...,0,0,0,0,0,0,0,0,0,0
2,Phnom Penh,Saensokh,Phnom Penh Thmei,88658468cbfffff,3123.13,138,138,104.886163,11.586713,7,...,0,0,1,0,0,0,0,0,0,0
3,Phnom Penh,Saensokh,Phnom Penh Thmei,8865846ab1fffff,3434.37,162,162,104.889529,11.57579,6,...,0,0,0,0,0,0,0,0,0,0
4,Phnom Penh,Doun Penh,Chakto Mukh,8865846a39fffff,3855.9,200,200,104.958218,11.558388,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.shape

(9272, 234)

In [5]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [6]:
df.shape

(9272, 234)

In [7]:
grouped = df.groupby('h_id')['price_per_m2']
df['h_id_price_mean'] = grouped.mean()
df['h_id_price_max'] = grouped.max()
df['h_id_price_median'] = grouped.median()
df['h_id_price_min'] = grouped.min()

In [8]:
X = df.drop([
    'price_per_m2', 'longitude', 'latitude', 'address_subdivision',
    'h_id', 'address_locality', 'price', 'geometry'
], axis=1, errors='ignore')
y = df['price_per_m2']

In [9]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [10]:
encoder = OrdinalEncoder()
X_encoded = X.copy()
X_encoded[cat_cols] = encoder.fit_transform(X[cat_cols])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

In [12]:
# Feature selection with RFE
model = DecisionTreeRegressor(random_state=42)
selector = RFE(model, n_features_to_select=30, step=1)
selector.fit(X_train, y_train)

In [13]:
# Get and display selected features
selected_features = X.columns[selector.support_]
print("Selected features:", selected_features.tolist())

Selected features: ['address_line_2', 'land_area', 'near_AEON_Mall_1_in_km', 'near_Koh_Norea_in_km', 'near_Camko_City_in_km', 'near_Boeng_Keng_Kang_1_in_km', 'near_Wat_Phnom_in_km', 'near_Vattanac_Tower_in_km', 'near_Royal_Palace_in_km', 'near_Phnom_Penh_Airport_in_km', 'n_gas_station_5km', 'n_pre_school_in_1km', 'n_pre_school_in_1km_to_2km', 'n_pre_school_in_2km_to_3km', 'n_primary_school_5km', 'n_primary_school_in_2km_to_3km', 'n_primary_school_in_3km_to_5km', 'n_university_5km', 'n_university_in_1km_to_2km', 'n_seven_eleven_5km', 'n_seven_eleven_in_1km', 'nearest_resturant', 'n_resturant_in_2km_to_3km', 'n_super_market_in_2km_to_3km', 'n_super_market_in_3km_to_5km', 'n_borey_5km', 'n_bank_5km', 'n_atm_5km', 'n_atm_in_1km_to_2km', 'n_atm_in_3km_to_5km']


In [14]:
from sklearn.preprocessing import OneHotEncoder

# Select only the categorical columns that are in the selected features
cat_selected = [col for col in cat_cols if col in selected_features]

# One-hot encode categorical features in X[selected_features]
if cat_selected:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ohe.fit(X[cat_selected])

    # Transform and create DataFrame for encoded features
    X_ohe = pd.DataFrame(
        ohe.transform(X[cat_selected]),
        index=X.index,
        columns=ohe.get_feature_names_out(cat_selected)
    )

    # Drop original categorical columns and concatenate encoded columns
    X_sel_enc = pd.concat([X[selected_features].drop(columns=cat_selected), X_ohe], axis=1)
else:
    X_sel_enc = X[selected_features].copy()

In [15]:
full_selected = pd.concat([X_sel_enc, y], axis=1)
full_selected.to_csv(
    "../../../data/preprocessed/feature_selection_by_model_final_data_30feature.csv",
    index=False
)