In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_regression


In [2]:
df = pd.read_csv('../../data/processed/realestates_kh_v7.csv')

In [15]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,land_area,address_line_2,category_name,is_parent,type,latitude,longitude,...,Sisowath_Riverside_Park_5_10km,Phnom_Penh_Airport_nearest,Phnom_Penh_Airport_1_2km,Phnom_Penh_Airport_2_3km,Phnom_Penh_Airport_3_5km,Phnom_Penh_Airport_5_10km,mean,median,max,min
0,1100000.0,7.0,7.0,124.0,Chakto Mukh,Shophouse,False,residential,11.57561,104.92025,...,0,0,0,0,0,1,3908.371821,3829.257246,8870.967742,104.166667
1,680000.0,4.0,5.0,80.0,BKK 3,House,False,residential,11.55,104.93,...,0,0,0,0,0,0,2730.69707,2129.508909,8500.0,370.37037
2,550000.0,3.0,4.0,66.0,Chey Chumneah,Retreat,False,residential,11.57561,104.92025,...,0,0,0,0,0,1,3818.434615,2963.87345,8333.333333,1012.658228
3,750000.0,9.0,10.0,116.0,Tonle Bassac,Villa,False,residential,11.5445,104.913586,...,0,0,0,0,0,1,2945.379851,2878.787879,6465.517241,254.77707
4,420000.0,5.0,6.0,65.0,Chroy Changvar,House,False,residential,11.58,104.93,...,0,0,0,0,0,0,2047.471213,1754.120879,6461.538462,170.135882


In [11]:
df = df[df['price_per_m2'] >= 100].copy()

In [7]:
df[['bedrooms', 'bathrooms']] = df[['bedrooms', 'bathrooms']].fillna(0)

In [12]:
# Calculate mean, max, min of 'price_per_m2' grouped by 'h_id'
price_stats = df.groupby('address_line_2')['price_per_m2'].agg(['mean','median', 'max', 'min']).reset_index()

# Merge these stats back to the original dataframe on 'h_id'
df = df.merge(price_stats, on='address_line_2', how='left', suffixes=('', '_price_per_m2'))

# The columns 'mean', 'max', 'min' are now added to df

In [40]:
df.drop(columns=['price_display'], inplace=True)

In [41]:
y = df[['price']]
X = df.drop(columns=['price'])

In [59]:
X.describe()

Unnamed: 0,bedrooms,bathrooms,land_area,latitude,longitude,price_per_m2,nearest_cafe,n_cafe_in_1km,n_cafe_in_1km_to_2km,n_cafe_in_2km_to_3km,...,Phnom_Penh_Airport_nearest,Phnom_Penh_Airport_1_2km,Phnom_Penh_Airport_2_3km,Phnom_Penh_Airport_3_5km,Phnom_Penh_Airport_5_10km,population,median,mean,max,min
count,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,...,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0,3371.0
mean,3.377039,3.511124,612.484596,11.554581,104.912421,2221.092081,3.248591,6.877188,20.614951,29.242065,...,0.00178,0.008009,0.00623,0.041827,0.538416,12285.622367,1993.317532,2221.092081,5635.367056,396.594314
std,7.15085,3.222157,3356.87414,0.036927,0.030075,1327.808769,3.331162,5.977236,13.75539,17.508442,...,0.042157,0.08915,0.078693,0.200224,0.498596,8788.871003,543.76125,571.19753,1995.710478,505.522346
min,0.0,0.0,1.0,11.342391,104.64547,25.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,47.5,47.5,47.5,25.0
25%,1.0,1.0,74.4,11.534182,104.89897,1328.125,0.0,1.0,9.0,8.5,...,0.0,0.0,0.0,0.0,0.0,4621.0,1762.454212,1961.308867,4687.5,66.0
50%,3.0,3.0,117.0,11.55,104.91667,1904.761905,2.0,5.0,21.0,35.0,...,0.0,0.0,0.0,0.0,1.0,8223.0,2059.202059,2271.513714,6250.0,203.160271
75%,4.0,5.0,300.0,11.57561,104.93,2900.0,6.0,12.0,32.0,44.0,...,0.0,0.0,0.0,0.0,1.0,20064.0,2343.75,2717.918796,6461.538462,500.0
max,343.0,46.0,104635.0,11.939206,105.064019,8870.967742,14.0,26.0,59.0,67.0,...,1.0,1.0,1.0,1.0,1.0,39519.0,6435.643564,6435.643564,8870.967742,6435.643564


In [22]:
df.dropna(inplace=True)

In [60]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train.shape, X_test.shape

In [42]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns


In [43]:
cat_cols

Index(['address_line_2', 'category_name', 'type', 'h_id'], dtype='object')

In [44]:
# Temporarily encode categorical columns for feature selection
X_temp = X.copy()
if len(cat_cols) > 0:
    encoder = OrdinalEncoder()
    X_temp[cat_cols] = encoder.fit_transform(X_temp[cat_cols])


In [45]:
# Feature selection 
selector = SelectKBest(mutual_info_regression, k=30)
selector.fit(X_temp, y)
selected_features = X.columns[selector.get_support()]


  y = column_or_1d(y, warn=True)


In [46]:
# Subset original data to selected features
X_selected = X[selected_features]

In [47]:
X_selected

Unnamed: 0,bedrooms,bathrooms,land_area,address_line_2,category_name,latitude,longitude,population,h_id,n_cafe_in_1km_to_2km,...,n_primary_school_in_3km_to_5km,n_university_in_3km_to_5km,n_resturant_in_3km_to_5km,n_super_market_in_3km_to_5km,n_atm_in_2km_to_3km,n_atm_in_3km_to_5km,mean,median,max,min
0,7.0,7.0,124.0,Chakto Mukh,Shophouse,11.575610,104.920250,16252.0,8865846aadfffff,33,...,28,46,8,29,27,31,3908.371821,3829.257246,8870.967742,104.166667
1,4.0,5.0,80.0,BKK 3,House,11.550000,104.930000,7658.0,8865846ae9fffff,32,...,31,54,4,20,89,33,2730.697070,2129.508909,8500.000000,370.370370
2,3.0,4.0,66.0,Chey Chumneah,Retreat,11.575610,104.920250,16252.0,8865846aadfffff,33,...,28,46,8,29,27,31,3818.434615,2963.873450,8333.333333,1012.658228
3,9.0,10.0,116.0,Tonle Bassac,Villa,11.544500,104.913586,23239.0,8865846ac7fffff,36,...,35,62,68,35,59,83,2945.379851,2878.787879,6465.517241,254.777070
4,5.0,6.0,65.0,Chroy Changvar,House,11.580000,104.930000,5351.0,886584685bfffff,27,...,33,64,16,34,47,36,2047.471213,1754.120879,6461.538462,170.135882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,1.0,1.0,833.0,Boeung Kak 1,Condo,11.550000,104.930000,7658.0,8865846ae9fffff,32,...,31,54,4,20,89,33,2376.653525,2000.000000,6000.000000,108.031212
6680,0.0,0.0,800.0,BKK 1,project,11.545695,104.923107,13796.0,8865846ac5fffff,49,...,33,58,31,23,73,57,3009.237377,2815.533981,6428.571429,100.000000
6681,0.0,0.0,480.0,Chakto Mukh,Business,11.575837,104.920096,16252.0,8865846aadfffff,31,...,28,46,8,30,27,31,3908.371821,3829.257246,8870.967742,104.166667
6682,0.0,0.0,7960.0,Tien,Land/Development,11.439340,104.849064,751.0,88658460c5fffff,0,...,0,0,0,0,0,0,180.000000,120.000000,320.000000,100.000000


In [48]:
# One-hot encode only the selected categorical features
selected_cat_cols = [col for col in selected_features if col in cat_cols]
X_final = pd.get_dummies(X_selected, columns=selected_cat_cols, drop_first=True)
X_final = X_final.astype(int)


In [49]:
df_KBest = pd.concat([X_final, y], axis=1)


In [50]:
df_KBest.head()

Unnamed: 0,bedrooms,bathrooms,land_area,latitude,longitude,population,n_cafe_in_1km_to_2km,n_cafe_in_2km_to_3km,n_cafe_in_3km_to_5km,n_gas_station_in_2km_to_3km,...,h_id_88658478a3fffff,h_id_88658478b3fffff,h_id_88658478b7fffff,h_id_88658478bbfffff,h_id_8865847993fffff,h_id_886586a691fffff,h_id_886586a693fffff,h_id_886586a699fffff,h_id_886586a69bfffff,price
0,7,7,124,11,104,16252,33,44,76,26,...,0,0,0,0,0,0,0,0,0,1100000.0
1,4,5,80,11,104,7658,32,49,62,53,...,0,0,0,0,0,0,0,0,0,680000.0
2,3,4,66,11,104,16252,33,44,76,26,...,0,0,0,0,0,0,0,0,0,550000.0
3,9,10,116,11,104,23239,36,37,85,35,...,0,0,0,0,0,0,0,0,0,750000.0
4,5,6,65,11,104,5351,27,37,87,26,...,0,0,0,0,0,0,0,0,0,420000.0


In [51]:
df_KBest.to_csv('../../data/preprocessed/realestates_kh_SelectKBest_v3.csv', index=False)