In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_regression


In [2]:
df = pd.read_csv('../../data/processed/realestates_kh_v7.csv')

In [3]:
df.head()

Unnamed: 0,id,price,bedrooms,bathrooms,land_area,address_subdivision,address_locality,address_line_2,category_name,is_parent,...,Sisowath_Riverside_Park_nearest,Sisowath_Riverside_Park_1_2km,Sisowath_Riverside_Park_2_3km,Sisowath_Riverside_Park_3_5km,Sisowath_Riverside_Park_5_10km,Phnom_Penh_Airport_nearest,Phnom_Penh_Airport_1_2km,Phnom_Penh_Airport_2_3km,Phnom_Penh_Airport_3_5km,Phnom_Penh_Airport_5_10km
0,185714.0,1100000.0,7.0,7.0,124.0,Phnom Penh,Daun Penh,Chakto Mukh,Shophouse,False,...,0,1,0,0,0,0,0,0,0,1
1,185539.0,680000.0,4.0,5.0,80.0,Phnom Penh,Chamkarmon,BKK 3,House,False,...,0,0,1,0,0,0,0,0,0,0
2,217752.0,550000.0,3.0,4.0,66.0,Phnom Penh,Daun Penh,Chey Chumneah,Retreat,False,...,0,1,0,0,0,0,0,0,0,1
3,228897.0,750000.0,9.0,10.0,116.0,Phnom Penh,Chamkarmon,Tonle Bassac,Villa,False,...,0,0,0,1,0,0,0,0,0,1
4,190024.0,420000.0,5.0,6.0,65.0,Phnom Penh,Chroy Changvar,Chroy Changvar,House,False,...,0,1,0,0,0,0,0,0,0,0


In [4]:
df = df[df['price_per_m2'] >= 100].copy()

In [5]:
df[['bedrooms', 'bathrooms']] = df[['bedrooms', 'bathrooms']].fillna(0)

In [6]:
# Calculate mean, max, min of 'price_per_m2' grouped by 'h_id'
price_stats = df.groupby('address_line_2')['price_per_m2'].agg(['mean','median', 'max', 'min']).reset_index()

# Merge these stats back to the original dataframe on 'h_id'
df = df.merge(price_stats, on='address_line_2', how='left', suffixes=('', '_price_per_m2'))

# The columns 'mean', 'max', 'min' are now added to df

In [7]:
df

Unnamed: 0,id,price,bedrooms,bathrooms,land_area,address_subdivision,address_locality,address_line_2,category_name,is_parent,...,Sisowath_Riverside_Park_5_10km,Phnom_Penh_Airport_nearest,Phnom_Penh_Airport_1_2km,Phnom_Penh_Airport_2_3km,Phnom_Penh_Airport_3_5km,Phnom_Penh_Airport_5_10km,mean,median,max,min
0,185714.0,1100000.0,7.0,7.0,124.0,Phnom Penh,Daun Penh,Chakto Mukh,Shophouse,False,...,0,0,0,0,0,1,3908.371821,3829.257246,8870.967742,104.166667
1,185539.0,680000.0,4.0,5.0,80.0,Phnom Penh,Chamkarmon,BKK 3,House,False,...,0,0,0,0,0,0,2730.697070,2129.508909,8500.000000,370.370370
2,217752.0,550000.0,3.0,4.0,66.0,Phnom Penh,Daun Penh,Chey Chumneah,Retreat,False,...,0,0,0,0,0,1,3818.434615,2963.873450,8333.333333,1012.658228
3,228897.0,750000.0,9.0,10.0,116.0,Phnom Penh,Chamkarmon,Tonle Bassac,Villa,False,...,0,0,0,0,0,1,2945.379851,2878.787879,6465.517241,254.777070
4,190024.0,420000.0,5.0,6.0,65.0,Phnom Penh,Chroy Changvar,Chroy Changvar,House,False,...,0,0,0,0,0,0,2047.471213,1754.120879,6461.538462,170.135882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,160087.0,89990.0,1.0,1.0,833.0,Phnom Penh,Toul Kork,Boeung Kak 1,Condo,False,...,0,0,0,0,0,0,2376.653525,2000.000000,6000.000000,108.031212
6680,141851.0,85000.0,0.0,0.0,800.0,Phnom Penh,Chamkarmon,BKK 1,project,True,...,0,0,0,0,0,1,3009.237377,2815.533981,6428.571429,100.000000
6681,247950.0,50000.0,0.0,0.0,480.0,Phnom Penh,Daun Penh,Chakto Mukh,Business,False,...,0,0,0,0,0,1,3908.371821,3829.257246,8870.967742,104.166667
6682,243954.0,796000.0,0.0,0.0,7960.0,Phnom Penh,Dangkao,Tien,Land/Development,False,...,0,0,0,0,0,0,180.000000,120.000000,320.000000,100.000000


In [8]:
df.drop(columns=['price','id', 'address_subdivision', 'land_area', 'address_locality', 'information', 'geometry', 'index_right', 'price_display', 'h_id'], inplace=True)

In [9]:
df.dropna(inplace=True)

In [10]:
y = df[['price_per_m2']]
X = df.drop(columns=['price_per_m2'])

In [11]:
X.describe()

Unnamed: 0,bedrooms,bathrooms,latitude,longitude,population,nearest_cafe,n_cafe_in_1km,n_cafe_in_1km_to_2km,n_cafe_in_2km_to_3km,n_cafe_in_3km_to_5km,...,Sisowath_Riverside_Park_5_10km,Phnom_Penh_Airport_nearest,Phnom_Penh_Airport_1_2km,Phnom_Penh_Airport_2_3km,Phnom_Penh_Airport_3_5km,Phnom_Penh_Airport_5_10km,mean,median,max,min
count,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,...,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0
mean,3.398797,3.53203,11.554426,104.912497,12356.590376,3.25594,6.890827,20.675188,29.349173,66.389774,...,0.280602,0.001805,0.00782,0.006316,0.042105,0.541353,2240.76927,2022.064255,5527.894712,411.885884
std,7.191245,3.225698,0.0355,0.029206,8769.382051,3.3282,5.97003,13.706912,17.435713,30.010089,...,0.449327,0.042444,0.088088,0.079227,0.200844,0.498324,645.244735,663.921244,1419.275798,342.650671
min,0.0,0.0,11.430122,104.748841,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,164.932003,120.0,179.864005,100.0
25%,1.0,1.0,11.534079,104.89897,4621.0,0.0,1.0,9.0,9.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1858.919985,1661.111111,4999.984375,195.396707
50%,3.0,3.0,11.55,104.916255,8396.0,2.0,5.0,21.0,35.0,71.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2145.079724,1888.372093,6222.222222,278.448116
75%,4.0,5.0,11.57561,104.93,20064.0,6.0,12.0,32.0,44.0,85.0,...,1.0,0.0,0.0,0.0,0.0,1.0,2730.69707,2470.396904,6428.571429,500.0
max,343.0,46.0,11.720574,104.996857,39519.0,14.0,26.0,59.0,67.0,125.0,...,1.0,1.0,1.0,1.0,1.0,1.0,3949.618811,3925.511588,8870.967742,2666.666667


In [12]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train.shape, X_test.shape

In [13]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns


In [14]:
cat_cols

Index(['address_line_2', 'category_name', 'type'], dtype='object')

In [15]:
# Temporarily encode categorical columns for feature selection
X_temp = X.copy()
if len(cat_cols) > 0:
    encoder = OrdinalEncoder()
    X_temp[cat_cols] = encoder.fit_transform(X_temp[cat_cols])


In [16]:
# Feature selection 
selector = SelectKBest(mutual_info_regression, k=30)
selector.fit(X_temp, y)
selected_features = X.columns[selector.get_support()]


  y = column_or_1d(y, warn=True)


In [17]:
# Subset original data to selected features
X_selected = X[selected_features]

In [18]:
X_selected

Unnamed: 0,address_line_2,category_name,latitude,longitude,population,n_cafe_in_1km_to_2km,n_cafe_in_2km_to_3km,n_cafe_in_3km_to_5km,n_gas_station_in_1km_to_2km,n_gas_station_in_2km_to_3km,...,n_university_in_3km_to_5km,n_resturant_in_3km_to_5km,n_super_market_in_2km_to_3km,n_super_market_in_3km_to_5km,n_atm_in_2km_to_3km,n_atm_in_3km_to_5km,mean,median,max,min
0,Chakto Mukh,Shophouse,11.575610,104.920250,16252.0,33,44,76,20,26,...,46,8,25,29,27,31,3908.371821,3829.257246,8870.967742,104.166667
1,BKK 3,House,11.550000,104.930000,7658.0,32,49,62,9,53,...,54,4,30,20,89,33,2730.697070,2129.508909,8500.000000,370.370370
2,Chey Chumneah,Retreat,11.575610,104.920250,16252.0,33,44,76,20,26,...,46,8,25,29,27,31,3818.434615,2963.873450,8333.333333,1012.658228
3,Tonle Bassac,Villa,11.544500,104.913586,23239.0,36,37,85,20,35,...,62,68,29,35,59,83,2945.379851,2878.787879,6465.517241,254.777070
4,Chroy Changvar,House,11.580000,104.930000,5351.0,27,37,87,17,26,...,64,16,26,34,47,36,2047.471213,1754.120879,6461.538462,170.135882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,Boeung Kak 1,Condo,11.550000,104.930000,7658.0,32,49,62,9,53,...,54,4,30,20,89,33,2376.653525,2000.000000,6000.000000,108.031212
6680,BKK 1,project,11.545695,104.923107,13796.0,49,27,81,27,29,...,58,31,26,23,73,57,3009.237377,2815.533981,6428.571429,100.000000
6681,Chakto Mukh,Business,11.575837,104.920096,16252.0,31,45,77,21,26,...,46,8,26,30,27,31,3908.371821,3829.257246,8870.967742,104.166667
6682,Tien,Land/Development,11.439340,104.849064,751.0,0,0,1,0,0,...,0,0,0,0,0,0,180.000000,120.000000,320.000000,100.000000


In [19]:
# One-hot encode only the selected categorical features
selected_cat_cols = [col for col in selected_features if col in cat_cols]
X_final = pd.get_dummies(X_selected, columns=selected_cat_cols, drop_first=True)
X_final = X_final.astype(int)


In [20]:
df_KBest = pd.concat([X_final, y], axis=1)


In [21]:
df_KBest.head()

Unnamed: 0,latitude,longitude,population,n_cafe_in_1km_to_2km,n_cafe_in_2km_to_3km,n_cafe_in_3km_to_5km,n_gas_station_in_1km_to_2km,n_gas_station_in_2km_to_3km,n_gas_station_in_3km_to_5km,n_hospital_in_1km_to_2km,...,category_name_Terrace,category_name_Twin Villa,category_name_Unit,category_name_Villa,category_name_Warehouse,category_name_condo,category_name_other,category_name_project,category_name_residential,price_per_m2
0,11,104,16252,33,44,76,20,26,82,48,...,0,0,0,0,0,0,0,0,0,8870.967742
1,11,104,7658,32,49,62,9,53,57,29,...,0,0,0,0,0,0,0,0,0,8500.0
2,11,104,16252,33,44,76,20,26,82,48,...,0,0,0,0,0,0,0,0,0,8333.333333
3,11,104,23239,36,37,85,20,35,76,36,...,0,0,0,1,0,0,0,0,0,6465.517241
4,11,104,5351,27,37,87,17,26,63,20,...,0,0,0,0,0,0,0,0,0,6461.538462


In [22]:
df_KBest.to_csv('../../data/preprocessed/realestates_kh_SelectKBest_v6.csv', index=False)