# **Feature Engineering and Training Process Base Price**

We will try to take some feature engineering process and then train data to make base price model. Before create a model, we need to explore the dataset to ensure that there is no outliers and missing values in dataset.

In [191]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler

We will load the dataset using pandas library.

In [192]:
df = pd.read_csv('unitavgbaseprice.csv')

we will see information of the data.

In [193]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 869 entries, 0 to 868
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   property_name             869 non-null    object 
 1   property_type             869 non-null    float64
 2   property_design           869 non-null    float64
 3   distance_to_coastline     869 non-null    float64
 4   area_squad                869 non-null    float64
 5   area_distance_to_airport  869 non-null    float64
 6   airport_pickup_price_idr  869 non-null    float64
 7   area_airport_name         869 non-null    object 
 8   region_id                 869 non-null    float64
 9   unit_id                   869 non-null    object 
 10  average_baseline_price    869 non-null    float64
 11  number_of_bookings        869 non-null    int64  
 12  bedroom                   869 non-null    float64
 13  bathroom                  869 non-null    float64
 14  beds      

Drop the outliers from data using z-score.

In [195]:
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
z_scores = stats.zscore(df[numerical_cols])

threshold = 5

outlier_mask = (abs(z_scores) > threshold).any(axis=1)

df_clean = df[~outlier_mask]

Next drop duplicates from data based on unit_id.

In [196]:
data = df_clean.drop_duplicates(subset=['unit_id'])

In [197]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 716 entries, 0 to 868
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   property_name             716 non-null    object 
 1   property_type             716 non-null    float64
 2   property_design           716 non-null    float64
 3   distance_to_coastline     716 non-null    float64
 4   area_squad                716 non-null    float64
 5   area_distance_to_airport  716 non-null    float64
 6   airport_pickup_price_idr  716 non-null    float64
 7   area_airport_name         716 non-null    object 
 8   region_id                 716 non-null    float64
 9   unit_id                   716 non-null    object 
 10  average_baseline_price    716 non-null    float64
 11  number_of_bookings        716 non-null    int64  
 12  bedroom                   716 non-null    float64
 13  bathroom                  716 non-null    float64
 14  beds           

Next drop unit_id column from data.

In [198]:
data = data.drop(columns=['unit_id'])

In [199]:
data.describe()

Unnamed: 0,property_type,property_design,distance_to_coastline,area_squad,area_distance_to_airport,airport_pickup_price_idr,region_id,average_baseline_price,number_of_bookings,bedroom,...,lock,luggage_drop_off,parking,pool,private_entrance,shampoo,tv,washer,wifi,workspace
count,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,...,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0,716.0
mean,3.789106,0.925978,853.312303,2.134078,52.541899,324552.146648,1.455307,827887.5,57.804469,1.613128,...,0.837989,0.72067,1.0,0.875698,0.817039,0.759777,0.734637,0.0,1.0,0.927374
std,0.910817,1.431453,604.256062,1.06379,10.419437,28027.992492,0.78764,602206.4,68.537373,1.030878,...,0.368719,0.448983,0.0,0.330156,0.386905,0.427518,0.441835,0.0,0.0,0.259703
min,1.0,0.0,26.7774,0.0,30.0,195000.0,1.0,160050.0,5.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,4.0,0.0,171.19228,1.0,45.0,325000.0,1.0,393063.8,13.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
50%,4.0,0.0,1093.22,2.0,60.0,325000.0,1.0,611560.1,30.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
75%,4.0,1.0,1173.9868,3.0,60.0,325000.0,2.0,1057721.0,75.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,5.0,5.0,2755.41,4.0,60.0,362089.0,3.0,4134564.0,456.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0


create new feature which is amenities or total facility.

In [201]:
data['total_fas'] = data['ac'] + data['balcony'] + data['beachfront'] + data['breakfast'] + data['building_staff'] + data['cable_tv'] + data['essentials'] + data['garden'] + data['gym'] + data['hair_dryer'] + data['hanger'] + data['heating'] + data['hot_water'] + data['kitchen'] + data['linens'] + data['lock'] + data['luggage_drop_off'] + data['parking'] + data['pool'] + data['private_entrance'] + data['shampoo'] + data['tv'] + data['washer'] + data['wifi'] + data['workspace']

Create new features with feature extraction.

In [202]:
data['bathroom_to_bedroom_ratio'] = data['bathroom'] / data['bedroom']

In [203]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 716 entries, 0 to 868
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   property_name              716 non-null    object 
 1   property_type              716 non-null    float64
 2   property_design            716 non-null    float64
 3   distance_to_coastline      716 non-null    float64
 4   area_squad                 716 non-null    float64
 5   area_distance_to_airport   716 non-null    float64
 6   airport_pickup_price_idr   716 non-null    float64
 7   area_airport_name          716 non-null    object 
 8   region_id                  716 non-null    float64
 9   average_baseline_price     716 non-null    float64
 10  number_of_bookings         716 non-null    int64  
 11  bedroom                    716 non-null    float64
 12  bathroom                   716 non-null    float64
 13  beds                       716 non-null    float64
 14 

Using One Hot Encoding and StandardScaler.

In [205]:
average_baseline_price = data['average_baseline_price']
data = data.drop(columns=['average_baseline_price'])

categorical_cols = ['property_name', 'area_airport_name', 'area_name']
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns

onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
categorical_encoded = onehot_encoder.fit_transform(data[categorical_cols])

categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=onehot_encoder.get_feature_names_out(categorical_cols))

scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(data[numerical_cols])

numerical_scaled_df = pd.DataFrame(numerical_scaled, columns=numerical_cols)

df_final = pd.concat([numerical_scaled_df, categorical_encoded_df, average_baseline_price.reset_index(drop=True)], axis=1)

df_final.head()


Unnamed: 0,property_type,property_design,distance_to_coastline,area_squad,area_distance_to_airport,airport_pickup_price_idr,region_id,number_of_bookings,bedroom,bathroom,...,area_name_Maguwoharjo,area_name_Nusa Penida,area_name_Padang Padang,area_name_Seminyak,area_name_Sinduharjo,area_name_Tamantirto,area_name_Uluwatu,area_name_Ungasan,area_name_Yogyakarta,average_baseline_price
0,0.231705,-0.647332,0.531064,-0.126126,0.716288,0.01599,-0.578469,0.586886,-0.595179,-0.478547,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,388196.713861
1,0.231705,-0.647332,0.531064,-0.126126,0.716288,0.01599,-0.578469,-0.362165,-0.595179,-0.478547,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,432090.909091
2,0.231705,0.750827,0.531064,-0.126126,0.716288,0.01599,-0.578469,0.937305,-0.595179,-0.478547,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,432767.309114
3,0.231705,0.750827,0.531064,-0.126126,0.716288,0.01599,-0.578469,0.251068,-0.595179,-1.368519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,543905.904444
4,0.231705,0.750827,0.531064,-0.126126,0.716288,0.01599,-0.578469,-0.624979,-0.595179,-1.368519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,280109.393333


Separate features and target.

In [207]:
X = df_final.drop(columns=['average_baseline_price'])
y = df_final['average_baseline_price']

Divide data for training and testing.

In [208]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Define some models that we used.

In [209]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Machine": SVR(),
    "XGB Regressor":XGBRegressor()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    results[name] = mae
    print(f"{name} MAE: {mae:.2f}")


Linear Regression MAE: 188686446560268.31
Decision Tree MAE: 237393.67
Random Forest MAE: 215800.45
Gradient Boosting MAE: 269120.54
Support Vector Machine MAE: 441290.99
XGB Regressor MAE: 222264.97


In [210]:
# Cari model dengan MAE terendah
best_model_name = min(results, key=results.get)
best_mae = results[best_model_name]
print(f"Best model: {best_model_name} with MAE: {best_mae:.2f}")


Best model: Random Forest with MAE: 215800.45
