In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
def GetModelMAE(trained_model, X_test, y_test):
  mae = mean_absolute_error(y_test, trained_model.predict(X_test))
  return mae

In [3]:
def drop_columns(df, columns_to_drop):
    """
    Drop specified columns from a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame from which columns should be dropped.
    columns_to_drop (list): A list of column names to be dropped.

    Returns:
    pd.DataFrame: The DataFrame with specified columns dropped.
    """
    # Drop the specified columns
    df_dropped = df.drop(columns=columns_to_drop)
    return df_dropped

## Grab Dataset

In [54]:
data = pd.read_csv('/content/FE_RoomPrice.csv')

## Drop useless columns

In [55]:
columns_to_drop = ['Unnamed: 0.1','Unnamed: 0','room_id',	'unit_id',	'booking_id','booking_check_in', 'created_at','booking_check_out', 'earnings_in_idr', 'total_earnings']
data = drop_columns(data, columns_to_drop)

Convert boolean to int

In [56]:
data['contain_national_holiday'] = data['contain_national_holiday'].astype(float)

## Obtain Target Label

In [57]:
earnings_per_day = data['earnings_per_day']

In [58]:
data = data.drop(columns=['earnings_per_day'])

## Preprocessing : One hot encode and standard scaling

In [59]:
categorical_cols = ['month']
onehot_encoder = OneHotEncoder(sparse_output=False)
categorical_encoded = onehot_encoder.fit_transform(data[categorical_cols])

categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=onehot_encoder.get_feature_names_out(categorical_cols))

In [60]:
numerical_cols = [ 'rating', 'review_sentiment_score', 'communication', 'cleanliness', 'accuracy', 'stay_duration', 'booking_day_of_week', 'booking_lead_time', 'price_fluctuation', 'total_review_score']
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(data[numerical_cols])

numerical_scaled_df = pd.DataFrame(numerical_scaled, columns=numerical_cols)

In [69]:
df_final = pd.concat([numerical_scaled_df, categorical_encoded_df, data['contain_national_holiday'],data['average_baseline_price'], earnings_per_day.reset_index(drop=True)], axis=1)

In [70]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29766 entries, 0 to 29765
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   rating                    29766 non-null  float64
 1   review_sentiment_score    29766 non-null  float64
 2   communication             29766 non-null  float64
 3   cleanliness               29766 non-null  float64
 4   accuracy                  29766 non-null  float64
 5   stay_duration             29766 non-null  float64
 6   booking_day_of_week       29766 non-null  float64
 7   booking_lead_time         29766 non-null  float64
 8   price_fluctuation         29766 non-null  float64
 9   total_review_score        29766 non-null  float64
 10  month_april               29766 non-null  float64
 11  month_august              29766 non-null  float64
 12  month_december            29766 non-null  float64
 13  month_february            29766 non-null  float64
 14  month_

## Split Dataset

In [71]:
X = df_final.drop(columns=['earnings_per_day'])
y = df_final['earnings_per_day']

## Training

### Parameters

In [72]:
TEST_RATIO = 0.1
RANDOM_STATE = 1234
params = {
    "n_estimators": 1000,
    "max_depth": 64,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "absolute_error",
}

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_RATIO, random_state=42)

### Training script

In [75]:
LR = LinearRegression()
DTR = DecisionTreeRegressor(random_state=RANDOM_STATE)
RFR = RandomForestRegressor(n_estimators=params['n_estimators'],max_depth=params["max_depth"],random_state=RANDOM_STATE)
XGB = XGBRegressor(n_estimators=params['n_estimators'], max_depth=params["max_depth"], learning_rate=params['learning_rate'])
ModelsList = [
              XGB,
              RFR,
              DTR,
              LR,]
modelMAEs = []
for model in ModelsList:
  model.fit(X_train,y_train)
  modelMAEs.append((model.__class__.__name__, GetModelMAE(model, X_test, y_test)))
print(modelMAEs)

[('XGBRegressor', 10236.977416606775), ('RandomForestRegressor', 4499.649541772381), ('DecisionTreeRegressor', 7758.641172283309), ('LinearRegression', 1.2704052420055852e-06)]


In [76]:
model = RandomForestRegressor(n_estimators=1,max_depth=params["max_depth"],random_state=RANDOM_STATE)
mae_threshold = 5000
max_estimators = params['n_estimators']
# Custom training loop with early stopping
for i in range(1, params['n_estimators'] + 1):
    model.n_estimators = i
    model.fit(X_train, y_train)

    # Predict on validation set
    y_pred = model.predict(X_test)

    # Calculate validation MAE
    val_mae = mean_absolute_error(y_test, y_pred)
    print(f"Iteration {i}, Validation MAE: {val_mae:.4f}")

    # Check if validation MAE is below the threshold
    if val_mae < mae_threshold:
        print(f"Early stopping at iteration {i} with Validation MAE: {val_mae:.4f}")
        break


Iteration 1, Validation MAE: 11664.5013
Iteration 2, Validation MAE: 8853.9852
Iteration 3, Validation MAE: 8048.4459
Iteration 4, Validation MAE: 7800.0353
Iteration 5, Validation MAE: 7256.0694
Iteration 6, Validation MAE: 6689.7764
Iteration 7, Validation MAE: 6457.9043
Iteration 8, Validation MAE: 6373.8430
Iteration 9, Validation MAE: 6076.8090
Iteration 10, Validation MAE: 5980.2931
Iteration 11, Validation MAE: 5837.8432
Iteration 12, Validation MAE: 5741.9000
Iteration 13, Validation MAE: 5643.7416
Iteration 14, Validation MAE: 5638.8780
Iteration 15, Validation MAE: 5453.7303
Iteration 16, Validation MAE: 5337.3444
Iteration 17, Validation MAE: 5390.2860
Iteration 18, Validation MAE: 5281.2587
Iteration 19, Validation MAE: 5222.7343
Iteration 20, Validation MAE: 5134.5171
Iteration 21, Validation MAE: 5080.6279
Iteration 22, Validation MAE: 5080.6138
Iteration 23, Validation MAE: 5052.2900
Iteration 24, Validation MAE: 5007.8254
Iteration 25, Validation MAE: 4965.0268
Early st

### Save Model

In [77]:
import joblib

In [78]:
joblib.dump(model, 'random_forest_model.pkl')

['random_forest_model.pkl']