In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

Datasets required to run this script:
- Feature engineered base price data


## Utility Functions

### Data preprocessing utilities

In [12]:
# Onehot encode values with auxillary document
def custom_one_hot_encode(df, all_categories, column_name):
    """
    One-hot encode a specific column in a DataFrame, including all possible categories.

    Parameters:
    df (pd.DataFrame): The DataFrame to be one-hot encoded.
    all_categories (pd.Series): A Series containing all possible categorical values.
    column_name (str): The name of the column to be one-hot encoded.

    Returns:
    pd.DataFrame: The DataFrame with one-hot encoded columns.
    """
    all_categories = all_categories.unique()

    # One-hot encode the column with all possible categories
    one_hot_df = pd.get_dummies(df[column_name], prefix=column_name)

    # Create a DataFrame with all possible categories set to 0
    all_categories_df = pd.DataFrame(columns=[f"{column_name}_{category}" for category in all_categories])
    all_categories_df = all_categories_df.reindex(columns=all_categories_df.columns, fill_value=0)

    # Concatenate the one-hot encoded df with the all_categories_df
    final_df = pd.concat([one_hot_df, all_categories_df], axis=1)

    # Ensure the final DataFrame has all columns from all_categories_df
    final_df = final_df.reindex(columns=all_categories_df.columns, fill_value=0)

    # Drop the original categorical column from the original DataFrame and concatenate with the final one-hot encoded DataFrame
    df = df.drop(columns=[column_name])
    df = pd.concat([df, final_df], axis=1)

    return df

In [13]:
def drop_columns(df, columns_to_drop):
    """
    Drop specified columns from a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame from which columns should be dropped.
    columns_to_drop (list): A list of column names to be dropped.

    Returns:
    pd.DataFrame: The DataFrame with specified columns dropped.
    """
    # Drop the specified columns
    df_dropped = df.drop(columns=columns_to_drop)
    return df_dropped

### Training Utilities

In [14]:
def TrainFitPredict(model,iterations,X, y,X_test ,y_test): #Takes in split dataset, returns test scores, train history, and the model itself.
  model.fit(X,y)
  test_score = np.zeros((iterations,), dtype=np.float64)
  for i, y_pred in enumerate(model.staged_predict(X_test)):
    test_score[i] = mean_absolute_error(y_test, y_pred)
  return test_score, model.train_score_, model

In [15]:
def GetModelMAE(trained_model, X_test, y_test):
  mae = mean_absolute_error(y_test, trained_model.predict(X_test))
  return mae

In [16]:
def PlotPerformance(test_scores, train_scores): #plots training + test by receiving test and training scores provided by TrainFitPredict
  fig = plt.figure(figsize=(6, 6))
  plt.subplot(1, 1, 1)
  plt.title("MSE")
  plt.plot(
      np.arange(len(train_scores)) + 1,
      train_scores,
      "b-",
      label="Training Set Deviance",
  )
  plt.plot(
      np.arange(len(test_scores)) + 1, test_scores, "r-", label="Test Set Deviance"
  )
  plt.legend(loc="upper right")
  plt.xlabel("Boosting Iterations")
  plt.ylabel("Deviance")
  fig.tight_layout()
  plt.show()

Import Data

In [17]:
data = pd.read_csv('/content/FE_RoomBasePrice.csv')

In [19]:
columnsToDropBeforeTraining = ['Unnamed: 0.1','Unnamed: 0', 'room_id']

## Obtain target label

In [20]:
average_baseline_price = data['average_baseline_price']

## Labels ready for training

In [25]:
ready_cols = ['ac', 'balcony', 'beachfront', 'breakfast', 'building_staff',
                             'cable_tv', 'essentials', 'garden', 'gym', 'hair_dryer',
                             'hanger', 'heating', 'hot_water', 'kitchen', 'linens',
                             'lock', 'luggage_drop_off', 'parking', 'pool',
                             'private_entrance', 'shampoo', 'tv', 'washer', 'wifi',
                             'workspace']
ready_cols_df = pd.DataFrame(data, columns=ready_cols)

## Preprocess Categoricals
One hot encode all categoricals

In [26]:
categorical_cols = ['unit_type_name', 'property_design', 'property_type','area_name']
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
categorical_encoded = onehot_encoder.fit_transform(data[categorical_cols])

categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=onehot_encoder.get_feature_names_out(categorical_cols))

## Preprocess numericals
Either normalize them, or not do anything at all.

In [28]:
numerical_cols = ['bedroom','bathroom','beds','capacity','lat','lng','distance_to_coastline','area_distance_to_airport','average_baseline_price', 'total_fas','ratio_bedroom_bathroom','ratio_bedroom_cap',
                  'avg_price_distance_to_coast', 'avg_price_distance_to_airport', 'avg_price_bedroom','avg_price_beds','avg_price_bathroom','avg_price_total_fas']
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(data[numerical_cols])

numerical_scaled_df = pd.DataFrame(numerical_scaled, columns=numerical_cols)


## Compiling Preprocessed Data

In [29]:
df_final = pd.concat([numerical_scaled_df, categorical_encoded_df,ready_cols_df, average_baseline_price.reset_index(drop=True)], axis=1)

##Training

In [32]:
X = df_final.drop(columns=['average_baseline_price'])
y = df_final['average_baseline_price']

### Hyperparameters

In [64]:
TEST_RATIO = 0.3
RANDOM_STATE = 123
params = {
    "n_estimators": 1000,
    "max_depth": 64,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "absolute_error",
}

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_RATIO, random_state=42)

In [43]:
LR = LinearRegression()
DTR = DecisionTreeRegressor(random_state=RANDOM_STATE)
RFR = RandomForestRegressor(n_estimators=params['n_estimators'],max_depth=params["max_depth"],random_state=RANDOM_STATE)
XGB = XGBRegressor(n_estimators=params['n_estimators'], max_depth=params["max_depth"], learning_rate=params['learning_rate'])
ModelsList = [
              XGB,
              RFR,
              DTR,
              LR,]
modelMAEs = []
for model in ModelsList:
  model.fit(X_train,y_train)
  modelMAEs.append((model.__class__.__name__, GetModelMAE(model, X_test, y_test)))
print(modelMAEs)

[('XGBRegressor', 110273.54280896107), ('RandomForestRegressor', 106512.54040518399), ('DecisionTreeRegressor', 105637.92082925077), ('LinearRegression', 3.757578594329328e+17)]


## Custom Training loop with RFR

In [70]:
model = RandomForestRegressor(n_estimators=1,max_depth=params["max_depth"],random_state=RANDOM_STATE)
mae_threshold = 77000
max_estimators = params['n_estimators']
# Custom training loop with early stopping
for i in range(1, params['n_estimators'] + 1):
    model.n_estimators = i
    model.fit(X_train, y_train)

    # Predict on validation set
    y_pred = model.predict(X_test)

    # Calculate validation MAE
    val_mae = mean_absolute_error(y_test, y_pred)
    print(f"Iteration {i}, Validation MAE: {val_mae:.4f}")

    # Check if validation MAE is below the threshold
    if val_mae < mae_threshold:
        print(f"Early stopping at iteration {i} with Validation MAE: {val_mae:.4f}")
        break


Iteration 1, Validation MAE: 107153.4905
Iteration 2, Validation MAE: 103800.7361
Iteration 3, Validation MAE: 110707.8833
Iteration 4, Validation MAE: 111933.1760
Iteration 5, Validation MAE: 92496.4421
Iteration 6, Validation MAE: 98227.1856
Iteration 7, Validation MAE: 93302.6182
Iteration 8, Validation MAE: 91887.1323
Iteration 9, Validation MAE: 94925.1095
Iteration 10, Validation MAE: 92439.0756
Iteration 11, Validation MAE: 85843.2345
Iteration 12, Validation MAE: 85491.5715
Iteration 13, Validation MAE: 84299.7381
Iteration 14, Validation MAE: 82782.6879
Iteration 15, Validation MAE: 84344.2154
Iteration 16, Validation MAE: 86618.6409
Iteration 17, Validation MAE: 82698.8524
Iteration 18, Validation MAE: 85355.9756
Iteration 19, Validation MAE: 84160.3118
Iteration 20, Validation MAE: 84095.8120
Iteration 21, Validation MAE: 82783.0421
Iteration 22, Validation MAE: 81289.5039
Iteration 23, Validation MAE: 81662.0750
Iteration 24, Validation MAE: 81480.1839
Iteration 25, Validat