In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

Datasets required to run this notebook:
- Feature-engineered base price data (`CLEAN_RoomBasePrice_11_06_2024.csv`, loaded in the Import Data section)


## Utility Functions

### Data preprocessing utilities

In [3]:
# One-hot encode values using an auxiliary document of all categories
def custom_one_hot_encode(df, all_categories, column_name):
    """
    One-hot encode a specific column in a DataFrame, including all possible categories.

    The result always carries exactly one indicator column per unique value of
    all_categories, named f"{column_name}_{category}" and ordered by first
    appearance in all_categories. Categories that never occur in df become
    all-zero columns. Values of df[column_name] that are not listed in
    all_categories get no indicator column, so such rows are all zeros.

    Parameters:
    df (pd.DataFrame): The DataFrame to be one-hot encoded. It is not modified.
    all_categories (pd.Series): A Series (or other list-like) containing all possible categorical values.
    column_name (str): The name of the column to be one-hot encoded.

    Returns:
    pd.DataFrame: A new DataFrame in which column_name is replaced by the
    one-hot encoded columns, appended after the remaining original columns.
    """
    # Wrapping in pd.Series lets plain lists/arrays be passed as well as a Series.
    expected_columns = [
        f"{column_name}_{category}"
        for category in pd.Series(all_categories).unique()
    ]

    # Indicator columns for the categories that actually occur in df.
    one_hot_df = pd.get_dummies(df[column_name], prefix=column_name)

    # Reindex the dummies directly onto the full category list. Concatenating
    # them with an empty frame that has the same column names first would
    # duplicate the labels, and reindexing an axis with duplicate labels raises
    # ValueError.
    one_hot_df = one_hot_df.reindex(columns=expected_columns, fill_value=0)

    # Swap the original categorical column for its encoded form.
    return pd.concat([df.drop(columns=[column_name]), one_hot_df], axis=1)

In [4]:
def drop_columns(df, columns_to_drop):
    """
    Return a copy of a DataFrame without the given columns.

    Parameters:
    df (pd.DataFrame): Source DataFrame; it is left unchanged.
    columns_to_drop (list): Names of the columns to remove.

    Returns:
    pd.DataFrame: A new DataFrame holding every column of df except those
    listed in columns_to_drop.
    """
    return df.drop(columns=columns_to_drop)

### Training Utilities

In [5]:
def TrainFitPredict(model, iterations, X, y, X_test, y_test):
  """Fit a staged (boosting) regressor and score every stage on the test set.

  Parameters:
  model: Unfitted estimator exposing fit(), staged_predict() and a
    train_score_ attribute after fitting (for example sklearn's
    GradientBoostingRegressor).
  iterations (int): Expected number of boosting stages. Kept for backward
    compatibility only; the returned test scores are sized by the number of
    stages the fitted model actually yields.
  X, y: Training features and targets passed to model.fit().
  X_test, y_test: Held-out features and targets used for the per-stage scores.

  Returns:
  tuple: (test_score, train_score, model) where test_score is a float64 array
  with the test-set mean squared error after each stage, train_score is
  model.train_score_, and model is the fitted estimator itself.
  """
  model.fit(X, y)
  # Build the scores from the stages the model really produced. Preallocating
  # `iterations` slots raised IndexError when the model had more stages and
  # left misleading trailing zeros when it had fewer.
  test_score = np.array(
      [mean_squared_error(y_test, y_pred) for y_pred in model.staged_predict(X_test)],
      dtype=np.float64,
  )
  return test_score, model.train_score_, model

In [6]:
def GetModelMAE(trained_model, X_test, y_test):
  """Return the mean absolute error of a fitted model on a test set.

  Parameters:
  trained_model: Fitted estimator exposing predict().
  X_test: Features to predict on.
  y_test: True target values matching X_test.

  Returns:
  The value of mean_absolute_error(y_test, predictions).
  """
  predictions = trained_model.predict(X_test)
  return mean_absolute_error(y_test, predictions)

In [7]:
def PlotPerformance(test_scores, train_scores):
  """Plot per-iteration training and test scores on a single figure.

  Intended to receive the test and training scores returned by
  TrainFitPredict. Each series is drawn against a 1-based iteration index.

  Parameters:
  test_scores: Sequence of test-set scores, one per boosting iteration.
  train_scores: Sequence of training-set scores, one per boosting iteration.

  Returns:
  None. The figure is shown with plt.show().
  """
  fig = plt.figure(figsize=(6, 6))
  ax = fig.add_subplot(1, 1, 1)
  ax.set_title("MSE")

  # Iterations are numbered from 1 on the x-axis.
  train_iterations = np.arange(len(train_scores)) + 1
  ax.plot(train_iterations, train_scores, "b-", label="Training Set Deviance")

  test_iterations = np.arange(len(test_scores)) + 1
  ax.plot(test_iterations, test_scores, "r-", label="Test Set Deviance")

  ax.legend(loc="upper right")
  ax.set_xlabel("Boosting Iterations")
  ax.set_ylabel("Deviance")
  fig.tight_layout()
  plt.show()

## Import Data

In [8]:
# Load the cleaned room base price CSV into `data`.
# NOTE(review): the path is absolute and hard-coded ('/content/...' looks like a
# Colab working directory — confirm); the file must exist there for this cell to run.
data = pd.read_csv('/content/CLEAN_RoomBasePrice_11_06_2024.csv')

In [20]:
# Columns to remove before training.
# NOTE(review): presumably a leftover CSV index column and a row identifier — confirm.
columnsToDropBeforeTraining = [
    'Unnamed: 0',
    'room_id',
]

## Preprocess Categoricals
One-hot encode all categorical columns.

## Preprocess Numericals
Either normalize them or leave them unchanged.

## Training

In [None]:
# Separate the feature matrix (X) from the regression target (y).
# NOTE(review): `df_final` is not defined in any visible cell — presumably it is
# the output of the preprocessing steps above; confirm it exists on a fresh
# Restart & Run All.
X = df_final.drop(columns=['average_baseline_price'])
y = df_final['average_baseline_price']

In [None]:
# Hold out 20% of the rows as the test set, using a fixed random_state.
# NOTE(review): the literals 0.2 / 42 used here differ from TEST_RATIO (0.25) and
# RANDOM_STATE (123) defined in the Hyperparameters cell — confirm which are intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Hyperparameters

In [None]:
# Shared experiment constants.
# NOTE(review): TEST_RATIO is not referenced by any visible cell — confirm it is still needed.
TEST_RATIO = 0.25
RANDOM_STATE = 123

# Boosting hyperparameters.
# NOTE(review): the keys look like GradientBoostingRegressor arguments, but no
# visible cell consumes `params` — confirm the intended model.
params = dict(
    n_estimators=1000,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
    loss="squared_error",
)

In [None]:
# Candidate regressors, built in the order they are evaluated below.
# NOTE(review): 10000 estimators for both ensembles is likely slow to fit — confirm this is intended.
XGB = XGBRegressor(n_estimators=10000, max_depth=2, learning_rate=0.1)
RFR = RandomForestRegressor(n_estimators=10000, random_state=RANDOM_STATE)
DTR = DecisionTreeRegressor(random_state=RANDOM_STATE)
LR = LinearRegression()
ModelsList = [XGB, RFR, DTR, LR]

# Fit each model on the training split and record (class name, test-set MAE).
modelMAEs = []
for model in ModelsList:
  model.fit(X_train, y_train)
  modelMAEs.append((type(model).__name__, GetModelMAE(model, X_test, y_test)))
print(modelMAEs)