In [1]:
import pandas as pd
import numpy as np
import os
import sys

sys.path.append(os.path.join(os.pardir))

from functions import *
from model_tuner import *
from sklearn.linear_model import Lasso, Ridge, SGDRegressor, ElasticNet

from xgboost import XGBRegressor

In [2]:
# `base_path`` represents the parent directory of your current working directory
base_path = os.path.join(os.pardir)

# Go up one level from 'notebooks' to the parent directory, then into the 'data' folder
data_path = os.path.join("../data")

# Use the function to ensure the 'data' directory exists
ensure_directory(data_path)

Directory exists: ../data


In [3]:
df = pd.read_csv(os.path.join(data_path, "redfin_2024-04-16-15-59-17.csv"))

In [4]:
df = df.drop(df.index[0])

In [5]:
X = df[["BEDS", "BATHS", "SQUARE FEET", "LOT SIZE"]]
y = df[["PRICE"]]

In [6]:
# Define the set of hyperparameters to tune
parameters_lasso = [
    {
        "lasso__alpha": [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        "lasso__fit_intercept": [True, False],
        "lasso__precompute": [False],
        "lasso__copy_X": [True, False],
        "lasso__max_iter": [100, 500],
        "lasso__tol": [1e-4, 1e-3],
        "lasso__warm_start": [True, False],
        "lasso__positive": [True, False],
    }
]

## Lasso

In [None]:
lasso_reg = Lasso()

estimator_name = "lasso"
# Set the parameters by cross-validation

kfold = False
calibrate = False

In [None]:
model1 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=lasso_reg,
    kfold=kfold,
    stratify=True,
    grid=parameters_lasso,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)

model1.grid_search_param_tuning(X, y)

model1.fit(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [None]:
model1.predict(X_test)

## Ridge Regression

In [None]:
ridge_reg = Ridge()

estimator_name = "ridge"
# Set the parameters by cross-validation

kfold = False
calibrate = False

In [None]:
model2 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=ridge_reg,
    kfold=kfold,
    stratify=True,
    grid=[],
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)

# model.grid_search_param_tuning(X, y)

model2.fit(X, y)

In [None]:
model2.predict(X_test)

### SGD Regressor

In [None]:
sgd_reg = SGDRegressor(random_state=3)

estimator_name = "sgdregressor"
# Set the parameters by cross-validation

kfold = False
calibrate = False

In [None]:
parameters_sgd = [
    {
        "sgdregressor__loss": [
            "squared_error",
            "huber",
            "epsilon_insensitive",
            "squared_epsilon_insensitive",
        ],
        "sgdregressor__penalty": [None, "l2", "l1", "elasticnet"][:1],
        "sgdregressor__alpha": [0.0001, 0.001, 0.01, 0.1][:1],
        "sgdregressor__l1_ratio": [
            0.15,
            0.25,
            0.5,
            0.75,
        ][
            :1
        ],  # Only used if penalty is 'elasticnet'
        "sgdregressor__fit_intercept": [True, False][:1],
        "sgdregressor__max_iter": [1000, 2000, 3000][:1],
        "sgdregressor__tol": [1e-3, 1e-4][:1],
        "sgdregressor__epsilon": [
            0.1,
            0.2,
        ],  # Only used for 'huber' and 'epsilon_insensitive'
        "sgdregressor__learning_rate": [
            "constant",
            "optimal",
            "invscaling",
            "adaptive",
        ][:1],
        "sgdregressor__eta0": [
            0.01,
            0.1,
        ][
            :1
        ],  # Initial learning rate (used when learning_rate is 'constant', 'invscaling', or 'adaptive')
        "sgdregressor__power_t": [
            0.25,
            0.5,
        ][
            :1
        ],  # The exponent for inverse scaling learning rate
        "sgdregressor__early_stopping": [True, False][:1],
        "sgdregressor__validation_fraction": [
            0.1,
            0.2,
        ][
            :1
        ],  # The proportion of training data to set aside as validation set for early stopping
        "sgdregressor__n_iter_no_change": [
            5,
            10,
        ][
            :1
        ],  # Number of iterations with no improvement to wait before stopping
        "sgdregressor__warm_start": [True, False][:1],
        "sgdregressor__average": [
            False,
            True,
            10,
        ][
            :1
        ],  # Whether to average over some number of last updates
    }
]

In [None]:
model3 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=sgd_reg,
    kfold=kfold,
    stratify=True,
    grid=parameters_sgd,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)

model3.grid_search_param_tuning(X, y)

model3.fit(X, y)

In [None]:
model3.predict(X_test)

## ElasticNet

In [None]:
elastic_net = ElasticNet(random_state=3)

estimator_name = "elasticnet"
# Set the parameters by cross-validation

kfold = False
calibrate = False

In [None]:
elastic_net

In [None]:
# Define the set of hyperparameters to tune
parameters_enet = [
    {
        "elasticnet__alpha": [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        "elasticnet__l1_ratio": [0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 1],
        "elasticnet__fit_intercept": [True, False],
        "elasticnet__precompute": [
            False
        ],  # Typically False when using cv, as not all solvers support it
        "elasticnet__copy_X": [True, False],
        "elasticnet__max_iter": [
            100,
            500,
            1000,
        ],  # Elastic Net might require more i`terations
        "elasticnet__tol": [1e-4, 1e-3],
        "elasticnet__warm_start": [True, False],
        "elasticnet__positive": [True, False],
        "elasticnet__selection": [
            "cyclic",
            "random",
        ],  # Option to choose the type of feature selection
    }
]

In [None]:
model4 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=elastic_net,
    kfold=kfold,
    stratify=True,
    grid=parameters_enet,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)

model4.grid_search_param_tuning(X, y)

model4.fit(X, y)

In [None]:
model4.predict(X_test)

## XGBoost

In [7]:
xg_boost = XGBRegressor(random_state=3)

estimator_name = "xgb"
# Set the parameters by cross-validation

kfold = False
calibrate = False

In [8]:
xg_boost

In [21]:
# Define the hyperparameters for XGBoost
xgb_learning_rates = [0.1, 0.01, 0.05][:1]  # Learning rate or eta
xgb_n_estimators = [100, 200, 300][
    :1
]  # Number of trees. Equivalent to n_estimators in GB
xgb_max_depths = [3, 5, 7][:1]  # Maximum depth of the trees
xgb_subsamples = [0.8, 1.0][:1]  # Subsample ratio of the training instances
xgb_colsample_bytree = [0.8, 1.0][:1]
xgb_eval_metric = ["logloss"]
xgb_early_stopping_rounds = [10]
# xgb_tree_method = ["gpu_hist"]
# early_stopping_mode = ['min']
# early_stopping_patience = [5]
xgb_verbose = [False]
# Subsample ratio of columns when constructing each tree

# Combining the hyperparameters in a dictionary
xgb_parameters = [
    {
        "xgb__learning_rate": xgb_learning_rates,
        "xgb__n_estimators": xgb_n_estimators,
        "xgb__max_depth": xgb_max_depths,
        "xgb__subsample": xgb_subsamples,
        "xgb__colsample_bytree": xgb_colsample_bytree,
        "xgb__eval_metric": xgb_eval_metric,
        "xgb__early_stopping_rounds": xgb_early_stopping_rounds,
        # 'xgb__early_stopping_patience': early_stopping_patience,
        # "xgb_tree_method": xgb_tree_method,
        "xgb__verbose": xgb_verbose,
    }
]

In [22]:
model4 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=xg_boost,
    kfold=kfold,
    stratify=True,
    grid=xgb_parameters,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
    xgboost_early=True,
)

eval_set = [X, y]
model4.grid_search_param_tuning(X, y)

X_valid, y_valid = model4.get_valid_data(X, y)

model4.fit(X, y, validation_data=(X_valid, y_valid))

Parameters: { "verbose" } are not used.

100%|██████████| 1/1 [00:00<00:00, 15.79it/s]

Best score/param set found on validation set:
{'params': {'xgb__colsample_bytree': 0.8,
            'xgb__early_stopping_rounds': 10,
            'xgb__eval_metric': 'logloss',
            'xgb__learning_rate': 0.1,
            'xgb__max_depth': 3,
            'xgb__n_estimators': 100,
            'xgb__subsample': 0.8,
            'xgb__verbose': False},
 'score': 0.6382496179453679}
Best r2: 0.638 




Parameters: { "verbose" } are not used.

