In [1]:
import pandas as pd
import numpy as np
import os
import sys

sys.path.append(os.path.join(os.pardir))

from functions import *
from model_tuner import *
from sklearn.linear_model import Lasso, Ridge, SGDRegressor, ElasticNet

from xgboost import XGBRegressor

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [2]:
# Two throwaway frames on the default RangeIndex, used to sanity-check a
# column-wise concat in the next cell.
df1 = pd.DataFrame([[1, 3], [2, 4]], columns=["A", "B"])
df2 = pd.Series([5, 6], name="Y").to_frame()

In [3]:
# Place the two demo frames side by side, aligned on their shared index.
pd.concat([df1, df2], axis="columns")

Unnamed: 0,A,B,Y
0,1,3,5
1,2,4,6


In [4]:
# `base_path` is the parent of the current working directory (os.pardir, i.e. "..").
# NOTE(review): `base_path` is not referenced by any other visible cell.
base_path = os.path.join(os.pardir)

# Relative path: up one level from the working directory, then into 'public_data'
data_path = "../public_data/"

# Check the data directory with the project helper `ensure_directory`
# (the recorded output reports "Directory exists: ../public_data/")
ensure_directory(data_path)

Directory exists: ../public_data/


In [5]:
# Read the Redfin export workbook from the data directory into a DataFrame.
redfin_xlsx = os.path.join(data_path, "redfin_2024-04-16-15-59-17.xlsx")
df = pd.read_excel(redfin_xlsx)

In [6]:
# Drop the first row of the loaded sheet.
# NOTE(review): presumably a non-data row in the export — confirm. Because `df`
# is reassigned from itself, re-running this cell removes one more row each time.
df = df.drop(df.index[0])

In [7]:
# Feature matrix and target; both stay two-dimensional (DataFrames).
feature_columns = ["BEDS", "BATHS", "SQUARE FEET", "LOT SIZE"]
X = df.loc[:, feature_columns]
y = df.loc[:, ["PRICE"]]

In [8]:
# Candidate values for each Lasso hyperparameter. Keys use the "lasso__"
# prefix, matching the estimator name used for this model.
lasso_search_space = {
    "lasso__fit_intercept": [True, False],
    "lasso__precompute": [False],
    "lasso__copy_X": [True, False],
    "lasso__max_iter": [100, 500, 1000, 2000],
    "lasso__tol": [1e-4, 1e-3],
    "lasso__warm_start": [True, False],
    "lasso__positive": [True, False],
}

# The grid is handed to the tuner as a list holding this single search space.
parameters_lasso = [lasso_search_space]

## Lasso

In [9]:
# Name used for this estimator (the grid keys above carry the same prefix)
# and the default-configured Lasso instance to tune.
estimator_name = "lasso"
lasso_reg = Lasso()

# Flags forwarded to Model below: k-fold and calibration both switched off.
kfold = calibrate = False

In [11]:
# Wrap the Lasso estimator in the project's `Model` tuner: regression mode,
# imputation requested, randomized grid over `parameters_lasso`, scored on r2.
model1 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=lasso_reg,
    kfold=kfold,
    stratify_y=False,
    grid=parameters_lasso,
    randomized_grid=True,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)

# Hyperparameter search on the full X / y.
# NOTE(review): the recorded run of this cell ends in
# "ValueError: Input X contains NaN" even though impute=True is passed. The
# progress bar stopped at 0/100, so the failure is presumably raised during
# this search — confirm where unimputed NaNs reach Lasso.
model1.grid_search_param_tuning(X, y)

# Retrieve the train / test / validation partitions from the tuner.
X_train, y_train = model1.get_train_data(X, y)
X_test, y_test = model1.get_test_data(X, y)
X_valid, y_valid = model1.get_valid_data(X, y)

# Fit on the training partition only.
model1.fit(X_train, y_train)

# Report metrics on the validation and test partitions.
print("Validation Metrics")
model1.return_metrics(X_valid, y_valid)
print("Test Metrics")
model1.return_metrics(X_test, y_test)
print("Bootstrap Metrics")

# Convert the test partition to NumPy arrays before bootstrapping.
# NOTE(review): this rebinds X_test / y_test; the later `model2.predict(X_test)`
# cell reads this converted X_test.
X_test = np.array(X_test)
y_test = np.array(y_test)
# Bootstrapped r2 and explained variance on the test partition. As the last
# expression in the cell, the return value is what the notebook displays.
# NOTE(review): the meaning of n_samples=2 and balance=True for a regression
# target is not visible here — confirm against the tuner's documentation.
model1.return_bootstrap_metrics(
    X_test,
    y_test,
    metrics=["r2", "explained_variance"],
    n_samples=2,
    num_resamples=300,
    balance=True,
)

  0%|          | 0/100 [00:00<?, ?it/s]


ValueError: Input X contains NaN.
Lasso does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Ridge Regression

In [None]:
# Name used to prefix the Ridge grid keys, and the estimator instance itself.
estimator_name = "ridge"
ridge_reg = Ridge()

# Search grid: Ridge `max_iter` and `alpha` (prefixed with the estimator
# name), plus `k` for the "selectKBest" step.
grid_search_params = {
    f"{estimator_name}__max_iter": [100, 200, 500],
    f"{estimator_name}__alpha": [0.1, 1, 0.5],
    "selectKBest__k": [1, 2, 3],
}

# Flags forwarded to Model below: k-fold and calibration both switched off.
kfold = calibrate = False

In [None]:
import warnings

from sklearn.exceptions import DataConversionWarning

# Silence scikit-learn's DataConversionWarning for the rest of the session.
# NOTE(review): presumably triggered because `y` is a one-column DataFrame
# rather than a 1-D array — confirm.
warnings.filterwarnings(action="ignore", category=DataConversionWarning)

In [None]:
# Wrap the Ridge estimator in the project's `Model` tuner with the selectKBest
# option enabled, a non-randomized grid over `grid_search_params`, scored on r2.
# NOTE(review): `kfold` is False here (set in the Ridge setup cell), yet
# n_splits=5 is passed and the fit / metrics calls below follow the
# whole-dataset pattern the comment describes for k-fold — confirm which
# mode is intended.
model2 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=ridge_reg,
    kfold=kfold,
    selectKBest=True,
    stratify_y=False,
    grid=grid_search_params,
    randomized_grid=False,
    impute=True,
    scoring=["r2"],
    n_splits=5,
    random_state=3,
)

# Hyperparameter search on the full X / y.
model2.grid_search_param_tuning(X, y)

# With k-fold, the whole dataset is passed to both fit and return_metrics.
# The metrics call is the cell's last expression, so its return value is displayed.
model2.fit(X, y)
model2.return_metrics(X, y)

In [None]:
# Predict with the tuned Ridge model.
# NOTE(review): `X_test` is not created in the Ridge section; this reads the
# NumPy array left behind by the Lasso cell, so it depends on that earlier
# cell having run successfully.
model2.predict(X_test)

## SGD Regressor

In [None]:
# Name used to prefix the SGD grid keys, and the seeded estimator instance.
estimator_name = "sgdregressor"
sgd_reg = SGDRegressor(random_state=3)

# Flags forwarded to Model below: k-fold and calibration both switched off.
kfold = calibrate = False

In [None]:
# Full candidate list for every SGDRegressor hyperparameter.
sgd_candidates = {
    "sgdregressor__loss": [
        "squared_error",
        "huber",
        "epsilon_insensitive",
        "squared_epsilon_insensitive",
    ],
    "sgdregressor__penalty": [None, "l2", "l1", "elasticnet"],
    "sgdregressor__alpha": [0.0001, 0.001, 0.01, 0.1],
    # l1_ratio is only used if penalty is 'elasticnet'
    "sgdregressor__l1_ratio": [0.15, 0.25, 0.5, 0.75],
    "sgdregressor__fit_intercept": [True, False],
    "sgdregressor__max_iter": [1000, 2000, 3000],
    "sgdregressor__tol": [1e-3, 1e-4],
    # epsilon is only used for 'huber' and 'epsilon_insensitive'
    "sgdregressor__epsilon": [0.1, 0.2],
    "sgdregressor__learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
    "sgdregressor__eta0": [0.01, 0.1],
    "sgdregressor__power_t": [0.25, 0.5],
    "sgdregressor__early_stopping": [True, False],
    "sgdregressor__validation_fraction": [0.1, 0.2],
    "sgdregressor__n_iter_no_change": [5, 10],
    "sgdregressor__warm_start": [True, False],
    "sgdregressor__average": [False, True, 10],
}

# Only these hyperparameters are searched over all of their candidates; every
# other one is pinned to its first candidate to keep the grid small.
sgd_searched_in_full = {"sgdregressor__loss", "sgdregressor__epsilon"}

parameters_sgd = [
    {
        name: (values if name in sgd_searched_in_full else values[:1])
        for name, values in sgd_candidates.items()
    }
]

In [None]:
# Wrap the SGD regressor in the project's `Model` tuner: regression mode,
# imputation requested, non-randomized grid over `parameters_sgd`, scored on r2.
model3 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=sgd_reg,
    kfold=kfold,
    stratify_y=False,
    grid=parameters_sgd,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)
# Hyperparameter search on the full X / y.
model3.grid_search_param_tuning(X, y)


# Retrieve the train / test / validation partitions from the tuner.
# This rebinds X_test / y_test (and the other split names) for later cells.
X_train, y_train = model3.get_train_data(X, y)
X_test, y_test = model3.get_test_data(X, y)
X_valid, y_valid = model3.get_valid_data(X, y)

# Fit on the training partition only.
model3.fit(X_train, y_train)

# Report metrics on the validation and test partitions; the final call is the
# cell's last expression, so its return value is displayed.
print("Validation Metrics")
model3.return_metrics(X_valid, y_valid)
print("Test Metrics")
model3.return_metrics(X_test, y_test)

In [None]:
# Predict on the test partition retrieved from model3 in the previous cell.
model3.predict(X_test)

## ElasticNet

In [None]:
# Name used to prefix the ElasticNet grid keys, and the seeded estimator.
estimator_name = "elasticnet"
elastic_net = ElasticNet(random_state=3)

# Calibration stays off; k-fold is passed directly in the Model call for this
# estimator rather than through the `kfold` variable.
calibrate = False

In [None]:
# Display the estimator's repr to inspect how it is configured.
elastic_net

In [None]:
# Candidate values for each ElasticNet hyperparameter. Keys use the
# "elasticnet__" prefix, matching the estimator name used for this model.
enet_search_space = {
    "elasticnet__alpha": [0.01, 0.05, 0.1, 0.5],
    "elasticnet__l1_ratio": [0.1, 0.2, 0.3],
    "elasticnet__fit_intercept": [True, False],
    "elasticnet__precompute": [False],
    "elasticnet__tol": [1e-4, 1e-3],
    "elasticnet__warm_start": [True, False],
    "elasticnet__positive": [True, False],
    "elasticnet__selection": ["cyclic", "random"],
}

# The grid is handed to the tuner as a list holding this single search space.
parameters_enet = [enet_search_space]

In [None]:
# Wrap the ElasticNet estimator in the project's `Model` tuner with k-fold
# enabled, imputation requested, a non-randomized grid over `parameters_enet`,
# scored on r2.
model4 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=elastic_net,
    kfold=True,
    stratify_y=False,
    grid=parameters_enet,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)


# Hyperparameter search on the full X / y.
model4.grid_search_param_tuning(X, y)

# With k-fold, the whole dataset is passed to both fit and return_metrics.
# The metrics call is the cell's last expression, so its return value is displayed.
model4.fit(X, y)
model4.return_metrics(X, y)

In [None]:
# model4.kfold = False
# NOTE(review): the ElasticNet section never creates a test split; `X_test`
# here is whatever an earlier section left in the kernel (the SGD cell's test
# partition when the notebook is run top to bottom).
model4.predict(X_test)

## XGBoost

In [None]:
# Name used to prefix the XGBoost grid keys, and the seeded regressor.
estimator_name = "xgb"
xg_boost = XGBRegressor(random_state=3)

# Flags forwarded to Model below: k-fold and calibration both switched off.
kfold = calibrate = False

In [None]:
# Display the estimator's repr to inspect how it is configured.
xg_boost

In [None]:
# Hyperparameter candidates for XGBoost. Each multi-valued list is truncated
# to its first entry with [:1] to keep the search small; drop the slice to
# search all of the listed candidates.
xgb_learning_rates = [0.1, 0.01, 0.05][:1]  # learning rate (eta)
xgb_n_estimators = [100, 200, 300][:1]  # number of boosting rounds (trees)
xgb_max_depths = [3, 5, 7][:1]  # maximum depth of each tree
xgb_subsamples = [0.8, 1.0][:1]  # subsample ratio of the training instances
xgb_colsample_bytree = [0.8, 1.0][:1]  # subsample ratio of columns per tree

# Evaluation metric monitored for early stopping. This must be a regression
# metric: "logloss" is defined for probabilistic classification and gives no
# usable signal on a continuous target such as PRICE.
xgb_eval_metric = ["rmse"]
xgb_early_stopping_rounds = [10]
xgb_verbose = [False]

# Combine the candidates into the grid passed to the tuner. Keys use the
# "xgb__" prefix, matching the estimator name used for this model.
xgb_parameters = [
    {
        "xgb__learning_rate": xgb_learning_rates,
        "xgb__n_estimators": xgb_n_estimators,
        "xgb__max_depth": xgb_max_depths,
        "xgb__subsample": xgb_subsamples,
        "xgb__colsample_bytree": xgb_colsample_bytree,
        "xgb__eval_metric": xgb_eval_metric,
        "xgb__early_stopping_rounds": xgb_early_stopping_rounds,
        "xgb__verbose": xgb_verbose,
    }
]

In [None]:
# Re-select the same feature and target columns from `df` as DataFrames,
# ahead of the NumPy conversion in the next cell.
feature_columns = ["BEDS", "BATHS", "SQUARE FEET", "LOT SIZE"]
X = df.loc[:, feature_columns]
y = df.loc[:, ["PRICE"]]

In [None]:
# Convert features and target to NumPy arrays. np.asarray passes an existing
# ndarray through unchanged, so re-running this cell is safe; `.to_numpy()`
# would raise AttributeError the second time because X / y are rebound to
# ndarrays here.
X = np.asarray(X)
y = np.asarray(y)

In [None]:
# Wrap the XGBoost regressor in the project's `Model` tuner with the
# xgboost_early option enabled, a non-randomized grid over `xgb_parameters`,
# scored on r2.
# NOTE(review): this rebinds `model4`, which held the ElasticNet tuner until
# now; a distinct name (e.g. `model5`) would keep both models available.
model4 = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=xg_boost,
    kfold=kfold,
    stratify_y=False,
    grid=xgb_parameters,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
    xgboost_early=True,
)

# Hyperparameter search on the full X / y (NumPy arrays at this point).
model4.grid_search_param_tuning(X, y)

# Explicit 60 / 20 / 20 train / validation / test split, reusing the tuner's
# own calibrate flag and random_state.
X_train, X_valid, X_test, y_train, y_valid, y_test = model4.train_val_test_split(
    X,
    y,
    stratify_y=False,
    stratify_cols=None,
    train_size=0.6,
    validation_size=0.2,
    test_size=0.2,
    calibrate=model4.calibrate,
    random_state=model4.random_state,
)

# Fit on the training partition, passing the validation partition explicitly
# (presumably consumed by XGBoost early stopping — confirm).
model4.fit(X_train, y_train, validation_data=(X_valid, y_valid))

# Metrics on the held-out test partition; as the cell's last expression, the
# return value is displayed.
model4.return_metrics(X_test, y_test)

In [None]:
# Preview the first few target values rather than dumping the whole array.
y[:5]