In [1]:
import pandas as pd
import numpy as np
import sys
import os

from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso

# Add the parent directory to sys.path to access 'functions.py'
sys.path.append(os.path.join(os.pardir))

from functions import *  # import custom functions

from model_tuner import *

print(f"This project uses Python {sys.version.split()[0]}.")

This project uses Python 3.10.6.


In [2]:
# `base_path`` represents the parent directory of your current working directory
base_path = os.path.join(os.pardir)

# Go up one level from 'notebooks' to the parent directory, then into the 'data' folder
data_path = os.path.join("../data")

# Use the function to ensure the 'data' directory exists
ensure_directory(data_path)

Directory exists: ../data


In [3]:
df = pd.read_csv(os.path.join(data_path, "redfin_2024-04-16-15-59-17.csv"))

In [4]:
df = df.drop(df.index[0])

In [5]:
X = df[["BEDS", "BATHS", "SQUARE FEET", "LOT SIZE"]]
y = df[["PRICE"]]

In [6]:
# Define the set of hyperparameters to tune
parameters_lasso = [
    {
        "lasso__alpha": [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        "lasso__fit_intercept": [True, False],
        "lasso__precompute": [False],
        "lasso__copy_X": [True, False],
        "lasso__max_iter": [100, 500],
        "lasso__tol": [1e-4, 1e-3],
        "lasso__warm_start": [True, False],
        "lasso__positive": [True, False],
    }
]

In [7]:
lasso_reg = Lasso()

estimator_name = "lasso"
# Set the parameters by cross-validation

kfold = False
calibrate = False

In [8]:
model = Model(
    name="Redfin_model",
    estimator_name=estimator_name,
    model_type="regression",
    calibrate=calibrate,
    estimator=lasso_reg,
    kfold=kfold,
    stratify=True,
    grid=parameters_lasso,
    randomized_grid=False,
    impute=True,
    # n_iter=3,
    scoring=["r2"],
    # n_splits=2,
    random_state=3,
)

# model.grid_search_param_tuning(X, y)

model.fit(X, y)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [10]:
model.predict(X_test)

array([13073989.11036093,  7316536.11193256, 40599243.53496064,
        6454794.54892854, 15545972.68138833,  9255932.84106737,
        4416296.35381776, 16018870.618109  ,  6929912.55485607,
       12271101.92259133, 12100667.30777553, 30212601.9363673 ,
         615832.16405754, 19123040.05301389,  8864882.96347097,
       14906320.02822273,  6879201.97276068, 26852114.19526781,
       14908818.89031851, 10752242.4944117 ,  2005738.66162506,
         340292.33370156,   663332.6470173 , 17550442.06711074,
        6303587.13109749,  8383795.74700374, 11644405.28702934,
       13192922.31577393, 14951540.17813939, 23184039.7734936 ,
        9822735.97060197, 14939681.80428924, 14830321.97089816,
        2771165.52399362,  2895848.01298962, 14631234.18210411,
        6495349.57561595, 13342721.12424324, 16373044.38092147,
       68323516.63217366, 15073811.56867501, 20269106.81997997,
       24658083.82715943, 14951373.32880565,  9464559.76205897,
        9125608.20495271, 41305571.95464