In [4]:
import pandas as pd

# Load the Ames housing table, then separate the regression target from the
# explanatory columns.
target_name = "SalePrice"
ames_housing = pd.read_csv(
    "../datasets/ames_housing_no_missing.csv",
    # Disable missing-value marker detection (required for pandas>2.0).
    na_filter=False,
)
target = ames_housing[target_name]
data = ames_housing.drop(columns=target_name)

In [5]:
# Names of the columns used as numerical model inputs, one per line.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

# Restrict the feature table to exactly those columns.
data_numerical = data[numerical_features]


In [6]:
#Q1

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Both models are evaluated with 10-fold cross-validation on the numerical
# features; no explicit `scoring` is passed to `cross_validate`.

# Model 1: linear regression fitted on standardized features.
linear_model = make_pipeline(StandardScaler(), LinearRegression())
cv_results_linear = cross_validate(linear_model, data_numerical, target, cv=10)

# Model 2: decision tree with no depth limit set here and a fixed seed.
tree = DecisionTreeRegressor(random_state=0)
cv_results_tree = cross_validate(tree, data_numerical, target, cv=10)

# Number of folds where the linear model's test score is strictly higher
# than the tree's (element-wise comparison of the per-fold score arrays).
count = int(
    (cv_results_linear["test_score"] > cv_results_tree["test_score"]).sum()
)

print(f'the linear model is better in {count} situations')

the linear model is better in 9 situations


In [17]:
#Q2

from sklearn.model_selection import GridSearchCV

# Inner loop: choose `max_depth` among 1..15 with a 10-fold grid search.
param_grid = {"max_depth": np.arange(1, 16)}
tree_reg = GridSearchCV(estimator=tree, param_grid=param_grid, cv=10)

# Outer loop: 10-fold cross-validation of the whole search, keeping the
# fitted search object of every fold so its selected depth can be inspected.
cv_results = cross_validate(
    tree_reg,
    data_numerical,
    target,
    cv=10,
    n_jobs=2,
    return_estimator=True,
)

# Report the depth selected inside each outer fold (folds numbered from 1).
for fold_number, fitted_search in enumerate(cv_results["estimator"], start=1):
    print(
        f"Best hyperparameters for fold #{fold_number}:\n"
        f"{fitted_search.best_params_}"
    )

# Per-fold test scores of the two contenders.
linear_scores = cv_results_linear["test_score"]
tuned_tree_scores = cv_results["test_score"]

# Folds where linear regression strictly beats the depth-tuned tree.
count = int((linear_scores > tuned_tree_scores).sum())
print(f'the linear model is better in {count} situations')

# The same comparison from the tree's side.
tree_wins = int((tuned_tree_scores > linear_scores).sum())
print(
    "A tree with an optimized depth is better than linear regression for "
    f"{tree_wins} CV "
    "iterations out of 10 folds."
)

Best hyperparameters for fold #1:
{'max_depth': 5}
Best hyperparameters for fold #2:
{'max_depth': 7}
Best hyperparameters for fold #3:
{'max_depth': 6}
Best hyperparameters for fold #4:
{'max_depth': 6}
Best hyperparameters for fold #5:
{'max_depth': 8}
Best hyperparameters for fold #6:
{'max_depth': 6}
Best hyperparameters for fold #7:
{'max_depth': 7}
Best hyperparameters for fold #8:
{'max_depth': 8}
Best hyperparameters for fold #9:
{'max_depth': 7}
Best hyperparameters for fold #10:
{'max_depth': 6}
the linear model is better in 8 situations
A tree with an optimized depth is better than linear regression for 2 CV iterations out of 10 folds.


In [37]:
#Q4

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
# new_data = data.drop(columns="GarageArea")

# Columns whose dtype is `object` are treated as categorical.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

# Ordinal-encode the categorical columns; a category not seen during `fit`
# is encoded as -1 instead of raising. The numerical columns are passed
# through unchanged (no scaling is needed for a decision tree). Columns that
# belong to neither list are dropped, which is ColumnTransformer's default.
preprocessor = ColumnTransformer(
    [
        (
            "categorical",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            categorical_columns,
        ),
        ("numerical", "passthrough", numerical_features),
    ],
)

# Fixed-depth tree on the mixed feature set. `random_state=0` matches the
# seed of the earlier `tree` estimator so the cross-validation scores, and
# therefore the fold count printed below, are reproducible across runs.
new_tree = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=7, random_state=0),
)

cv_results_new = cross_validate(
    new_tree, data, target, cv=10, n_jobs=2, return_estimator=True
)

# `cv_results` holds the scores of the depth-tuned tree fitted on the
# numerical features only (computed in the Q2 cell), so this is a
# tree-versus-tree comparison, not a comparison against linear regression.
n_folds_improved = int(
    (cv_results_new["test_score"] > cv_results["test_score"]).sum()
)
print(
    "A tree with numerical and categorical data is better than a depth-tuned "
    "tree with numerical data only for "
    f"{n_folds_improved} CV "
    "iterations out of 10 folds."
)





A tree with numerical and categorical data is better than linear regression for 6 CV iterations out of 10 folds.
