In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

Consider four possible models for predicting house prices:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [2]:
ames = pd.read_csv("https://www.dropbox.com/scl/fi/g0n5le5p6fr136ggetfsf/AmesHousing.csv?rlkey=jlr9xtz1o6u5rghfo29a5c02f&dl=1")
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [4]:
X = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y = ames["SalePrice"]

In [6]:
X.head()

Unnamed: 0,Gr Liv Area,TotRms AbvGrd,Bldg Type
0,1656,7,1Fam
1,896,5,1Fam
2,1329,6,1Fam
3,2110,8,1Fam
4,1629,6,1Fam


In [7]:
y.head()

Unnamed: 0,SalePrice
0,215000
1,105000
2,172000
3,244000
4,189900


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [10]:
# model 1
ct1 = ColumnTransformer([
    ("scale", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
])
pipe1 = Pipeline([("pre", ct1), ("lr", LinearRegression())])

In [11]:
# model 2
ct2 = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ("scale", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
])
pipe2 = Pipeline([("pre", ct2), ("lr", LinearRegression())])

In [12]:
# model 3
ct3 = ColumnTransformer([
    ("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
     ["Gr Liv Area"]),
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])
])
pipe3 = Pipeline([("pre", ct3), ("lr", LinearRegression())])

In [13]:
# model 4
ct4 = ColumnTransformer([
    ("poly", PolynomialFeatures(degree=5, include_bias=False), ["Gr Liv Area", "TotRms AbvGrd"]),
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])
])
pipe4 = Pipeline([("pre", ct4), ("lr", LinearRegression())])

In [14]:
pipes = {"Model 1": pipe1, "Model 2": pipe2, "Model 3": pipe3, "Model 4": pipe4}

In [15]:
rmses = {}
for name, p in pipes.items():
    p.fit(X_train, y_train)
    preds = p.predict(X_test)
    rmses[name] = np.sqrt(mean_squared_error(y_test, preds))

In [16]:
rmses

{'Model 1': np.float64(59261.71322786227),
 'Model 2': np.float64(57078.218094312484),
 'Model 3': np.float64(56949.820811739235),
 'Model 4': np.float64(59494.4651581882)}

smallest RMSE = **Model 3** (size, type, interaction)

Once again consider four modeling options for house price:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [19]:
cv_rmse = {}
for name, p in pipes.items():
  # score based on root mean squared error
    scores = cross_val_score(p, X, y, cv = 5, scoring = "neg_root_mean_squared_error")
    cv_rmse[name] = -scores.mean()

In [20]:
cv_rmse

{'Model 1': np.float64(55806.32634926364),
 'Model 2': np.float64(54168.08142919383),
 'Model 3': np.float64(54344.55481548086),
 'Model 4': np.float64(65602.79037840001)}

lowest RMSE = **Model 2**

because cross validation is more reiable due to finding the average error across several splits, I trust that Model 2 is a better fit that Model 3 for generalizing this data.



Consider one hundred modeling options for house price:

* House size, trying degrees 1 through 10
* Number of rooms, trying degrees 1 through 10
* Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [22]:
ct_poly = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ("poly", PolynomialFeatures(include_bias=False), ["Gr Liv Area", "TotRms AbvGrd"])
])

pipe_poly = Pipeline([
    ("pre", ct_poly),
    ("lr", LinearRegression())
])

In [23]:
grid = {"pre__poly__degree": np.arange(1, 11)}

In [24]:
gscv = GridSearchCV(pipe_poly, grid, cv = 5, scoring = "neg_root_mean_squared_error")
gscv.fit(X, y)

In [25]:
results = pd.DataFrame({
    "Degree": np.arange(1, 11),
    "CV RMSE": -gscv.cv_results_["mean_test_score"]
})
results

Unnamed: 0,Degree,CV RMSE
0,1,54168.081429
1,2,54312.649488
2,3,53805.941322
3,4,56630.638924
4,5,65602.790379
5,6,91084.191883
6,7,191946.235835
7,8,351209.073861
8,9,610707.327744
9,10,610434.824139


Q1: The degree 3 model performed the best with the lowest RMSE
Q2: Even with the just 10 models, gridsearch takes a really long time to compute. As the degree increases we may alo run into overfitting and complexity issues. We can narrow the grid by starting with smaller ranges for the degrees.