In [73]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, KFold
from numpy import mean, sqrt
from sklearn.model_selection import GridSearchCV

In [74]:
# Reading in the data
ames = pd.read_csv('/content/AmesHousing.csv')
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


<H2>13.2.5 Your turn</H2>
<H4>Practice Activity</H4>
Consider four possible models for predicting house prices:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [75]:
X = ames.drop('SalePrice', axis=1)
y = ames['SalePrice']

# Assuming X is feature matrix and y is the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# preprocessor for numerical and categorical features
numeric_features = ['Gr Liv Area', 'TotRms AbvGrd']
numeric_transformer = StandardScaler()

categorical_features = ['Bldg Type']
categorical_transformer = OneHotEncoder()

# preprocessor for Model 1
preprocessor1 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

# preprocessor for Model 2
preprocessor2 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# preprocessor for Model 3
interaction = PolynomialFeatures(interaction_only=True)
numeric_transformer3 = Pipeline(steps=[('scaler', StandardScaler()),
                                       ('interaction', interaction)])

preprocessor3 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer3, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# preprocessor for Model 4
poly = PolynomialFeatures(degree=5)
numeric_transformer4 = Pipeline(steps=[('scaler', StandardScaler()),
                                       ('poly', poly)])

preprocessor4 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer4, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [76]:
# models with the preprocessors
models = {
    "model1": Pipeline([
        ("preprocessor", preprocessor1),
        ("reg", LinearRegression())
    ]),
    "model2": Pipeline([
        ("preprocessor", preprocessor2),
        ("reg", LinearRegression())
    ]),
    "model3": Pipeline([
        ("preprocessor", preprocessor3),
        ("reg", LinearRegression())
    ]),
    "model4": Pipeline([
        ("preprocessor", preprocessor4),
        ("reg", LinearRegression())
    ])
}

# Fitting the models and computing RMSE
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f"RMSE for {name}: {rmse}")

RMSE for model1: 55372.453007850665
RMSE for model2: 54083.12550273998
RMSE for model3: 53297.401907980704
RMSE for model4: 97441.25029442331


Model 3 is the best one with an RMSE of 53297.40

In [77]:

# cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# RMSE for each model using cross-validation
for name, model in models.items():
    # Using cross_val_score with scoring='neg_mean_squared_error' since cross_val_score does not support RMSE directly
    scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
    # square root of the scores and negate them to get RMSE as cross_val_score returns negative scores
    rmse_scores = sqrt(-scores)
    print(f"Cross-validated RMSE for {name}: {mean(rmse_scores)}")


Cross-validated RMSE for model1: 55769.33212965836
Cross-validated RMSE for model2: 53975.92253624382
Cross-validated RMSE for model3: 53782.90126975814
Cross-validated RMSE for model4: 84852.67117352634


After cross-validation Model 3 is still the best one.

In [78]:
# preprocessors for 'size' and 'number_of_rooms'
size_transformer = Pipeline(steps=[('poly', PolynomialFeatures()),
                                   ('scaler', StandardScaler())])

rooms_transformer = Pipeline(steps=[('poly', PolynomialFeatures()),
                                    ('scaler', StandardScaler())])

categorical_features = ['Bldg Type']
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('size', size_transformer, ['Gr Liv Area']),
        ('rooms', rooms_transformer, ['TotRms AbvGrd']),
        ('cat', categorical_transformer, categorical_features)])

# pipeline
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('reg', LinearRegression())])

# parameter grid
param_grid = {
    'preprocessor__size__poly__degree': list(range(1, 11)),
    'preprocessor__rooms__poly__degree': list(range(1, 11)),
}

# grid search
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='r2')
grid.fit(X, y)

print(f"Best parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")


Best parameters: {'preprocessor__rooms__poly__degree': 1, 'preprocessor__size__poly__degree': 3}
Best score: 0.5576406129585605


Trying all possible model options can be computationally expensive and time-consuming, especially if the number of options is large. It can also lead to overfitting if the best model is too complex.

We can choose a smaller number of tuning values to try,if we have the domain knowledge to narrow down the range of plausible values. We  could also use a more efficient search strategy, like randomized search or Bayesian optimization, which can find good values with fewer iterations.

As our goal is to balance the computational cost of trying many hyperparameters with the potential benefit of finding a better model. It’s also important to evaluate the model on a separate validation set to ensure that it generalizes well to new data.