In [58]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [3]:
lr = LinearRegression()

ames = pd.read_csv("/content/AmesHousing.csv")
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_s = (X_train - X_train.mean())/X_train.std()

lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_

array([ 70683.03115143, -18423.52343376])

Consider four possible models for predicting house prices:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [4]:
print(ames.columns)

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

# Pipelines and MSE

## Model 1: Using only the size and number of rooms

In [5]:
lr = LinearRegression()

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

lr = LinearRegression()

ct = ColumnTransformer(
  [
    ("standardize", StandardScaler(),["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop")


lr_pipeline = Pipeline([("preprocessing", ct),("linear_regression", LinearRegression())]
).set_output(transform="pandas")

model1 = lr_pipeline.fit(X_train, y_train)

y_preds = model1.predict(X_test)

# Calculating RMSE
mse = mean_squared_error(y_test, y_preds)
print(f'The MSE of model 2 is: {mse}')

# Compute the R^2 for the the model
r_squared = r2_score(y_test, y_preds)
print(f"The R_squared of model 2: {r_squared}")


The MSE of model 2 is: 3345358437.504374
The R_squared of model 2: 0.5275907538063935


## Model 2: Using size, number of rooms, and building type.

In [6]:
# Creating the column transformer
ct2 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),
        ('cat', OneHotEncoder(), ['Bldg Type'])
    ])

# Creating the pipeline
lr_pipeline2 = Pipeline(steps=[('preprocessing_model2', ct2),
                           ('linear_regression', LinearRegression())])

# Training the model pipeline
model2 = lr_pipeline2.fit(X_train, y_train)

# Predicting the results
y_preds = model2.predict(X_test)

# Calculating MSE
mse_2 = mean_squared_error(y_test, y_preds)
print(f'The MSE of model 2 is: {mse_2}')

# Compute the R^2 for the the model
r_squared_2 = r2_score(y_test, y_preds)
print(f"The R_squared of model 2: {r_squared_2}")


The MSE of model 2 is: 3186849056.9753246
The R_squared of model 2: 0.5499743932187982


## Model 3: Using size and building type, and their interaction.

In [7]:
# printing all the unique values in the building type column
ames["Bldg Type"].unique()

array(['1Fam', 'TwnhsE', 'Twnhs', 'Duplex', '2fmCon'], dtype=object)

In [42]:
# Define the ColumnTransformer to encode the categorical feature
ct3 = ColumnTransformer(
    [("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])],
    remainder="passthrough").set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [ ("interaction1", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_1Fam"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_TwnhsE"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction5", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
  ],remainder = "drop").set_output(transform = "pandas")

lr_pipeline3 = Pipeline(
  [("dummifying", ct3), ("interaction", ct_inter),
  ("linear_regression", LinearRegression())]
)

model3= lr_pipeline3.fit(X_train, y_train)

# Predicting the results
y_preds = model3.predict(X_test)

# Calculating MSE
mse_3 = mean_squared_error(y_test, y_preds)
print(f'The MSE of model 3 is: {mse_3}')

# Compute the R^2 for the the model
r_squared_3 = r2_score(y_test, y_preds)
print(f"The R_squared of model 3: {r_squared_3}")

The MSE of model 3 is: 3117401743.2767324
The R_squared of model 3: 0.5597812805008071


## Model 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type


In [43]:
ct4 = ColumnTransformer (
  [
  ('degree 5', PolynomialFeatures(degree=5, include_bias = False), ['TotRms AbvGrd']),
  ('degree 5_building', PolynomialFeatures(degree=5, include_bias = False),["Gr Liv Area"]),
  ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
  ],
  remainder = "drop").set_output(transform = "pandas")

lr_pipeline4 = Pipeline([("preprocessing", ct4),("linear_regression", LinearRegression())]
)

model4 = lr_pipeline4.fit(X_train, y_train)

y_preds = model4.predict(X_test)

# Calculating MSE
mse_4 = mean_squared_error(y_test, y_preds)
print(f'The MSE of model 4 is: {mse_4}')

# Compute the R^2 for the the model
r_squared_4 = r2_score(y_test, y_preds)
print(f"The R_squared of model 4: {r_squared_4}")

The MSE of model 4 is: 3259208298.12946
The R_squared of model 4: 0.5397563029282203


In [44]:
pd.DataFrame(data = {"Model": [1,2,3,4], "MSE": [mse,mse_2,mse_3,mse_4], "R-squared":[r_squared, r_squared_2, r_squared_3, r_squared_4]})

Unnamed: 0,Model,MSE,R-squared
0,1,3345358000.0,0.527591
1,2,3186849000.0,0.549974
2,3,3117402000.0,0.559781
3,4,3259208000.0,0.539756


Based on these results, the model that performed the best was Model 2, which used size, building type and number of rooms as the predictor variable. This model had the lowest MSE and the highest r-squared.

# Cross-Validation

**Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.Which do you prefer? Does this agree with your conclusion from earlier?**

In [45]:
scores_model1 = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
scores_model1.mean()

scores_model2 = cross_val_score(lr_pipeline2, X, y, cv=5, scoring='neg_mean_squared_error')
scores_model2.mean()

scores_model3 = cross_val_score(lr_pipeline3, X, y, cv=5, scoring='neg_mean_squared_error')
scores_model3.mean()

scores_model4 = cross_val_score(lr_pipeline4, X, y, cv=5, scoring='neg_mean_squared_error')
scores_model4.mean()


-3199088807.571119

In [46]:
pd.DataFrame(data = {"Model": [1,2,3,4], "Cross-Validated MSE": [-scores_model1.mean(),
                    -scores_model2.mean(),-scores_model3.mean(),-scores_model4.mean()]}).sort_values(by="Cross-Validated MSE", ascending = True)

Unnamed: 0,Model,Cross-Validated MSE
2,3,2871228000.0
1,2,2951994000.0
0,1,3136139000.0
3,4,3199089000.0


In [48]:
-scores_model3.mean()

2871227973.4670935

Based on these results, I think my conclusion from earlier was accurate. The model that performed the best was Model 2, which used size, building type and number of rooms as the predictor variables. This model has the lowest, cross-validated MSE at 2951993958.

# Tuning

Consider one hundred modeling options for house price:

- House size, trying degrees 1 through 10
- Number of rooms, trying degrees 1 through 10
- Building Type

Hint: The dictionary of possible values that you make to give to

GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [53]:
ct_tuning = ColumnTransformer(
  [("area_polynomial", PolynomialFeatures(), ["Gr Liv Area"]),
    ("room_polynomial", PolynomialFeatures(), ["TotRms AbvGrd"]),
    ("dummify", OneHotEncoder(sparse_output=False), ['Bldg Type'])],
  remainder = "drop").set_output(transform = "pandas")

lr_pipeline3 = Pipeline(
  [("preprocessing", ct_tuning),
  ("linear_regression", LinearRegression())]
).set_output(transform = "pandas")

degrees = {'preprocessing__area_polynomial__degree': np.arange(1, 10),'preprocessing__room_polynomial__degree': np.arange(1, 10) }

gscv = GridSearchCV(lr_pipeline3, degrees, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

results = gscv_fitted.cv_results_['mean_test_score']

In [50]:
len(results)

81

In [56]:
area_degrees = gscv_fitted.cv_results_['param_preprocessing__area_polynomial__degree']
room_degrees = gscv_fitted.cv_results_['param_preprocessing__room_polynomial__degree']


In [57]:
pd.DataFrame(data = {"area_degrees": area_degrees, "scores": results, "room_degrees": room_degrees}).sort_values(by="scores", ascending = False)

Unnamed: 0,area_degrees,scores,room_degrees
18,3,0.557641,1
19,3,0.556857,2
30,4,0.556855,4
31,4,0.556531,5
20,3,0.554039,3
...,...,...,...
78,9,-4.545597,7
75,9,-4.545597,4
77,9,-4.545597,6
76,9,-4.545597,5


The model that performed the best was the model containing a polynomial of degree 3 for the size on "area" and a polynomial of degree 1 for rooms, as this model resulted in the highest R^2 value of 0.557641.

If you were to try all possible model options, the downsides include:
- The risk of overfitting the model to the testing set because you're effectively testing every combination to find the one that performs best on the given data split.

- After a certain point, the improvement in model performance is marginal when compared to the additional computational resources and time spent on training more models.

- The time it takes to execute a grid search can be very, time-consuming for each model.

To choose a smaller number of tuning values to try, I might just select a random subset of the parameter space to train. Often this can possibly find a good model faster than an exhaustive grid search.
Or use my domain knowledge to choose a range for tuning so I can significantly reduce the search space.