In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Consider four possible models for predicting house prices:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [6]:

housing = pd.read_csv("/Users/williamkapner/Documents/GSB_544/Data/AmesHousing.csv")
housing.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [7]:
X = housing.drop("SalePrice", axis = 1)
y = housing["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)



lr_fitted = lr_pipeline.fit(X_train, y_train)



# Predictions for train and test sets
y_train_pred = lr_fitted.predict(X_train)
y_test_pred = lr_fitted.predict(X_test)


# Calculate MSE for training and testing data
r1 = r2_score(y_test, y_test_pred)
intercept1 = lr_fitted.named_steps['linear_regression'].intercept_
coefficients1 = lr_fitted.named_steps['linear_regression'].coef_


print("R_Squared:", r1)
print("Slope:", coefficients1)
print("Intercept:", intercept1)

R_Squared: 0.4710253540755466
Slope: [ 70088.51672653 -16248.478439  ]
Intercept: 180311.05917159765


In [None]:
from sklearn.compose import ColumnTransformer

ct1 = ColumnTransformer(
  [  
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct1),
  ("linear_regression", LinearRegression())]
)


lr_fitted = lr_pipeline.fit(X_train, y_train)


# Predictions for train and test sets
y_train_pred = lr_fitted.predict(X_train)
y_test_pred = lr_fitted.predict(X_test)


# Calculate MSE for training and testing data
r2 = r2_score(y_test, y_test_pred)
intercept2 = lr_fitted.named_steps['linear_regression'].intercept_
coefficients2 = lr_fitted.named_steps['linear_regression'].coef_


print("R_Squared:", r2)
print("Slope:", coefficients2)
print("Intercept:", intercept2)

R_Squared: 0.5083375184293497
Slope: [ 64902.30441758  -8860.14117367  19933.68887638 -33519.63459957
 -28078.66488306  -1309.37154119  42973.98214744]
Intercept: 162180.98173715002


In [None]:
from sklearn.compose import ColumnTransformer

ct2 = ColumnTransformer(
  [  
    ("standardize", StandardScaler(), ["Gr Liv Area"]),
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
  ],
  remainder = "drop"
).set_output(transform="pandas")

X_train_dummified = ct2.fit_transform(X_train)
X_train_dummified

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv", "dummify__Bldg Type_1Fam"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv", "dummify__Bldg Type_1Fam"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv", "dummify__Bldg Type_1Fam"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv", "dummify__Bldg Type_1Fam"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv", "dummify__Bldg Type_1Fam"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
  [("preprocessing", ct2),
  ("linear_regression", LinearRegression())]
)


# lr_fitted = lr_pipeline.fit(X_train, y_train)


# # Predictions for train and test sets
# y_train_pred = lr_fitted.predict(X_train)
# y_test_pred = lr_fitted.predict(X_test)


# # Calculate MSE for training and testing data
# r2 = r2_score(y_test, y_test_pred)
# intercept2 = lr_fitted.named_steps['linear_regression'].intercept_
# coefficients2 = lr_fitted.named_steps['linear_regression'].coef_


# print("R_Squared:", r2)
# print("Slope:", coefficients2)
# print("Intercept:", intercept2)

Unnamed: 0,standardize__Gr Liv Area,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE
2667,-0.486507,0.0,0.0,1.0,0.0,0.0
2097,2.640638,1.0,0.0,0.0,0.0,0.0
249,0.396219,1.0,0.0,0.0,0.0,0.0
2532,-0.876411,1.0,0.0,0.0,0.0,0.0
966,1.029565,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1209,-0.650781,1.0,0.0,0.0,0.0,0.0
2861,-0.621093,1.0,0.0,0.0,0.0,0.0
2498,2.280423,1.0,0.0,0.0,0.0,0.0
443,0.677266,1.0,0.0,0.0,0.0,0.0
