In [None]:
# Open Colab's file-upload widget so the dataset can be provided from the local machine.
# The cell output reports that the upload was saved as AmesHousing.csv.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; no later visible cell reads it.
uploaded = files.upload()

Saving AmesHousing.csv to AmesHousing.csv


In [None]:
import pandas as pd
# Read the uploaded CSV (relative path, so it must sit in the working directory)
# into the `ames` DataFrame that the modelling cells build X and y from.
ames = pd.read_csv("AmesHousing.csv")

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# Response is the sale price; every remaining column stays available as a predictor.
y = ames["SalePrice"]
X = ames.drop(columns="SalePrice")

# Baseline preprocessing: one-hot encode building type and standardize living
# area and room count. Any column not listed is dropped.
ct = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)
# set_output returns the pipeline itself, so configuring it in place is
# equivalent to chaining it onto the constructor.
lr_pipeline_1.set_output(transform="pandas")

# 5-fold cross-validation; `scores` holds one R^2 value per fold.
scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring="r2")
scores

array([0.53197809, 0.53225302, 0.43051812, 0.56616942, 0.60636221])

In [None]:
# Average the per-fold R^2 values of the baseline pipeline into a single summary score.
scores.mean()

0.5334561732637108

In [None]:
# Model 1: living area and number of rooms only (building type is not used).
pipeline1 = Pipeline(
    steps=[
        (
            "preprocessing",
            ColumnTransformer(
                transformers=[
                    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
                ]
            ),
        ),
        ("linear_regression", LinearRegression()),
    ]
)

# Mean R^2 over 5 cross-validation folds.
score = cross_val_score(pipeline1, X, y, cv=5, scoring="r2")
score.mean()


0.504208752508862

In [None]:
# Model 2: size, number of rooms and building type. This reuses the shared `ct`
# transformer (one-hot "Bldg Type" + standardized numeric columns), so it is the
# same model as lr_pipeline_1 and yields the same mean R^2.
pipeline2 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

# Mean R^2 over 5 cross-validation folds.
score2 = cross_val_score(pipeline2, X, y, cv=5, scoring="r2")
score2.mean()

0.5334561732637108

In [None]:
# Per-type preprocessing: numeric columns are standardized; the categorical
# column is one-hot encoded with its first level dropped to avoid multicollinearity.
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
cat_pipeline = Pipeline(
    steps=[("onehot", OneHotEncoder(drop="first", sparse_output=False))]
)

# Route each column group through its own sub-pipeline and emit a DataFrame.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, ["Gr Liv Area", "TotRms AbvGrd"]),
        ("cat", cat_pipeline, ["Bldg Type"]),
    ]
)
preprocessor.set_output(transform="pandas")

# Model 3: preprocess, add interaction-only product terms between the
# preprocessed columns (no bias column), then fit a linear regression.
pipeline3 = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "interaction_terms",
            PolynomialFeatures(interaction_only=True, include_bias=False),
        ),
        ("linear_regression", LinearRegression()),
    ]
)
pipeline3.set_output(transform="pandas")

# Mean R^2 over 5 cross-validation folds (X and y come from the earlier cell).
score3 = cross_val_score(pipeline3, X, y, cv=5, scoring="r2")
score3.mean()


0.5463144430596485

In [None]:
# Living area ("Gr Liv Area"): standardize first, then expand to powers 1..5.
size_poly_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("poly_size", PolynomialFeatures(degree=5, include_bias=False)),
    ]
)

# Number of rooms ("TotRms AbvGrd"): same treatment, standardize then powers 1..5.
rooms_poly_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("poly_rooms", PolynomialFeatures(degree=5, include_bias=False)),
    ]
)

# Building type: one-hot encode, dropping the first level to avoid multicollinearity.
cat_pipeline = Pipeline(
    steps=[("onehot", OneHotEncoder(drop="first", sparse_output=False))]
)

# Each predictor gets its own branch; the combined output is a DataFrame.
preprocessor = ColumnTransformer(
    transformers=[
        ("size_poly", size_poly_pipeline, ["Gr Liv Area"]),
        ("rooms_poly", rooms_poly_pipeline, ["TotRms AbvGrd"]),
        ("cat", cat_pipeline, ["Bldg Type"]),
    ]
)
preprocessor.set_output(transform="pandas")

# Model 4: separate degree-5 polynomials per numeric feature plus building type.
pipeline4 = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("linear_regression", LinearRegression()),
    ]
)
pipeline4.set_output(transform="pandas")

# Mean R^2 over 5 cross-validation folds (X and y come from the earlier cell).
score4 = cross_val_score(pipeline4, X, y, cv=5, scoring="r2")
print(score4.mean())


0.5106643234404011


In [None]:
from sklearn.model_selection import GridSearchCV

# One-hot encode building type and apply a single joint polynomial expansion to
# living area and room count; its degree is tuned by the grid search below.
ct_poly = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("polynomial", PolynomialFeatures(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_poly = Pipeline(
    steps=[
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression()),
    ]
)
lr_pipeline_poly.set_output(transform="pandas")

# Candidate degrees 1..9, addressed as <pipeline step>__<transformer name>__<parameter>.
degrees = {"preprocessing__polynomial__degree": np.arange(1, 10)}

# 5-fold cross-validated search scored by R^2; show the full results dictionary.
gscv = GridSearchCV(lr_pipeline_poly, param_grid=degrees, cv=5, scoring="r2")
gscv_fitted = gscv.fit(X, y)
gscv_fitted.cv_results_

{'mean_fit_time': array([0.01666403, 0.01771197, 0.05081792, 0.03243332, 0.08590102,
        0.10378246, 0.03866625, 0.19426036, 0.21909919]),
 'std_fit_time': array([0.0008808 , 0.00049183, 0.03536209, 0.01271905, 0.05684763,
        0.02291074, 0.01062002, 0.17530559, 0.18005133]),
 'mean_score_time': array([0.00914831, 0.00930233, 0.01359663, 0.01604276, 0.02933974,
        0.04036646, 0.01579337, 0.03324227, 0.03070593]),
 'std_score_time': array([8.94177621e-05, 3.00345960e-04, 4.14646576e-03, 6.44970773e-03,
        2.31556683e-03, 1.22920689e-02, 4.96587153e-03, 8.61474134e-03,
        1.27272384e-02]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'params': [{'preprocessing__polynomial__degree': 1},
  {'preprocessing__polynomial__degree': 2},
  {'preprocessing__polynomial__degree': 3},
  {'preprocessing

In [None]:
# Mean cross-validated score (R^2, per the search's scoring) for each candidate, in grid order.
gscv_fitted.cv_results_['mean_test_score']

array([ 5.32882439e-01,  5.31258707e-01,  5.41002645e-01,  5.30983938e-01,
        3.99898446e-01, -1.41054655e+00, -2.07937471e+01, -1.32190776e+02,
       -5.68868517e+02])

In [None]:
# Tabulate the mean cross-validated R^2 for each polynomial degree.
# The degree column is read from the search's own record of the candidates it
# evaluated (cv_results_["params"]) rather than from a second hard-coded range,
# so each degree is guaranteed to line up with its score even if the grid changes.
pd.DataFrame(
    data={
        "degrees": [
            candidate["preprocessing__polynomial__degree"]
            for candidate in gscv_fitted.cv_results_["params"]
        ],
        "scores": gscv_fitted.cv_results_["mean_test_score"],
    }
)

Unnamed: 0,degrees,scores
0,1,0.532882
1,2,0.531259
2,3,0.541003
3,4,0.530984
4,5,0.399898
5,6,-1.410547
6,7,-20.793747
7,8,-132.190776
8,9,-568.868517


In [None]:
# Separate polynomial expansions (no bias column) for living area and for room
# count, alongside one-hot encoded building type; unlisted columns are dropped.
ct_poly = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("size_poly", PolynomialFeatures(include_bias=False), ["Gr Liv Area"]),
        ("rooms_poly", PolynomialFeatures(include_bias=False), ["TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_poly = Pipeline(
    steps=[
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression()),
    ]
)
lr_pipeline_poly.set_output(transform="pandas")

# Every combination of degrees 1..10 for each of the two features.
degrees = {
    "preprocessing__size_poly__degree": np.arange(1, 11),   # Gr Liv Area
    "preprocessing__rooms_poly__degree": np.arange(1, 11),  # TotRms AbvGrd
}

# 5-fold cross-validated search scored by R^2.
gscv = GridSearchCV(lr_pipeline_poly, param_grid=degrees, cv=5, scoring="r2")
gscv_fitted = gscv.fit(X, y)

# One row per candidate: the two degrees tried and the mean test R^2.
results_df = pd.DataFrame(
    data={
        "size_degree": gscv.cv_results_["param_preprocessing__size_poly__degree"],
        "rooms_degree": gscv.cv_results_["param_preprocessing__rooms_poly__degree"],
        "mean_test_score": gscv.cv_results_["mean_test_score"],
    }
)
results_df

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,size_degree,rooms_degree,mean_test_score
0,1,1,0.532882
1,2,1,0.537472
2,3,1,0.557641
3,4,1,0.549278
4,5,1,0.451860
...,...,...,...
95,6,10,0.053896
96,7,10,0.400101
97,8,10,-0.968095
98,9,10,-4.545598


In [None]:
# 13.3.3 Q1: the degree combination selected by the grid search.
# 13.3.3 Q2: the mean cross-validated R^2 achieved by that combination.
best_params, best_score = gscv.best_params_, gscv.best_score_

best_params, best_score

({'preprocessing__rooms_poly__degree': 1,
  'preprocessing__size_poly__degree': 3},
 0.5576405999448386)