In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from plotnine import *

In [8]:
ames = pd.read_csv("C:\\Users\\Tyler Clyde\\Documents\\GSB 544\\AmesHousing.csv")


Part 1

In [9]:
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ct = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_fitted = lr_pipeline.fit(X_train, y_train)
y_preds = lr_fitted.predict(X_test)
mean_squared_error(y_preds, y_test)

3511950654.701387

In [10]:
X = ames[["Gr Liv Area", "TotRms AbvGrd", 'Bldg Type']]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_fitted = lr_pipeline.fit(X_train, y_train)
y_preds = lr_fitted.predict(X_test)
mean_squared_error(y_preds, y_test)

3257922980.8219028

In [11]:
X = ames[["Gr Liv Area", 'Bldg Type']]
y = ames["SalePrice"]


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[['Bldg Type']])
X_test_encoded = encoder.transform(X_test[['Bldg Type']])


X_train_combined = np.concatenate([X_train_encoded, X_train[['Gr Liv Area']]], axis=1)
X_test_combined = np.concatenate([X_test_encoded, X_test[['Gr Liv Area']]], axis=1)


ct = ColumnTransformer(
    transformers=[
        ("interaction", PolynomialFeatures(interaction_only=True), [0, 1]),  
    ],
    remainder="passthrough"
)


lr_pipeline = Pipeline([
    ("preprocessing", ct),
    ("linear_regression", LinearRegression())
])

lr_fitted = lr_pipeline.fit(X_train_combined, y_train)
y_preds = lr_fitted.predict(X_test_combined)

mean_squared_error(y_preds, y_test)

3243282090.489238

In [12]:

X = ames[["TotRms AbvGrd", 'Bldg Type']]
y = ames["SalePrice"]


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


ct = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ['Bldg Type']),  
        ("polynomial", PolynomialFeatures(degree=5, include_bias=False), [0])  
    ],
    remainder="passthrough"
)


lr_pipeline = Pipeline([
    ("preprocessing", ct),
    ("linear_regression", LinearRegression())
])


lr_fitted = lr_pipeline.fit(X_train, y_train)
y_preds = lr_fitted.predict(X_test)


mean_squared_error(y_preds, y_test)


4615152521.8619995

Based on the above models, the 3rd model had the loweres MSE and thus performed the best.

Part 2

In [22]:
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ct = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

array([-56130.63127001, -53933.09830807, -53342.43375389, -56520.18909137,
       -52744.44054367])

In [23]:
X = ames[["Gr Liv Area", "TotRms AbvGrd", 'Bldg Type']]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

array([-54071.43462503, -52830.82795482, -51679.06516304, -54234.13328564,
       -51683.54916464])

In [28]:
X = ames[["Gr Liv Area", 'Bldg Type']]
y = ames["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


numeric_features = ['Gr Liv Area']
categorical_features = ['Bldg Type']

numeric_transformer = PolynomialFeatures(interaction_only=True)
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_regression', LinearRegression())
])


cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

array([-54250.32935233, -53461.62681198, -51920.53171949, -53788.84133768,
       -52563.55181494])

In [29]:
X = ames['Gr Liv Area']
y = ames["SalePrice"]




ct = ColumnTransformer(
    transformers=[ 
        ("polynomial", PolynomialFeatures(degree=5, include_bias=False), [0])  
    ],
    remainder="passthrough"
)


lr_pipeline = Pipeline([
    ("preprocessing", ct),
    ("linear_regression", LinearRegression())
])

cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

array([-64617.57759205, -66825.73234598, -61967.40373356, -61772.06944238,
       -64869.12626848])

Based on the above models, model number 2 performed slightly better than model 3. It had the overall lowest MSE.

Part 3

In [46]:
from sklearn.model_selection import GridSearchCV

X = ames["Gr Liv Area"]
y = ames["SalePrice"]
X = X.values.reshape(-1, 1)

ct = ColumnTransformer(
  [
    ("polynomial", PolynomialFeatures(), [0])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

degrees = {'preprocessing__polynomial__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline, degrees, cv = 5, scoring='neg_root_mean_squared_error')

gscv_fitted = gscv.fit(X, y)
pd.DataFrame(data = {"degrees": np.arange(1, 11), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,-56682.640357
1,2,-56598.192911
2,3,-55666.018799
3,4,-56058.000686
4,5,-58214.2933
5,6,-62623.658301
6,7,-72233.78035
7,8,-97088.994557
8,9,-149965.515741
9,10,-242581.50761


In [45]:
X = ames['TotRms AbvGrd']
y = ames["SalePrice"]
X = X.values.reshape(-1, 1)

ct = ColumnTransformer(
  [
    ("polynomial", PolynomialFeatures(), [0])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

degrees = {'preprocessing__polynomial__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline, degrees, cv = 5, scoring='neg_root_mean_squared_error')

gscv_fitted = gscv.fit(X, y)
pd.DataFrame(data = {"degrees": np.arange(1, 11), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,-69576.124787
1,2,-69759.584431
2,3,-69347.503639
3,4,-69418.29175
4,5,-70014.990947
5,6,-73595.504267
6,7,-70811.277758
7,8,-88746.418724
8,9,-86298.70791
9,10,-983332.227694


In [44]:
X = pd.get_dummies(ames["Bldg Type"], prefix='Bldg Type')
y = ames["SalePrice"]

ct = ColumnTransformer(
  [
    ("polynomial", PolynomialFeatures(), [0])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

degrees = {'preprocessing__polynomial__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline, degrees, cv = 5, scoring='neg_root_mean_squared_error')

gscv_fitted = gscv.fit(X, y)
pd.DataFrame(data = {"degrees": np.arange(1, 11), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,-79495.620568
1,2,-79495.620568
2,3,-79495.620568
3,4,-79495.620568
4,5,-79495.620568
5,6,-79495.620568
6,7,-79495.620568
7,8,-79495.620568
8,9,-79495.620568
9,10,-79495.620568


Here, the first model performed the best with the lowest MSE. Since there is the issue of overfitting with large degree values, it may be better to use only the first few degrees for testing such as degrees 1 through 4. This keeps the possibility of overfitting lower than doing to the 10th degree.