In [272]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, make_scorer, mean_squared_error,root_mean_squared_error

In [273]:
ames = pd.read_csv('/Users/benji/Documents/Machine Learning/Data/AmesHousing.csv')

In [274]:
ames

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,37.0,7937,Pave,,IR1,Lvl,...,0,,GdPrv,,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,,8885,Pave,,IR1,Low,...,0,,MnPrv,,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,62.0,10441,Pave,,Reg,Lvl,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,77.0,10010,Pave,,Reg,Lvl,...,0,,,,0,4,2006,WD,Normal,170000


In [275]:
ames.isna().sum()

Order               0
PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
                 ... 
Mo Sold             0
Yr Sold             0
Sale Type           0
Sale Condition      0
SalePrice           0
Length: 82, dtype: int64

In [276]:
ames = ames.drop(['Lot Shape','Lot Frontage'],axis =1,)

In [277]:
ames.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Area', 'Street',
       'Alley', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars',
   

In [278]:
X = ames[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']] 
y = ames['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

In [279]:
# size_and_rooms = ['Gr Liv Area', 'TotRms AbvGrd']
# building_type = ['Bldg Type']

In [280]:
# 1. Model with only size and number of rooms
pipeline1 = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('size_rooms', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])
    ])),
    ('regressor', LinearRegression())
])

# 2. Model with size, number of rooms, and building type
pipeline2 = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('size_rooms', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),
        ('building_type', OneHotEncoder(), ['Bldg Type'])
    ])),
    ('regressor', LinearRegression())
])

# 3. Model with size, building type, and their interaction
pipeline3 = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('size', StandardScaler(), ['Gr Liv Area']),
        ('building_type', OneHotEncoder(), ['Bldg Type'])
    ])),
    ('polynomial', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('regressor', LinearRegression())
])

# 4. Model with 5-degree polynomial on size and number of rooms, and building type
pipeline4 = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('size_rooms_poly', PolynomialFeatures(degree=5), ['TotRms AbvGrd']),
        ('building_type', OneHotEncoder(), ['Bldg Type'])
    ])),
    ('scaler', StandardScaler()),  # Scale polynomial features
    ('regressor', LinearRegression())
])

In [281]:
# Train each model and calculate RMSE
pipelines = [pipeline1, pipeline2, pipeline3, pipeline4]
rmse_results = []

for i, pipeline in enumerate(pipelines, 1):
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    rmse_results.append((f'Model {i}', rmse))

# Output the RMSE for each model
for model, rmse in rmse_results:
    print(f"{model} RMSE: {rmse:.2f}")

# Determine the best model based on RMSE
best_model = min(rmse_results, key=lambda x: x[1])
print(f"Best Model: {best_model[0]} with RMSE: {best_model[1]:.2f}")



Model 1 RMSE: 57264.12
Model 2 RMSE: 54773.42
Model 3 RMSE: 52953.67
Model 4 RMSE: 59663.81
Best Model: Model 3 with RMSE: 52953.67


In [282]:
rmse_scorer = make_scorer(root_mean_squared_error)

# Train each model with cross-validation and calculate RMSE
pipelines = [pipeline1, pipeline2, pipeline3, pipeline4]
cv_rmse_results = []

for i, pipeline in enumerate(pipelines, 1):
    # Perform cross-validation with 5 folds
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring=rmse_scorer)
    mean_rmse = np.mean(cv_scores)
    cv_rmse_results.append((f'Model {i}', mean_rmse))
    print(f"Model {i} Cross-Validated RMSE: {mean_rmse:.2f}")

# Determine the best model based on mean cross-validated RMSE
best_cv_model = min(cv_rmse_results, key=lambda x: x[1])
print(f"\nBest Model Based on Cross-Validation: {best_cv_model[0]} with RMSE: {best_cv_model[1]:.2f}")

Model 1 Cross-Validated RMSE: 55806.33
Model 2 Cross-Validated RMSE: 54168.08
Model 3 Cross-Validated RMSE: 53400.64
Model 4 Cross-Validated RMSE: 65357.73

Best Model Based on Cross-Validation: Model 3 with RMSE: 53400.64


For me model 3 worked best, with the RSME value and using rsme in the cross validation method.

ChatGPt was used to under stand the following code

In [288]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area","TotRms AbvGrd" ])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [290]:
gscv_fitted = gscv.fit(X, y)
gscv_fitted.cv_results_['mean_test_score']
pd.DataFrame(data = {"degrees": np.arange(1, 11), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,0.532882
1,2,0.531259
2,3,0.541003
3,4,0.530648
4,5,0.402272
5,6,-1.396912
6,7,-20.889703
7,8,-133.074632
8,9,-568.493615
9,10,-2206.275192


Model with degree 3 have the highest R-squared score of 0.541.

Using these many possible models is computationaly demanding. It is like wasting the computing resources.

To reduce the computation we can use cross validation score with fewer degrees.