In [9]:
# Load the processed train/test datasets
import pandas as pd

train_df = pd.read_csv('train_df_processed.csv')
test_df = pd.read_csv('test_df_processed.csv')

print(train_df.shape, test_df.shape)  # Check if they are correct

(2335, 215) (584, 215)


In [10]:
# Install if not yet installed
# !pip install statsmodels scikit-learn

# Import
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [11]:
# Set X (features) and y (target)

X_train = train_df.drop(columns=['SalePrice'])
y_train = train_df['SalePrice']

X_test = test_df.drop(columns=['SalePrice'])
y_test = test_df['SalePrice']

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2335, 214) (2335,)
(584, 214) (584,)


In [12]:
selected_features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 
    'YearBuilt', 'ExterQual', 'KitchenQual', 'BuyerID_enc', 'SizePref_enc', 'NewOldPref_enc'
]

# 重新整理資料
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Add intercept manually
X_train_with_intercept = sm.add_constant(X_train_selected).astype(float)
X_test_with_intercept = sm.add_constant(X_test_selected).astype(float)

# Define the model
md = sm.MixedLM(
    endog=y_train, 
    exog=X_train_with_intercept, 
    groups=X_train['BuyerID_enc']
)

# Fit the model
mdf = md.fit(reml=False)
print(mdf.summary())



          Mixed Linear Model Regression Results
Model:             MixedLM Dependent Variable: SalePrice 
No. Observations:  2335    Method:             ML        
No. Groups:        100     Scale:              0.6011    
Min. group size:   13      Log-Likelihood:     -2728.2933
Max. group size:   36      Converged:          No        
Mean group size:   23.4                                  
---------------------------------------------------------
               Coef.  Std.Err.   z    P>|z| [0.025 0.975]
---------------------------------------------------------
const           0.001    0.018  0.031 0.975 -0.034  0.035
OverallQual     0.181    0.029  6.252 0.000  0.124  0.237
GrLivArea       0.253    0.021 12.173 0.000  0.212  0.294
GarageCars      0.075    0.022  3.424 0.001  0.032  0.117
TotalBsmtSF     0.107    0.019  5.480 0.000  0.069  0.145
YearBuilt       0.029    0.023  1.273 0.203 -0.016  0.073
ExterQual       0.041    0.027  1.517 0.129 -0.012  0.094
KitchenQual     0.104   



In [13]:
# Predict on test set
y_pred_lmm = mdf.predict(X_test_with_intercept)

# Evaluation metrics
mse_lmm = mean_squared_error(y_test, y_pred_lmm)  # 先算 MSE
rmse_lmm = np.sqrt(mse_lmm)                      # 再開根號得到 RMSE
mae_lmm = mean_absolute_error(y_test, y_pred_lmm)
r2_lmm = r2_score(y_test, y_pred_lmm)

print(f"LMM Test RMSE: {rmse_lmm:.4f}")
print(f"LMM Test MAE: {mae_lmm:.4f}")
print(f"LMM Test R²: {r2_lmm:.4f}")

LMM Test RMSE: 0.7969
LMM Test MAE: 0.5192
LMM Test R²: 0.3800


In [14]:
# Train and predict Linear Regression
lr = LinearRegression()
lr.fit(X_train_selected, y_train)
y_pred_lr = lr.predict(X_test_selected)

# Train and predict Lasso Regression
lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X_train_selected, y_train)
y_pred_lasso = lasso.predict(X_test_selected)

# Train and predict Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_selected, y_train)
y_pred_ridge = ridge.predict(X_test_selected)

# Evaluation
models = {
    "LMM": (y_pred_lmm),
    "Linear Regression": (y_pred_lr),
    "Lasso Regression": (y_pred_lasso),
    "Ridge Regression": (y_pred_ridge)
}

for name, preds in models.items():
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f"{name} --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

LMM --> RMSE: 0.7969, MAE: 0.5192, R²: 0.3800
Linear Regression --> RMSE: 0.7970, MAE: 0.5193, R²: 0.3798
Lasso Regression --> RMSE: 0.7982, MAE: 0.5182, R²: 0.3780
Ridge Regression --> RMSE: 0.7970, MAE: 0.5193, R²: 0.3798


In [15]:
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 1: Select only numerical features
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Check how many numerical features we have
print(f"Total {len(numeric_cols)} numerical features")

# Step 2: Redefine X_train and X_test to only include numerical columns
X_train_numeric = X_train[numeric_cols]
X_test_numeric = X_test[numeric_cols]

# Add intercept
X_train_numeric_intercept = sm.add_constant(X_train_numeric).astype(float)
X_test_numeric_intercept = sm.add_constant(X_test_numeric).astype(float)

# Step 3: Build the LMM model
md_numeric = sm.MixedLM(
    endog=y_train,
    exog=X_train_numeric_intercept,
    groups=X_train['BuyerID_enc']
)

# Fit the model
mdf_numeric = md_numeric.fit(reml=False)
print(mdf_numeric.summary())

# Step 4: Predict and evaluate
y_pred_lmm_numeric = mdf_numeric.predict(X_test_numeric_intercept)

# Calculate evaluation metrics
mse_lmm_numeric = mean_squared_error(y_test, y_pred_lmm_numeric)
rmse_lmm_numeric = np.sqrt(mse_lmm_numeric)
mae_lmm_numeric = mean_absolute_error(y_test, y_pred_lmm_numeric)
r2_lmm_numeric = r2_score(y_test, y_pred_lmm_numeric)

print("LMM (using only numerical features) Test Set Performance:")
print(f"Test RMSE: {rmse_lmm_numeric:.4f}")
print(f"Test MAE: {mae_lmm_numeric:.4f}")
print(f"Test R²: {r2_lmm_numeric:.4f}")


Total 49 numerical features




                Mixed Linear Model Regression Results
Model:                  MixedLM     Dependent Variable:     SalePrice 
No. Observations:       2335        Method:                 ML        
No. Groups:             100         Scale:                  0.5539    
Min. group size:        13          Log-Likelihood:         -2634.2214
Max. group size:        36          Converged:              No        
Mean group size:        23.4                                          
----------------------------------------------------------------------
               Coef.   Std.Err.    z    P>|z|    [0.025       0.975]  
----------------------------------------------------------------------
const           0.001      0.017  0.059 0.953       -0.033       0.035
Id             -0.124      0.016 -7.610 0.000       -0.156      -0.092
MSSubClass     -0.039      0.020 -1.956 0.050       -0.079       0.000
LotFrontage    -0.000      0.020 -0.024 0.981       -0.040       0.039
LotArea         0.082  



In [16]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# Step 1: Train and predict Linear Regression
lr_numeric = LinearRegression()
lr_numeric.fit(X_train_numeric, y_train)
y_pred_lr_numeric = lr_numeric.predict(X_test_numeric)

# Step 2: Train and predict Lasso Regression
lasso_numeric = Lasso(alpha=0.01, max_iter=10000)
lasso_numeric.fit(X_train_numeric, y_train)
y_pred_lasso_numeric = lasso_numeric.predict(X_test_numeric)

# Step 3: Train and predict Ridge Regression
ridge_numeric = Ridge(alpha=1.0)
ridge_numeric.fit(X_train_numeric, y_train)
y_pred_ridge_numeric = ridge_numeric.predict(X_test_numeric)

# Step 4: Evaluate all models
models_numeric = {
    "LMM (Numerical Features)": y_pred_lmm_numeric,
    "Linear Regression (Numerical Features)": y_pred_lr_numeric,
    "Lasso Regression (Numerical Features)": y_pred_lasso_numeric,
    "Ridge Regression (Numerical Features)": y_pred_ridge_numeric
}

for name, preds in models_numeric.items():
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f"{name} --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

LMM (Numerical Features) --> RMSE: 0.7644, MAE: 0.5209, R²: 0.4294
Linear Regression (Numerical Features) --> RMSE: 0.7645, MAE: 0.5215, R²: 0.4293
Lasso Regression (Numerical Features) --> RMSE: 0.7692, MAE: 0.5168, R²: 0.4223
Ridge Regression (Numerical Features) --> RMSE: 0.7644, MAE: 0.5213, R²: 0.4295


In [17]:
# Calculate Pearson correlation between each feature and SalePrice
correlations = train_df.corr()['SalePrice'].abs().sort_values(ascending=False)

# Show top features
print(correlations)

# Select top 30 features (excluding 'SalePrice' itself)
top_features = correlations.index[1:151].tolist()

print(f"Top features selected: {top_features}")

# Only keep top features
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# Add intercept
X_train_top_intercept = sm.add_constant(X_train_top).astype(float)
X_test_top_intercept = sm.add_constant(X_test_top).astype(float)

# Build Mixed Linear Model
md_top = sm.MixedLM(
    endog=y_train,
    exog=X_train_top_intercept,
    groups=X_train['BuyerID_enc']
)

# Fit the model
mdf_top = md_top.fit(reml=False)
print(mdf_top.summary())

# Predict
y_pred_lmm_top = mdf_top.predict(X_test_top_intercept)

# Evaluate
mse_lmm_top = mean_squared_error(y_test, y_pred_lmm_top)
rmse_lmm_top = np.sqrt(mse_lmm_top)
mae_lmm_top = mean_absolute_error(y_test, y_pred_lmm_top)
r2_lmm_top = r2_score(y_test, y_pred_lmm_top)

print("LMM (Top Features Only) Test Set Performance:")
print(f"Test RMSE: {rmse_lmm_top:.4f}")
print(f"Test MAE: {mae_lmm_top:.4f}")
print(f"Test R²: {r2_lmm_top:.4f}")


SalePrice              1.000000
OverallQual            0.548696
GrLivArea              0.512934
KitchenQual            0.465052
ExterQual              0.464720
                         ...   
Condition2_RRAn             NaN
RoofMatl_Metal              NaN
RoofMatl_Roll               NaN
Exterior1st_ImStucc         NaN
Exterior2nd_Other           NaN
Name: SalePrice, Length: 215, dtype: float64
Top features selected: ['OverallQual', 'GrLivArea', 'KitchenQual', 'ExterQual', 'GarageCars', 'BsmtQual', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'FullBath', 'YearBuilt', 'YearRemodAdd', 'GarageFinish_Unf', 'Foundation_PConc', 'MasVnrArea', 'GarageYrBlt', 'Fireplaces', 'HeatingQC', 'Neighborhood_NridgHt', 'BsmtFinType1_GLQ', 'LotFrontage', 'SaleType_New', 'SaleCondition_Partial', 'BsmtFinSF1', 'Neighborhood_NoRidge', 'GarageType_Detchd', 'LotArea', 'BsmtExposure_No', 'WoodDeckSF', 'Exterior1st_VinylSd', 'Exterior2nd_VinylSd', 'Foundation_CBlock', '2ndFlrSF', 'HalfBath', 'OpenPorc



             Mixed Linear Model Regression Results
Model:                MixedLM   Dependent Variable:   SalePrice 
No. Observations:     2335      Method:               ML        
No. Groups:           100       Scale:                0.5122    
Min. group size:      13        Log-Likelihood:       -2538.9270
Max. group size:      36        Converged:            Yes       
Mean group size:      23.4                                      
----------------------------------------------------------------
                      Coef.  Std.Err.   z    P>|z| [0.025 0.975]
----------------------------------------------------------------
const                 -0.369    0.439 -0.843 0.399 -1.229  0.490
OverallQual            0.124    0.033  3.794 0.000  0.060  0.188
GrLivArea             -0.034    0.224 -0.153 0.879 -0.472  0.404
KitchenQual            0.060    0.025  2.381 0.017  0.011  0.109
ExterQual              0.029    0.028  1.047 0.295 -0.026  0.085
GarageCars             0.051    0.039  



In [18]:
import numpy as np
import pandas as pd

# Step 1: Calculate correlation with SalePrice
correlations = train_df.corr()

# Step 2: Keep only features strongly correlated to SalePrice
strong_corr_features = correlations['SalePrice'].abs().sort_values(ascending=False)
selected_features = strong_corr_features[strong_corr_features > 0.2].index.tolist()  # You can adjust 0.2

# Remove 'SalePrice' itself
selected_features = [f for f in selected_features if f != 'SalePrice']

print(f"Selected features (correlation > 0.2): {selected_features}")

# Step 3: From selected_features, remove highly collinear ones
# Create correlation matrix for selected features
corr_matrix = train_df[selected_features].corr().abs()

# Upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.8
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]

print(f"Highly correlated features to drop: {to_drop}")

# Final feature set
final_features = [f for f in selected_features if f not in to_drop]

print(f"Final selected features: {final_features}")

# Step 4: Redefine X_train and X_test
X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

# Add intercept
X_train_final_intercept = sm.add_constant(X_train_final).astype(float)
X_test_final_intercept = sm.add_constant(X_test_final).astype(float)

# Step 5: Build LMM model
md_final = sm.MixedLM(
    endog=y_train,
    exog=X_train_final_intercept,
    groups=X_train['BuyerID_enc']
)

# Fit model
mdf_final = md_final.fit(reml=False)
print(mdf_final.summary())

# Step 6: Predict and evaluate
y_pred_lmm_final = mdf_final.predict(X_test_final_intercept)

mse_lmm_final = mean_squared_error(y_test, y_pred_lmm_final)
rmse_lmm_final = np.sqrt(mse_lmm_final)
mae_lmm_final = mean_absolute_error(y_test, y_pred_lmm_final)
r2_lmm_final = r2_score(y_test, y_pred_lmm_final)

print("LMM (Strong Non-Collinear Features) Test Set Performance:")
print(f"Test RMSE: {rmse_lmm_final:.4f}")
print(f"Test MAE: {mae_lmm_final:.4f}")
print(f"Test R²: {r2_lmm_final:.4f}")

Selected features (correlation > 0.2): ['OverallQual', 'GrLivArea', 'KitchenQual', 'ExterQual', 'GarageCars', 'BsmtQual', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'FullBath', 'YearBuilt', 'YearRemodAdd', 'GarageFinish_Unf', 'Foundation_PConc', 'MasVnrArea', 'GarageYrBlt', 'Fireplaces', 'HeatingQC', 'Neighborhood_NridgHt', 'BsmtFinType1_GLQ', 'LotFrontage', 'SaleType_New', 'SaleCondition_Partial', 'BsmtFinSF1', 'Neighborhood_NoRidge', 'GarageType_Detchd', 'LotArea', 'BsmtExposure_No', 'WoodDeckSF', 'Exterior1st_VinylSd', 'Exterior2nd_VinylSd', 'Foundation_CBlock', '2ndFlrSF', 'HalfBath', 'OpenPorchSF', 'MSZoning_RM', 'BsmtExposure_Gd']
Highly correlated features to drop: ['GarageArea', 'TotRmsAbvGrd', 'SaleCondition_Partial', 'Exterior2nd_VinylSd']
Final selected features: ['OverallQual', 'GrLivArea', 'KitchenQual', 'ExterQual', 'GarageCars', 'BsmtQual', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt', 'YearRemodAdd', 'GarageFinish_Unf', 'Foundation_PConc', 'MasVnrAre



             Mixed Linear Model Regression Results
Model:               MixedLM   Dependent Variable:   SalePrice 
No. Observations:    2335      Method:               ML        
No. Groups:          100       Scale:                0.5712    
Min. group size:     13        Log-Likelihood:       -2669.3797
Max. group size:     36        Converged:            Yes       
Mean group size:     23.4                                      
---------------------------------------------------------------
                     Coef.  Std.Err.   z    P>|z| [0.025 0.975]
---------------------------------------------------------------
const                -0.023    0.075 -0.311 0.756 -0.169  0.123
OverallQual           0.166    0.030  5.497 0.000  0.107  0.225
GrLivArea             0.236    0.192  1.229 0.219 -0.140  0.611
KitchenQual           0.085    0.025  3.362 0.001  0.036  0.135
ExterQual             0.030    0.028  1.073 0.283 -0.025  0.085
GarageCars            0.051    0.023  2.209 0.027  0.



In [19]:
from sklearn.linear_model import Lasso

# Train Lasso on all numerical features
lasso_select = Lasso(alpha=0.01, max_iter=10000)
lasso_select.fit(X_train_numeric, y_train)

# Get non-zero coefficient features
selected_features_lasso = X_train_numeric.columns[lasso_select.coef_ != 0].tolist()

print(f"Selected features by Lasso: {selected_features_lasso}")
print(f"Number of selected features: {len(selected_features_lasso)}")

# Keep only selected features
X_train_selected_lasso = X_train_numeric[selected_features_lasso]
X_test_selected_lasso = X_test_numeric[selected_features_lasso]

# Add intercept for LMM
X_train_selected_lasso_intercept = sm.add_constant(X_train_selected_lasso).astype(float)
X_test_selected_lasso_intercept = sm.add_constant(X_test_selected_lasso).astype(float)

from sklearn.linear_model import LinearRegression, Ridge

# Train LMM
md_lasso = sm.MixedLM(endog=y_train, exog=X_train_selected_lasso_intercept, groups=X_train['BuyerID_enc'])
mdf_lasso = md_lasso.fit(reml=False)
y_pred_lmm_lasso = mdf_lasso.predict(X_test_selected_lasso_intercept)

# Train Linear Regression
lr_lasso = LinearRegression()
lr_lasso.fit(X_train_selected_lasso, y_train)
y_pred_lr_lasso = lr_lasso.predict(X_test_selected_lasso)

# Train Ridge Regression
ridge_lasso = Ridge(alpha=1.0)
ridge_lasso.fit(X_train_selected_lasso, y_train)
y_pred_ridge_lasso = ridge_lasso.predict(X_test_selected_lasso)

# Evaluate
models_lasso = {
    "LMM (Lasso-selected features)": y_pred_lmm_lasso,
    "Linear Regression (Lasso-selected features)": y_pred_lr_lasso,
    "Ridge Regression (Lasso-selected features)": y_pred_ridge_lasso
}

for name, preds in models_lasso.items():
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f"{name} --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

Selected features by Lasso: ['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtFinSF1', 'TotalBsmtSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageCars', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'BuyerID_enc', 'NewOldPref_enc']
Number of selected features: 34




LMM (Lasso-selected features) --> RMSE: 0.7676, MAE: 0.5217, R²: 0.4247
Linear Regression (Lasso-selected features) --> RMSE: 0.7675, MAE: 0.5220, R²: 0.4248
Ridge Regression (Lasso-selected features) --> RMSE: 0.7675, MAE: 0.5219, R²: 0.4248




In [25]:
# import numpy as np
# import pandas as pd
# import statsmodels.api as sm
# from sklearn.linear_model import LinearRegression, Lasso, Ridge
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# # Step 1: Check and clean NaN from y_train
# print(f"Original y_train NaNs: {y_train.isna().sum()}")

# # Create valid index where y_train is not NaN
# valid_idx = y_train.notna()

# # Filter X_train and y_train
# X_train_numeric_valid = X_train_numeric.loc[valid_idx]
# y_train_valid = y_train.loc[valid_idx]

# # Step 2: Log-transform the valid y_train
# y_train_log = np.log1p(y_train_valid)

# # Step 3: Double-check if any NaN still remains
# print(f"After log-transform, NaNs in y_train_log: {y_train_log.isna().sum()}")

# # Final clean (only keep rows where y_train_log is not NaN)
# final_valid_idx = y_train_log.notna()
# X_train_numeric_final = X_train_numeric_valid.loc[final_valid_idx]
# y_train_log_final = y_train_log.loc[final_valid_idx]

# # Step 4: Prepare X_test and y_test (test set usually clean)
# y_test_log = np.log1p(y_test)  # apply log1p to test labels
# X_test_numeric_final = X_test_numeric.copy()  # safe to use original test features

# # Step 5: Add intercept for LMM
# X_train_numeric_final_intercept = sm.add_constant(X_train_numeric_final).astype(float)
# X_test_numeric_final_intercept = sm.add_constant(X_test_numeric_final).astype(float)

# # Step 6: Train LMM model
# md_log = sm.MixedLM(
#     endog=y_train_log_final,
#     exog=X_train_numeric_final_intercept,
#     groups=X_train_numeric_final['BuyerID_enc']  # Important! use the cleaned group
# )
# mdf_log = md_log.fit(reml=False)
# y_pred_lmm_log = mdf_log.predict(X_test_numeric_final_intercept)

# # Step 7: Train Linear Regression
# lr_log = LinearRegression()
# lr_log.fit(X_train_numeric_final, y_train_log_final)
# y_pred_lr_log = lr_log.predict(X_test_numeric_final)

# # Step 8: Train Lasso Regression
# lasso_log = Lasso(alpha=0.01, max_iter=10000)
# lasso_log.fit(X_train_numeric_final, y_train_log_final)
# y_pred_lasso_log = lasso_log.predict(X_test_numeric_final)

# # Step 9: Train Ridge Regression
# ridge_log = Ridge(alpha=1.0)
# ridge_log.fit(X_train_numeric_final, y_train_log_final)
# y_pred_ridge_log = ridge_log.predict(X_test_numeric_final)

# # Step 10: Evaluate all models
# models_log = {
#     "LMM (Log SalePrice)": y_pred_lmm_log,
#     "Linear Regression (Log SalePrice)": y_pred_lr_log,
#     "Lasso Regression (Log SalePrice)": y_pred_lasso_log,
#     "Ridge Regression (Log SalePrice)": y_pred_ridge_log
# }

# print("\n🚀 Log-Transformed Model Performance (evaluated on log(SalePrice)):")

# for name, preds in models_log.items():
#     mse = mean_squared_error(y_test_log, preds)
#     rmse = np.sqrt(mse)
#     mae = mean_absolute_error(y_test_log, preds)
#     r2 = r2_score(y_test_log, preds)
#     print(f"{name} --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")


In [30]:
from sklearn.preprocessing import PolynomialFeatures

# Step 1: Create polynomial interaction features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_numeric)
X_test_poly = poly.transform(X_test_numeric)

# Step 2: Train Ridge Regression on polynomial features
ridge_poly = Ridge(alpha=1.0)
ridge_poly.fit(X_train_poly, y_train)
y_pred_ridge_poly = ridge_poly.predict(X_test_poly)

# Step 3: Evaluate
print("Performance with Polynomial Interaction Features (Ridge):")
mse = mean_squared_error(y_test, y_pred_ridge_poly)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_ridge_poly)
r2 = r2_score(y_test, y_pred_ridge_poly)
print(f"Ridge (Polynomial features) --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")


Performance with Polynomial Interaction Features (Ridge):
Ridge (Polynomial features) --> RMSE: 0.8337, MAE: 0.5165, R²: 0.3214


In [33]:
from sklearn.ensemble import RandomForestRegressor

# Step 1: Train Random Forest
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)
rf.fit(X_train_numeric, y_train)
y_pred_rf = rf.predict(X_test_numeric)

# Step 2: Evaluate
print("Performance with Random Forest:")
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)
print(f"Random Forest --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")


Performance with Random Forest:
Random Forest --> RMSE: 0.3487, MAE: 0.1557, R²: 0.8813


In [31]:
alphas = [0.01, 0.1, 1, 10, 100]

print("Ridge Regression with different alphas:")

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_numeric, y_train)
    y_pred_ridge = ridge.predict(X_test_numeric)
    mse = mean_squared_error(y_test, y_pred_ridge)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred_ridge)
    r2 = r2_score(y_test, y_pred_ridge)
    print(f"Ridge (alpha={alpha}) --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")


Ridge Regression with different alphas:
Ridge (alpha=0.01) --> RMSE: 0.7645, MAE: 0.5214, R²: 0.4293
Ridge (alpha=0.1) --> RMSE: 0.7645, MAE: 0.5213, R²: 0.4294
Ridge (alpha=1) --> RMSE: 0.7644, MAE: 0.5213, R²: 0.4295
Ridge (alpha=10) --> RMSE: 0.7645, MAE: 0.5211, R²: 0.4294
Ridge (alpha=100) --> RMSE: 0.7650, MAE: 0.5197, R²: 0.4287


In [32]:
# Assuming you already trained Ridge and Random Forest above
# y_pred_ridge: from Ridge
# y_pred_rf: from Random Forest

# Step 1: Average predictions
y_pred_ensemble = (y_pred_ridge_poly + y_pred_rf) / 2  # Use Ridge(Polynomial) + Random Forest for stronger

# Step 2: Evaluate
print("Performance with Simple Ensemble (Ridge+RF):")
mse = mean_squared_error(y_test, y_pred_ensemble)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_ensemble)
r2 = r2_score(y_test, y_pred_ensemble)
print(f"Ensemble (Ridge + RF) --> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

Performance with Simple Ensemble (Ridge+RF):
Ensemble (Ridge + RF) --> RMSE: 0.5066, MAE: 0.3079, R²: 0.7494
