In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, PredefinedSplit, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [2]:


##############################################################################
# 1) Define global resources so we can reuse the same objects for train & test
##############################################################################

# --- Ordinal mapping for your ordinal columns ---
ordinal_mapping = {
    "GarageQual":    {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "Fence":         {"GdPrv": 4, "MnPrv": 3, "GdWo": 2, "MnWw": 1, "NA": 0},
    "GarageFinish":  {"Fin": 3, "RFn": 2, "Unf": 1, "NA": 0},
    "KitchenQual":   {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "GarageCond":    {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "HeatingQC":     {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "ExterQual":     {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "BsmtCond":      {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "LandSlope":     {"Gtl": 2, "Mod": 1, "Sev": 0},
    "ExterCond":     {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "BsmtExposure":  {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "NA": 0},
    "PavedDrive":    {"Y": 2, "P": 1, "N": 0},
    "BsmtQual":      {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "LotShape":      {"Reg": 3, "IR1": 2, "IR2": 1, "IR3": 0},
    "BsmtFinType2":  {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0},
    "BsmtFinType1":  {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0},
    "FireplaceQu":   {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "Utilities":     {"AllPub": 3, "NoSewr": 2, "NoSeWa": 1, "ELO": 0},
    "Functional":    {"Typ": 7, "Min1": 6, "Min2": 5, "Mod": 4, "Maj1": 3,
                      "Maj2": 2, "Sev": 1, "Sal": 0},
    "PoolQC":        {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
}
# Extract the list of ordinal columns from that mapping
ordinal_cols = list(ordinal_mapping.keys())

# --- Nominal features (for One-Hot Encoding) ---
nominal_features = [
    "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
    "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating",
    "CentralAir", "Electrical", "GarageType", "MiscFeature", "SaleType",
    "SaleCondition"
]

# Create ONE global OrdinalEncoder and ONE global OneHotEncoder.
# We'll fit them on train, then reuse them to transform test.
ordinal_encoder = OrdinalEncoder(
    categories=[list(ordinal_mapping[col].keys()) for col in ordinal_cols],
    handle_unknown="use_encoded_value",
    unknown_value=-1
)
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

##############################################################################
# 2) Main preprocessing function
##############################################################################
def preprocess_data(df, is_train=True):
    """
    Preprocess a housing DataFrame.
    If is_train=True, drops outliers and fits the encoders.
    If is_train=False, only transforms using the fitted encoders (NO outlier removal).

    Returns:
      (X, y) if is_train=True
      X if is_train=False
    """

    # Make a copy so we don't overwrite the original data
    df = df.copy()

    # If this is training data, grab the target 'SalePrice'
    # and remove it from the DataFrame.
    if is_train:
        y = df['SalePrice']
        df.drop('SalePrice', axis=1, inplace=True)
    else:
        y = None  # no target in test set

    # 1) Convert certain columns to numeric if they exist
    cols_to_numeric = [
        "LotFrontage", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
        "TotalBsmtSF", "BsmtFullBath", "GarageCars", "BsmtHalfBath", "GarageYrBlt"
    ]
    for col in cols_to_numeric:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # 2) Fill numeric columns with 0, categorical with "NA"
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)

    # Identify any columns of object/string dtype
    cat_cols = df.select_dtypes(include=['object']).columns
    for cat_col in cat_cols:
        df[cat_col] = df[cat_col].fillna("NA")

    # 3) Feature engineering examples (abbreviated from your original code)
    #    Only do these if the columns exist in df.
    if set(["FullBath","HalfBath","BsmtFullBath","BsmtHalfBath"]).issubset(df.columns):
        df["TotalBath"] = (
            df["FullBath"] +
            0.5 * df["HalfBath"] +
            df["BsmtFullBath"] +
            0.5 * df["BsmtHalfBath"]
        )
    else:
        # If any are missing, you can either create them or skip
        df["TotalBath"] = 0

    if "GarageYrBlt" in df.columns:
        df["Has_garage"] = df["GarageYrBlt"].notnull().astype(int)

    if set(["YrSold","YearBuilt"]).issubset(df.columns):
        df["House_Age"] = df["YrSold"] - df["YearBuilt"]

    if set(["YearBuilt","YearRemodAdd"]).issubset(df.columns):
        df["Is_Remodeled"] = (df["YearBuilt"] != df["YearRemodAdd"]).astype(int)

    # ... etc. for other features from your original code ...
    # e.g., if you want "House_Age2", "GrLivArea" / "LotArea" ratio, etc.

    # 4) Outlier removal: only do this on TRAIN.
    #    Make sure to drop rows from y as well to keep them aligned.
    if is_train:
        # (a) LotFrontage > 200
        if "LotFrontage" in df.columns:
            idx = df[df["LotFrontage"] > 200].index
            df.drop(idx, inplace=True)
            y.drop(idx, inplace=True)

        # (b) LotArea > 100000
        if "LotArea" in df.columns:
            idx = df[df["LotArea"] > 100000].index
            df.drop(idx, inplace=True)
            y.drop(idx, inplace=True)

        # (c) BsmtFinSF1 > 4000
        if "BsmtFinSF1" in df.columns:
            idx = df[df["BsmtFinSF1"] > 4000].index
            df.drop(idx, inplace=True)
            y.drop(idx, inplace=True)

        # (d) TotalBsmtSF > 6000
        if "TotalBsmtSF" in df.columns:
            idx = df[df["TotalBsmtSF"] > 6000].index
            df.drop(idx, inplace=True)
            y.drop(idx, inplace=True)

        # (e) 1stFlrSF > 4000
        if "1stFlrSF" in df.columns:
            idx = df[df["1stFlrSF"] > 4000].index
            df.drop(idx, inplace=True)
            y.drop(idx, inplace=True)

        # (f) GrLivArea > 4000 & y < 300000
        if "GrLivArea" in df.columns:
            idx = df[(df["GrLivArea"] > 4000) & (y < 300000)].index
            df.drop(idx, inplace=True)
            y.drop(idx, inplace=True)

        # (g) LowQualFinSF > 550
        if "LowQualFinSF" in df.columns:
            idx = df[df["LowQualFinSF"] > 550].index
            df.drop(idx, inplace=True)
            y.drop(idx, inplace=True)

    # 5) Ordinal Encoding for the columns in ordinal_mapping
    #    We'll only encode columns that actually exist in df.
    existing_ordinal = [col for col in ordinal_cols if col in df.columns]

    # Clean up whitespace if any
    for col in existing_ordinal:
        df[col] = df[col].astype(str).str.strip()

    # If train: fit_transform
    # If test:  transform only
    if len(existing_ordinal) > 0:
        if is_train:
            df[existing_ordinal] = ordinal_encoder.fit_transform(df[existing_ordinal])
        else:
            df[existing_ordinal] = ordinal_encoder.transform(df[existing_ordinal])
        # Replace unknown encodings (-1) with 0
        df.replace(-1, 0, inplace=True)

    # 6) One-Hot Encoding for nominal features
    existing_nominal = [col for col in nominal_features if col in df.columns]

    if len(existing_nominal) > 0:
        if is_train:
            # Fit on train
            encoded = one_hot_encoder.fit_transform(df[existing_nominal])
        else:
            # Transform test
            encoded = one_hot_encoder.transform(df[existing_nominal])

        # Convert to DataFrame
        encoded_df = pd.DataFrame(
            encoded,
            columns=one_hot_encoder.get_feature_names_out(existing_nominal),
            index=df.index
        )
        # Drop original nominal cols, then join
        df.drop(columns=existing_nominal, inplace=True)
        df = df.join(encoded_df)

    # Return final results
    if is_train:
        return df, y
    else:
        return df


In [3]:
df_train = pd.read_csv("train.csv")
X_train, y_train = preprocess_data(df_train, is_train=True)

In [4]:
df_test = pd.read_csv("test.csv")
test_ids = df_test['Id'].copy()
X_test = preprocess_data(df_test, is_train=False)

In [5]:
print(f"New train shape: {X_train.shape}, test shape: {X_test.shape}")

New train shape: (1452, 230), test shape: (1459, 230)


## Random Forest

In [6]:
# Example parameter distribution:
param_dist = {
    'n_estimators': randint(100, 2000),        # Number of trees
    'max_depth': randint(1, 20),             # Maximum tree depth
    'min_samples_split': randint(2, 20),     # Minimum samples required to split an internal node
    'min_samples_leaf': randint(1, 10),      # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2', None], # Number of features to consider at each split
    'criterion': ['squared_error', 'absolute_error'] # For regression
}

In [7]:
# Initialize the RandomForestRegressor
rf = RandomForestRegressor(bootstrap=True,oob_score=True, random_state=42)


In [8]:
cv_dummy = [(np.arange(len(X_train)), np.arange(len(X_train)))]
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=1,
    verbose=2,
    return_train_score=True,
    cv=cv_dummy
)

In [9]:
random_search.fit(X_train, y_train)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV] END criterion=squared_error, max_depth=15, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   3.7s
[CV] END criterion=squared_error, max_depth=11, max_features=None, min_samples_leaf=8, min_samples_split=5, n_estimators=971; total time=  16.5s
[CV] END criterion=absolute_error, max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=3, n_estimators=443; total time=   4.4s
[CV] END criterion=absolute_error, max_depth=6, max_features=log2, min_samples_leaf=5, min_samples_split=2, n_estimators=559; total time=   7.6s
[CV] END criterion=absolute_error, max_depth=12, max_features=sqrt, min_samples_leaf=1, min_samples_split=11, n_estimators=1599; total time=  48.7s
[CV] END criterion=absolute_error, max_depth=16, max_features=None, min_samples_leaf=3, min_samples_split=13, n_estimators=1690; total time= 8.5min
[CV] END criterion=absolute_error, max_depth=3, max_features=sq

In [10]:
# Get the best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_
best_rmse = -random_search.best_score_
print("Ridge Regression best params:", best_params)
print("Ridge Regression best RMSE on validation set: {:.4f}".format(best_rmse))

Ridge Regression best params: {'criterion': 'absolute_error', 'max_depth': 17, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 741}
Ridge Regression best RMSE on validation set: 10127.9483


In [11]:
# Train a final RandomForestRegressor using the best parameters
final_model = RandomForestRegressor(**best_params,bootstrap=True,oob_score=True, random_state=42)
final_model.fit(X_train, y_train)

In [12]:
# Generate predictions for the test set
test_predictions = final_model.predict(X_test)

In [13]:
# Create a submission dataframe (assuming the sample_submission.csv expects 'Id' and 'SalePrice')
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv('submission_Random_forest.csv', index=False)

print("Submission file 'submission.csv' created successfully!")

Submission file 'submission.csv' created successfully!


## XGBOOST

In [14]:
X, X_val, y, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [15]:
param_grid = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 0.5),   # or (0.5, 0.5) means range [0.5..1.0]
    'colsample_bytree': uniform(0.5, 0.5),
}

In [16]:
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    # If you want bagging-like behavior in each tree:
    # subsample=0.8, colsample_bytree=0.8,  # or tune them in param_grid
)

In [17]:
# Set up RandomizedSearchCV using negative RMSE as the scoring metric
random_search_xgb = RandomizedSearchCV(estimator=xgb_reg,
                                   param_distributions=param_grid,
                                   n_iter=50,
                                   cv=3,
                                   scoring='neg_root_mean_squared_error',
                                   random_state=42,
                                   n_jobs=1,
                                   verbose=2)

In [18]:
# Run the search
random_search_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.2952142919229748, max_depth=11, n_estimators=121, subsample=0.7993292420985183; total time=   2.7s
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.2952142919229748, max_depth=11, n_estimators=121, subsample=0.7993292420985183; total time=   1.3s
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.2952142919229748, max_depth=11, n_estimators=121, subsample=0.7993292420985183; total time=   1.3s
[CV] END colsample_bytree=0.5780093202212182, learning_rate=0.05679835610086079, max_depth=11, n_estimators=137, subsample=0.6668543055695109; total time=   1.4s
[CV] END colsample_bytree=0.5780093202212182, learning_rate=0.05679835610086079, max_depth=11, n_estimators=137, subsample=0.6668543055695109; total time=   1.1s
[CV] END colsample_bytree=0.5780093202212182, learning_rate=0.05679835610086079, max_depth=11, n_estimators=137, subsample=0.666854

In [19]:
# Get the best parameters and model
best_params_xgb = random_search_xgb.best_params_
best_model_xgb = random_search_xgb.best_estimator_
best_rmse_xgb = -random_search.best_score_
print("Ridge Regression best params:", best_params_xgb)
print("Ridge Regression best RMSE on validation set: {:.4f}".format(best_rmse_xgb))

Ridge Regression best params: {'colsample_bytree': 0.6629794526009424, 'learning_rate': 0.07607231426966449, 'max_depth': 3, 'n_estimators': 196, 'subsample': 0.6743329936458646}
Ridge Regression best RMSE on validation set: 10127.9483


In [20]:
# Evaluate on the validation set
# val_predictions = best_model.predict(X_val)
# rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
# print("Best Parameters:", best_params)
# print("Validation RMSE:", rmse_val)

In [21]:
# Train the final XGBRegressor using the best parameters
final_model_xgb = xgb.XGBRegressor(**best_params_xgb, objective='reg:squarederror', random_state=42)
final_model_xgb.fit(X_train, y_train)

In [22]:
# Generate predictions for the test set
test_predictions_xgb = final_model_xgb.predict(X_test)

In [23]:
# Create a submission dataframe (assumes sample_submission.csv expects 'Id' and 'SalePrice')
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions_xgb})
submission.to_csv('submission_XGBoost.csv', index=False)

print("Submission file 'submission_XGBoost.csv' created successfully!")

Submission file 'submission_XGBoost.csv' created successfully!


## Linear Regression

In [24]:
param_dist_ridge = {
    'alpha': uniform(0.01, 10)   # Sample alpha between 0.01 and 10
}

In [25]:
ridge = Ridge()

random_search_ridge = RandomizedSearchCV(
    estimator=ridge,
    param_distributions=param_dist_ridge,
    n_iter=50,
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=1,
    verbose=2,
    return_train_score=True
)


In [26]:
random_search_ridge.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ...........................alpha=3.7554011884736247; total time=   0.0s
[CV] END ...........................alpha=3.7554011884736247; total time=   0.0s
[CV] END ...........................alpha=3.7554011884736247; total time=   0.0s
[CV] END ...........................alpha=3.7554011884736247; total time=   0.0s
[CV] END ...........................alpha=3.7554011884736247; total time=   0.0s
[CV] END .............................alpha=9.51714306409916; total time=   0.0s
[CV] END .............................alpha=9.51714306409916; total time=   0.0s
[CV] END .............................alpha=9.51714306409916; total time=   0.0s
[CV] END .............................alpha=9.51714306409916; total time=   0.0s
[CV] END .............................alpha=9.51714306409916; total time=   0.0s
[CV] END ...........................alpha=7.3299394181140505; total time=   0.0s
[CV] END ...........................alpha=7.329

In [27]:
# Best hyperparameters and score (convert negative RMSE to RMSE)
best_params_ridge = random_search_ridge.best_params_
best_rmse_ridge = -random_search_ridge.best_score_
print("Ridge Regression best params:", best_params_ridge)
print("Ridge Regression best RMSE on validation set: {:.4f}".format(best_rmse_ridge))

Ridge Regression best params: {'alpha': 9.709098521619943}
Ridge Regression best RMSE on validation set: 25950.1214


In [28]:
# Train final model on the original training set using best hyperparameters
final_ridge = Ridge(**best_params_ridge)
final_ridge.fit(X_train, y_train)

In [29]:
# Generate predictions for the test set
test_predictions_ridge = final_ridge.predict(X_test)

In [30]:
# Create a submission dataframe (assumes sample_submission.csv expects 'Id' and 'SalePrice')
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions_ridge})
submission.to_csv('submission_ridge.csv', index=False)

print("Submission file 'submission_ridge.csv' created successfully!")

Submission file 'submission_ridge.csv' created successfully!


## SVM

In [31]:
#param_dist_svr = {
#    'C': uniform(0.1, 10),
#    'epsilon': uniform(0.01, 1),
#    'kernel': ['rbf', 'linear']
#}

In [32]:
param_grid_svr = {
    'C': [0.1, 1, 3, 10],
    'epsilon': [0.01, 0.1, 0.3, 1],
    'kernel': ['rbf', 'linear']
}

In [33]:
svr = SVR()

In [34]:
# random_search_svr = RandomizedSearchCV(
#    estimator=svr,
#    param_distributions=param_dist_svr,
#    n_iter=50,
#    cv=5,
#    scoring='neg_root_mean_squared_error',
#    random_state=42,
#    n_jobs=-1,
#    verbose=2,
#    return_train_score=True
#)

In [35]:
# Set up GridSearchCV
grid_search_svr = GridSearchCV(
    estimator=svr,
    param_grid=param_grid_svr,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2
)

In [36]:
grid_search_svr.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] END ....................C=0.1, epsilon=0.01, kernel=rbf; total time=   0.2s
[CV] END ....................C=0.1, epsilon=0.01, kernel=rbf; total time=   0.2s
[CV] END ....................C=0.1, epsilon=0.01, kernel=rbf; total time=   0.1s
[CV] END .................C=0.1, epsilon=0.01, kernel=linear; total time=   1.1s
[CV] END .................C=0.1, epsilon=0.01, kernel=linear; total time=   0.6s
[CV] END .................C=0.1, epsilon=0.01, kernel=linear; total time=   1.1s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   0.2s
[CV] END ..................C=0.1, epsilon=0.1, kernel=linear; total time=   1.4s
[CV] END ..................C=0.1, epsilon=0.1, kernel=linear; total time=   0.7s
[CV] END ..................C=0.1, epsilon=0.1, k

In [37]:
# Fit the randomized search on the combined dataset
# random_search_svr.fit(X_train, y_train)

In [38]:
# best_params_svr = random_search_svr.best_params_
# best_rmse_svr = -random_search_svr.best_score_
# print("SVR best params:", best_params_svr)
# print("SVR best RMSE on validation set: {:.4f}".format(best_rmse_svr))

In [39]:
best_params_svr = grid_search_svr.best_params_
best_rmse_svr = -grid_search_svr.best_score_
print("Best Parameters:", best_params_svr)
print("Best RMSE:", best_rmse_svr)

Best Parameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'linear'}
Best RMSE: 33373.708739748916


In [40]:
final_svr = SVR(**best_params_svr)
final_svr.fit(X_train, y_train)

In [41]:
# Generate predictions for the test set
test_predictions_svr = final_svr.predict(X_test)

In [42]:

submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions_svr})
submission.to_csv('submission_svr.csv', index=False)

print("Submission file 'submission_svr.csv' created successfully!")

Submission file 'submission_svr.csv' created successfully!


## Final Results

In [43]:
best_models = {
    'Random Forest': random_search.best_estimator_,
    'XGBoost': random_search_xgb.best_estimator_,
    'Linear Regression': random_search_ridge.best_estimator_,
    'SVM': grid_search_svr.best_estimator_
}

In [44]:
# Prepare a list to store results
results = []

# Evaluate each model on the validation set
for model_name, model in best_models.items():
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    # Convert the best params dictionary to a formatted string for display
    best_params = model.get_params()
    best_params_str = ", ".join([f"{k}={v}" for k, v in best_params.items() if k in best_models[model_name].get_params()])

    results.append({
        'Model': model_name,
        'RMSE': rmse,
        'Best Parameters': best_params_str
    })

# Create a DataFrame to display the results in a table format
results_df = pd.DataFrame(results)
print(results_df)

               Model          RMSE  \
0      Random Forest   9280.205315   
1            XGBoost  13002.850149   
2  Linear Regression  21946.317519   
3                SVM  26600.682989   

                                     Best Parameters  
0  bootstrap=True, ccp_alpha=0.0, criterion=absol...  
1  objective=reg:squarederror, base_score=None, b...  
2  alpha=9.709098521619943, copy_X=True, fit_inte...  
3  C=10, cache_size=200, coef0=0.0, degree=3, eps...  


## Final Results

In [45]:
models = {
    "Random Forest": (random_search.best_estimator_, random_search.best_params_),
    "XGBoost": (random_search_xgb.best_estimator_, random_search_xgb.best_params_),
    "Linear Regression": (random_search_ridge.best_estimator_, random_search_ridge.best_params_),
    "SVM": (grid_search_svr.best_estimator_, grid_search_svr.best_params_)
}

# Prepare a list to store the evaluation results
results = []

# Evaluate each model on the validation set
for model_name, (model, best_params) in models.items():
    # Predict on the validation set
    y_pred = model.predict(X_val)
    # Compute RMSE (ensure it's positive)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    # Append the model results, including the best parameters dictionary
    results.append({
        "Model": model_name,
        "RMSE": rmse,
        "Best Parameters": best_params
    })

# Create a DataFrame to display the results in a neat table
results_df = pd.DataFrame(results)
print(results_df)

               Model          RMSE  \
0      Random Forest   9280.205315   
1            XGBoost  13002.850149   
2  Linear Regression  21946.317519   
3                SVM  26600.682989   

                                     Best Parameters  
0  {'criterion': 'absolute_error', 'max_depth': 1...  
1  {'colsample_bytree': 0.6629794526009424, 'lear...  
2                       {'alpha': 9.709098521619943}  
3     {'C': 10, 'epsilon': 0.01, 'kernel': 'linear'}  
