In [44]:
from sklearn.linear_model import (
    LogisticRegression,
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    AdaBoostClassifier,
    AdaBoostRegressor,
)
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from xgboost import XGBRegressor

In [45]:
# Checking Performance of Linear Machine Learning Models: including prediction of new data

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def check_model_performance_linear(ModelClasses, data, dependent_var, drop_columns=[], test_size=0.2, scaler=MinMaxScaler(), new_data=None, random_state=None):
    """
    Check the performance of multiple models on a dataset, and predict values for new data.

    Args:
        ModelClasses (list): A list of model classes to evaluate.
        data (pandas.DataFrame): The dataset to use for training and evaluation.
        dependent_var (str): The name of the dependent variable column.
        drop_columns (list, optional): A list of column names to drop from the independent variables. Defaults to [].
        test_size (float, optional): The proportion of the data to use for testing. Defaults to 0.2.
        scaler (object, optional): The scaler to use for scaling the independent variables. Defaults to MinMaxScaler().
        new_data (pandas.DataFrame, optional): New data for which to predict values. Defaults to None.
        random_state (int, optional): The random seed to use for splitting the data into training and testing sets. Defaults to None.

    Returns:
        pandas.DataFrame: A dataframe with the performance metrics for each model, and predicted values for new data.
    """
    # Define the independent and dependent variables
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    # Scale the independent variables
    X_scaled = scaler.fit_transform(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=random_state)

    
    # for-loop for all Linear Supervised Learning Models
    results = []
    for ModelClass in ModelClasses:
        model_name = ModelClass.__name__
        
        # Triang Models 
        model = ModelClass().fit(X_scaled, y)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        
        # Calculate R^2 
        r2_train = r2_score(y_train, y_pred_train)
        r2_test = r2_score(y_test, y_pred_test)
        
        # Calculte RMSE
        rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
        rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
        
        # Calculate AIC
        n = len(y)
        k = len(X.columns) + 1  # Add 1 for the intercept term
        aic = n * np.log(np.mean((y - model.predict(X_scaled)) ** 2)) + 2 * k
        
        # Calculate CV RMSE
        cv_results = cross_val_score(model, X_scaled, y, cv=5, scoring=make_scorer(mean_squared_error, squared=False))
        cv_rmse_mean = cv_results.mean()
        cv_rmse_std = cv_results.std()

        # Predict values for new data
        if new_data is not None:
            X_new = new_data.drop(drop_columns, axis=1)
            X_new_scaled = scaler.transform(X_new)
            y_pred_new = model.predict(X_new_scaled)
            new_data_with_predictions = pd.concat([new_data, pd.Series(y_pred_new, name='predicted_'+dependent_var)], axis=1)
        else:
            new_data_with_predictions = None

        
        # Wrapping up Performance Metrics
        results.append(
            {
                "Model": model_name,
                "Train R^2": r2_train,
                "Test R^2": r2_test,
                "Train RMSE": rmse_train,
                "Test RMSE": rmse_test,
                "AIC": aic,
                "CV RMSE Mean": cv_rmse_mean,
                "CV RMSE Std": cv_rmse_std,
                "Predictions for New Data": new_data_with_predictions
            }
        )
    return pd.DataFrame(results)

In [46]:
# modified version of grid_search_best_regression_models_updated

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
import os
# Regression models
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso,
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

def ML_Regression_models_with_GridSearch(ModelClasses, hyperparameters, data, dependent_var, drop_columns=[], 
                                       new_data=None, test_size=0.2, random_state=42, scaler=StandardScaler(), 
                                       scoring = 'neg_mean_squared_error', cv=3, pca=None, save_model=False, save_dir=None):
    """
    This function performs grid search for the best classification models.

    Parameters:
    Modelclasses (list): A list of classifier classes.
    
    Example: use the following code to create a list of model classes
    ModelClasses = [
        LinearRegression,
        Ridge,
        Lasso,
        DecisionTreeRegressor,
        RandomForestRegressor,
        GradientBoostingRegressor,
        AdaBoostRegressor,
        SVR,
        KNeighborsRegressor,
        XGBRegressor
    ]
    
    hyperparameters (dict): Hyperparameters to be used in grid search.
    
    Example: use the following code to create a dictionary of hyperparameters
    hyperparameters = {
     'LinearRegression': {},
    'Ridge': {
        'alpha': [0.1, 1, 10]
    },
    'Lasso': {
        'alpha': [0.1, 1, 10]
    },
    'DecisionTreeRegressor': {
        'max_depth': [2, 4, 8],
        'min_samples_leaf': [1, 2, 5]
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 150],
        'max_depth': [2, 4, 8]
    },
    'GradientBoostingRegressor': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 4]
    },
    'AdaBoostRegressor': {
        'n_estimators': [25, 50, 100],
        'learning_rate': [0.5, 1, 2]
    },
    'SVR': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'KNeighborsRegressor': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    },
    'XGBRegressor': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 4, 6],
        'subsample': [0.5, 0.75, 1],
    }
}

    
    data (pd.DataFrame): A DataFrame containing the data.
    dependent_var (str): Column name of the dependent variable.
    drop_columns (list, optional): Column names to be excluded from the analysis.
    new_data (pd.DataFrame, optional): DataFrame containing the new data.
    test_size (float, optional): Test set size for splitting (default is 0.2).
    random_state (int, optional): Random seed for reproducibility (default is 42).
    scaler (scikit-learn scaler, optional): Scaler object for feature scaling (default is MinMaxScaler(feature_range=(0, 1))).
    scoring (str, optional): Scoring method for cross-validation (default is 'accuracy').
    cv (int, optional): Number of folds for cross-validation (default is 5).    
    pca (int, optional): Number of components for PCA (default is None).

    Returns:
    pd.DataFrame: DataFrame containing model performance metrics.    
    """
    
    if drop_columns is None:
        drop_columns = []

    if not isinstance(ModelClasses, list):
        raise TypeError("ModelClasses must be a list of classifier classes.")
    if not isinstance(hyperparameters, dict):
        raise TypeError("hyperparameters must be a dictionary.")
    if not isinstance(data, pd.DataFrame):
        raise TypeError("data must be a pandas DataFrame.")
    if not isinstance(drop_columns, list):
        raise TypeError("drop_columns must be a list of column names.")
     
    # 1. Load and preprocess data
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Demension Reduction using PCA (Prinscipal Component Analysis)
    if pca is not None:
        pca_transformer = PCA(n_components=pca)
        X_train = pca_transformer.fit_transform(X_train)
        X_test = pca_transformer.transform(X_test)

    # Scaling
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # 2. Run regression models
    results = []
    
    for ModelClass in ModelClasses:
        # Perform grid search
        grid_search = GridSearchCV(ModelClass(), hyperparameters[ModelClass.__name__], scoring=scoring, cv=cv)
        grid_search.fit(X_train_scaled, y_train)

        # Predict using best model
        best_model = grid_search.best_estimator_
        y_train_pred = best_model.predict(X_train_scaled)
        y_test_pred = best_model.predict(X_test_scaled)

        # Calculate R2, Train RMSE, Test RMSE, CV RMSE, and CV RMSE Std
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        cv_rmse = np.sqrt(-grid_search.best_score_)
        cv_rmse_std = np.sqrt(grid_search.cv_results_['std_test_score'][grid_search.best_index_])

        # Predict values for new data
        if new_data is not None:
            new_X = new_data.drop([dependent_var] + drop_columns, axis=1)
            new_y = new_data[dependent_var]

            if pca is not None:
                new_X = pca_transformer.transform(new_X)
                
            new_X_scaled = scaler.transform(new_X)
            new_y_pred = np.round(best_model.predict(new_X_scaled), 3)
            new_y_residual = np.round(new_y - new_y_pred, 3)

        else:
            new_y_pred = None
            new_y_residual = None

        # Save the best model for the current ModelClass if save_model is True
        if save_model:
            model_file_name = f"{ModelClass.__name__}_best_regression_ML_model.joblib"
            if save_dir is not None:
                os.makedirs(save_dir, exist_ok=True)
                model_file_path = os.path.join(save_dir, model_file_name)
            else:
                model_file_path = model_file_name

            dump(best_model, model_file_path)
            print(f"Best model saved to {model_file_path}")
            
        # Store results
        result = {
            "Title": "ML Regression Models with Grid Search",
            "Model": ModelClass.__name__,
            "Train R²": round(train_r2,3),
            "Test R²": round(test_r2,3),
            "Train RMSE": round(train_rmse,3),
            "Test RMSE": round(test_rmse, 3),
            "CV RMSE": round(cv_rmse, 3),
            "CV RMSE Std": round(cv_rmse_std,3),
            "Best Hyperparameters": grid_search.best_params_,
            "New Prediction": new_y_pred,
            "New Residual": new_y_residual
        }

        results.append(result)

    return pd.DataFrame(results)


In [47]:
# P-values: mathematical computation of feature importace for linear models
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import t


# P-values: mathematical computation of feature importace for linear models
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import statsmodels.api as sm


def check_p_values(data, dependent_var, drop_columns=[]):
    """
    Check the p-values of the coefficients for each independent variable in a single model.

    Args:
        data (pandas.DataFrame): The dataset to use for training and evaluation.
        dependent_var (str): The name of the dependent variable column.
        drop_columns (list, optional): A list of column names to drop from the independent variables. Defaults to [].

    Returns:
        pandas.DataFrame: A dataframe with the p-values for each independent variable.
    """
    # Define the independent and dependent variables
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    X_ols = sm.add_constant(X_scaled)
    model_ols = sm.OLS(y, X_ols).fit()
    p_values = model_ols.pvalues[1:]

    results_df = pd.DataFrame({'Variable': list(X.columns), 'P-Value': [f'{p:.2f}' for p in p_values], 'Coefficient': model_ols.params[1:].round(2)})

    return results_df.set_index('Variable')

In [48]:
# Plots for checking assumptions (linearity, normality, homoscedasticity) for Supervised Learning models_improved

import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.stats.diagnostic import het_breuschpagan
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA


def plot_for_checking_assumptions(ModelClasses, data, dependent_var, drop_columns=[], scaler=MinMaxScaler(feature_range=(0,1)), pca=None, savefig=None):
    """
    Create diagnostic plots to check the assumptions of multiple regression models.

    Args:
        ModelClasses (list): A list of model classes to evaluate.
        data (pandas.DataFrame): The dataset to use for training and evaluation.
        dependent_var (str): The name of the dependent variable column.
        drop_columns (list, optional): A list of column names to drop from the independent variables. Defaults to [].
        scaler (object, optional): The scaler to use for scaling the independent variables. Defaults to MinMaxScaler().
        pca (PCA, optional): A PCA object to use for dimensionality reduction. Defaults to None.
        savefig (str, optional): The filename to save the plot to. Defaults to None.

    Returns:
        None
    """

    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    X_scaled = scaler.fit_transform(X)
    

    # Apply PCA if specified
    if pca is not None:
        pca.fit(X_scaled)
        X_scaled = pca.transform(X_scaled)

    fig, axes = plt.subplots(nrows=len(ModelClasses), ncols=3, figsize=(16, 4 * len(ModelClasses)))
    fig.subplots_adjust(hspace=0.5, wspace=0.3)
    axes = axes.ravel()  # convert to a 1D array of Axes objects
    
    for i, ModelClass in enumerate(ModelClasses):
        model = ModelClass().fit(X_scaled, y)
        y_pred = model.predict(X_scaled)
        residuals = y - y_pred
        
        # Linearity plot
        sns.regplot(x=y_pred, y=residuals, lowess=True, ax=axes[i*3])
        axes[i*3].set_xlabel('Predicted Values', y=-0.2)
        axes[i*3].set_ylabel('Residuals')
        axes[i*3].set_title(f'{ModelClass.__name__} Linearity Plot')
        axes[i*3].title.set(y=1.05)
        # Normality plot
        qq = ProbPlot(residuals)
        qq.qqplot(line='s', ax=axes[i*3+1])
        axes[i*3+1].set_xlabel('Theoretical Quantiles', y=-0.2)
        axes[i*3+1].set_ylabel('Standardized Residuals')
        axes[i*3+1].set_title(f'{ModelClass.__name__} Normality Plot')
        axes[i*3+1].title.set(y=1.05)
        # Homoscedasticity plot
        bp_test = het_breuschpagan(residuals, X)
        sns.regplot(x=y_pred, y=residuals ** 2, lowess=True, ax=axes[i*3+2])
        axes[i*3+2].set_xlabel('Predicted Values', y=-0.2)
        axes[i*3+2].set_ylabel('Squared Residuals')
        axes[i*3+2].set_title(f'{ModelClass.__name__} Homoscedasticity Plot')
        axes[i*3+2].text(0.1, 0.9, f'p-value: {bp_test[1]}', transform=axes[i*3+2].transAxes)
        axes[i*3+2].title.set(y=1.05)
    
    plt.tight_layout()
    
    if savefig:
        plt.savefig(savefig)
    else:
        plt.show()

In [49]:
# Calculate VIF for Checking Multicollinarity of Each Independent Variable_Improved

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, MinMaxScaler


def calculate_vif(data, dependent_var, drop_columns=[], vif_threshold=5, scaler=StandardScaler()):
    """Calculate the variance inflation factor (VIF) for each independent variable in a dataset.

    Args:
        data (pandas.DataFrame): The dataset to use for calculation.
        dependent_var (str): The name of the dependent variable column.
        drop_columns (list, optional): A list of column names to drop from the independent variables. Defaults to [].
        vif_threshold (float, optional): The threshold for detecting high multicollinearity. Defaults to 5.
        scaler (object, optional): The scaler to use for scaling the independent variables. Defaults to StandardScaler().

    Returns:
        pandas.DataFrame: The VIF values for each independent variable.
    """
    # Data preparation by dropping columns
    X = data.drop([dependent_var] + drop_columns, axis=1)
    
    # Scaling of Data
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    vif = pd.DataFrame()
    vif['Variable'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X_scaled.values, i) for i in range(X.shape[1])]
        
    multicollinear_columns = vif[vif['VIF'] > vif_threshold]['Variable'].tolist()
    
    if multicollinear_columns:
        print(f"The following variables have high multicollinearity with threshold={vif_threshold}:\n", multicollinear_columns)
    else:
        print("No variables have high multicollinearity.")
        
    return vif

In [50]:
# Data Preparation

import pandas as pd
import numpy as np

def impute_nulls(df):
    """
    Impute null values in a Pandas DataFrame based on the data type of each column.
    - For float columns, impute with the mean.
    - For integer columns, impute with the median.
    - For object columns, impute with the mode.
    - For datetime columns, impute with the most recent or most frequent date.
    - For timedelta columns, impute with the mode.
    - For bool columns, impute with the mode.
    - For category columns, impute with the mode.
    - For complex columns, impute with the mean.
    """
    # Get data types of all columns
    dtypes = df.dtypes

    # Iterate over all columns
    for col in df.columns:
        # Check if column contains null values
        if df[col].isnull().sum() > 0:
            # Get data type of column
            dtype = dtypes[col]
            # Impute null values based on data type
            if dtype == 'float64' or dtype == 'float32' or dtype == 'float16':
                df[col].fillna(df[col].mean(), inplace=True)
            elif dtype == 'int64' or dtype == 'int32' or dtype == 'int16' or dtype == 'int8':
                df[col].fillna(df[col].median(), inplace=True)
            elif dtype == 'object':
                df[col].fillna(df[col].mode()[0], inplace=True)
            elif dtype == 'datetime64':
                df[col].fillna(method='bfill', inplace=True)
            elif dtype == 'timedelta64':
                df[col].fillna(df[col].mode()[0], inplace=True)
            elif dtype == 'bool':
                df[col].fillna(df[col].mode()[0], inplace=True)
            elif dtype.name == 'category':
                df[col].fillna(df[col].mode()[0], inplace=True)
            elif dtype == 'complex64' or dtype == 'complex128':
                df[col].fillna(df[col].mean(), inplace=True)
    return df

In [51]:
from scipy import stats

def drop_outliers_by_zscores(data, column, lower_zscore, upper_zscore, inplace=False):
    """
    Drops rows from a Pandas DataFrame based on z-scores of a given column.

    Parameters:
    data (pandas.DataFrame): The input data.
    column (str): The name of the column to use for computing z-scores.
    lower_zscore (float): The lower z-score boundary.
    upper_zscore (float): The upper z-score boundary.
    inplace (bool): If True, updates the DataFrame directly. If False, returns a new DataFrame with outliers dropped.

    Returns:
    pandas.DataFrame or None: The modified DataFrame with outliers dropped, if inplace is False;
                              None, if inplace is True.
    """
    # Check input arguments
    if column not in data.columns:
        raise ValueError("Column '%s' not found in data." % column)
    if not np.isfinite(lower_zscore):
        raise ValueError("Lower z-score boundary must be finite.")
    if not np.isfinite(upper_zscore):
        raise ValueError("Upper z-score boundary must be finite.")

    # Compute z-scores
    z_scores = pd.Series(stats.zscore(data[column]), index=data.index)

    # Drop outliers outside boundaries
    mask = (z_scores >= upper_zscore) | (z_scores <= lower_zscore)
    
    if inplace:
        data.drop(data[mask].index, inplace=True)
        return None
    else:
        return data.loc[~mask]

In [30]:
df = pd.read_csv('/home/young78703/Data_Science_Project/data/melb_data.csv')
df.rename(columns={'Longtitude':'Longitude'},inplace=True)
impute_nulls(df)
drop_outliers_by_zscores(df, 'Price', -3.5, 3.5, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13439 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13439 non-null  object 
 1   Address        13439 non-null  object 
 2   Rooms          13439 non-null  int64  
 3   Type           13439 non-null  object 
 4   Price          13439 non-null  float64
 5   Method         13439 non-null  object 
 6   SellerG        13439 non-null  object 
 7   Date           13439 non-null  object 
 8   Distance       13439 non-null  float64
 9   Postcode       13439 non-null  float64
 10  Bedroom2       13439 non-null  float64
 11  Bathroom       13439 non-null  float64
 12  Car            13439 non-null  float64
 13  Landsize       13439 non-null  float64
 14  BuildingArea   13439 non-null  float64
 15  YearBuilt      13439 non-null  float64
 16  CouncilArea    13439 non-null  object 
 17  Lattitude      13439 non-null  float64
 18  Longit

In [41]:
ModelClasses = [
    LinearRegression,
    Ridge,
    Lasso,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    SVR,
    KNeighborsRegressor,
    XGBRegressor
]
check_model_performance_linear(ModelClasses, df, 'Price', drop_columns=['Suburb', 'Address','Type','Method', 'Bedroom2', 'SellerG','Date','Postcode', 'CouncilArea', 'Lattitude',
   'Longitude', 'Regionname'])

Unnamed: 0,Model,Train R^2,Test R^2,Train RMSE,Test RMSE,AIC,CV RMSE Mean,CV RMSE Std,Predictions for New Data
0,LinearRegression,0.446266,0.4743,408543.263528,404829.680746,347242.517509,451496.072889,84279.24105,
1,Ridge,0.445618,0.474405,408782.502643,404789.174004,347254.622795,410951.76704,10056.270271,
2,Lasso,0.446269,0.474291,408542.397262,404833.322226,347242.519407,450822.444177,82935.316483,
3,DecisionTreeRegressor,0.997915,0.998385,25069.961658,22438.434776,271731.087909,436645.140051,20133.158487,
4,RandomForestRegressor,0.961619,0.964847,107558.598016,104684.581158,311278.409799,316051.504448,20594.81439,
5,GradientBoostingRegressor,0.684394,0.692974,308432.487331,309379.095003,339752.498029,333510.740139,14078.456391,
6,AdaBoostRegressor,0.366827,0.397019,436866.667504,433566.107433,349052.365158,448495.43056,16822.883903,
7,SVR,-0.066026,-0.072321,566854.230451,578183.571728,356201.99731,569206.230449,37404.790891,
8,KNeighborsRegressor,0.70579,0.714399,297794.351134,298389.082019,338803.313232,401477.709266,20455.17805,
9,XGBRegressor,0.905419,0.906305,168845.28662,170907.328834,323607.341052,278048.432977,18984.426407,


LinearRegression, Ridge, and Lasso have similar performance metrics, which makes sense as they are variations of linear models. None of them explain more than 50% of the variance in the dependent variable.
DecisionTreeRegressor seems to perform exceptionally well on the training data but this could be a sign of overfitting as decision trees tend to memorize the training data.
RandomForestRegressor and GradientBoostingRegressor are both ensemble methods and show a balanced performance on training and test data.
AdaBoostRegressor has lower R^2 values, indicating a lesser fit compared to the other models.
SVR (Support Vector Regression) performs poorly, with negative R^2 values indicating worse predictions than a simple average of the dependent variable.
KNeighborsRegressor has moderate R^2 values and could be a candidate for parameter tuning to improve performance.
XGBRegressor shows strong performance, with high R^2 values and lower RMSE, indicating a good balance between bias and variance.
In summary, the ensemble models, especially XGBRegressor, seem to perform best on this dataset. However, without knowing the context of the data and the scale of the dependent variable, these interpretations should be taken cautiously. It's also important to check for overfitting and ensure that the models are interpretable and applicable to the domain from which the data is drawn.

Key Metrics Explained
R^2 (Coefficient of Determination): Indicates how well the model explains the variability of the target variable. Values range from 0 to 1, with higher values indicating better model performance. It's a measure of the model's explanatory power.
RMSE (Root Mean Square Error): Measures the average magnitude of the errors between predicted and actual values, with lower values indicating better fit. It's a standard way to measure the error of a model in predicting quantitative data.
AIC (Akaike Information Criterion): Assesses the quality of a model relative to other models, considering both the complexity of the model and its fit to the data. Lower AIC values indicate a better model.
CV RMSE Mean (Cross-Validation RMSE Mean): The average RMSE across different folds in cross-validation, providing an estimate of the model's prediction error. Lower values suggest better generalization.
CV RMSE Std (Cross-Validation RMSE Standard Deviation): Reflects the variability of the model's prediction error across different folds in cross-validation. Lower values indicate more stable performance.
Model Performance Analysis
Linear Models (Linear Regression, Ridge, Lasso): Show moderate explanatory power with R^2 values around 0.447 to 0.469. Their RMSE values are relatively high, indicating less precise predictions.
Decision Tree Regressor: Exhibits excellent performance with R^2 values close to 1 and very low RMSE, indicating highly accurate predictions.
Random Forest and XGBRegressor: Both models demonstrate high R^2 values and relatively low RMSE, suggesting strong predictive accuracy and reliability.
Gradient Boosting Regressor and KNeighborsRegressor: Present moderate R^2 values and RMSE, indicating fair predictive performance.
AdaBoostRegressor and SVR: These models show lower R^2 values and higher RMSE, suggesting weaker predictive capabilities compared to other models.

In [52]:
    ModelClasses = [
    LinearRegression,
    Ridge,
    Lasso,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    SVR,
    KNeighborsRegressor,
    XGBRegressor]
    
    hyperparameters = {
     'LinearRegression': {},
    'Ridge': {
        'alpha': [0.1, 1, 10]
    },
    'Lasso': {
        'alpha': [0.1, 1, 10]
    },
    'DecisionTreeRegressor': {
        'max_depth': [2, 4, 8],
        'min_samples_leaf': [1, 2, 5]
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 150],
        'max_depth': [2, 4, 8]
    },
    'GradientBoostingRegressor': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 4]
    },
    'AdaBoostRegressor': {
        'n_estimators': [25, 50, 100],
        'learning_rate': [0.5, 1, 2]
    },
    'SVR': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'KNeighborsRegressor': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    },
    'XGBRegressor': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 4, 6],
        'subsample': [0.5, 0.75, 1],
    }
}

In [54]:
returns= ML_Regression_models_with_GridSearch(ModelClasses, hyperparameters, df, 'Price', drop_columns=['Suburb', 'Address','Type','Method', 'Bedroom2', 'SellerG','Date','Postcode', 'CouncilArea', 'Lattitude',
   'Longitude', 'Regionname'],new_data=None, test_size=0.2, random_state=42, scaler=StandardScaler(), scoring = 'neg_mean_squared_error', cv=3, pca=None, save_model=False, save_dir=None)
print(returns)
returns.to_csv('ML_Linear_Model_Results.csv', index=False)

                                   Title                      Model  Train R²  \
0  ML Regression Models with Grid Search           LinearRegression     0.454   
1  ML Regression Models with Grid Search                      Ridge     0.454   
2  ML Regression Models with Grid Search                      Lasso     0.454   
3  ML Regression Models with Grid Search      DecisionTreeRegressor     0.634   
4  ML Regression Models with Grid Search      RandomForestRegressor     0.708   
5  ML Regression Models with Grid Search  GradientBoostingRegressor     0.792   
6  ML Regression Models with Grid Search          AdaBoostRegressor     0.461   
7  ML Regression Models with Grid Search                        SVR     0.082   
8  ML Regression Models with Grid Search        KNeighborsRegressor     0.998   
9  ML Regression Models with Grid Search               XGBRegressor     0.881   

   Test R²  Train RMSE   Test RMSE     CV RMSE  CV RMSE Std  \
0    0.442  408789.228  403907.399  466445.99