In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

def compare_scaling(df, dependent_var, drop_columns=None, figsize=(20, 6)):
    """Standardize the feature columns of ``df`` and compare them with the originals.

    The features are every column of ``df`` except ``dependent_var`` and the
    ones named in ``drop_columns``. They are standardized with
    ``StandardScaler`` and then compared with the unscaled data in two ways:
    a 2-row grid of histograms (original on top, scaled below, one grid column
    per feature) and a printout of the mean and standard deviation of both
    versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. The columns left after dropping are handed to
        ``StandardScaler``, so they need to be numeric.
    dependent_var : str
        Name of the target column; it is excluded from scaling.
    drop_columns : iterable of str, optional
        Additional column names to exclude. ``None`` means no extra columns.
    figsize : tuple of (float, float), optional
        Size of the histogram figure, forwarded to ``plt.subplots``.

    Returns
    -------
    None
        The function only shows a figure and prints statistics.

    Raises
    ------
    KeyError
        If ``dependent_var`` or any entry of ``drop_columns`` is not a column
        of ``df`` (raised by ``DataFrame.drop``).
    ValueError
        If no feature column is left after dropping.
    """
    # list(...) lets callers pass a tuple/Index/None instead of only a list.
    excluded = [dependent_var] + list(drop_columns or [])
    X = df.drop(excluded, axis=1)
    if X.shape[1] == 0:
        raise ValueError("No feature columns left to scale after dropping columns.")

    # Data scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Create a DataFrame from the scaled dataset; keep the original row index
    # so scaled rows stay aligned with the rows of X.
    df_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    # Compare distributions visually with histograms.
    # squeeze=False keeps `axes` two-dimensional even when only one feature
    # remains; otherwise axes[0, i] fails on the 1-D array subplots returns.
    fig, axes = plt.subplots(
        nrows=2, ncols=len(X.columns), figsize=figsize, squeeze=False
    )
    for i, col in enumerate(X.columns):
        X[col].hist(ax=axes[0, i], alpha=0.5, color='blue', label='Original')
        df_scaled[col].hist(ax=axes[1, i], alpha=0.5, color='red', label='Scaled')
        axes[0, i].set_title(col)
        axes[0, i].legend()
        axes[1, i].legend()
    plt.show()

    # Compare descriptive statistics.
    # Note: describe() reports the sample std (ddof=1) while StandardScaler
    # divides by the population std (ddof=0), so the scaled std printed here
    # is slightly above 1 rather than exactly 1.
    original_stats = X.describe().loc[['mean', 'std']]
    scaled_stats = df_scaled.describe().loc[['mean', 'std']]
    print("Original Dataset Statistics:\n", original_stats)
    print("\nScaled Dataset Statistics:\n", scaled_stats)



In [None]:
# Example usage: compare raw vs. standardized features, with 'Price' as the
# target. `df` is not defined in this cell — presumably loaded in an earlier
# cell; verify before running on a fresh kernel.
dependent_var = 'Price'
# Columns excluded from scaling (dropped together with the target).
# NOTE(review): spellings such as 'Lattitude' are kept exactly as written —
# presumably they match the dataset's column names; confirm against df.columns.
drop_columns = [
    'Suburb', 'Address', 'Type', 'Method',
    'Bedroom2', 'SellerG', 'Date', 'Postcode',
    'CouncilArea', 'Lattitude', 'Longitude', 'Regionname',
    'year', 'month', 'day', 'dayofweek',
]
compare_scaling(df=df, dependent_var=dependent_var, drop_columns=drop_columns)