In [None]:
from IPython.display import display, HTML
# Inject a <style> rule that sets the `--jp-notebook-max-width` CSS variable to 100%,
# so the notebook cells are allowed to use the full width of the page.
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [None]:
import os
import json
import numpy as np
import pandas as pd
import joblib
from types import SimpleNamespace
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, Concatenate, ReLU
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers.legacy import Adam  # Use legacy optimizer for M1/M2 Macs
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, LearningRateScheduler
from tensorflow.keras.layers import Masking
import tensorflow as tf

from keras.saving import register_keras_serializable

In [None]:
# All persisted artefacts (models, training history) live under this directory.
MODEL_DIR = os.path.abspath(os.path.expanduser("models"))

# Settings are listed in a plain dict first; its insertion order is the order
# in which the namespace is displayed at the end of the cell.
_settings = {
    # Sentinel written into missing cells before scaling.
    "NA_FILL": -1,
    # Passed to `autoencoder.fit` as `epochs` / `batch_size`.
    "AUTO_ENCODER_EPOCHS": 100,
    "AUTO_ENCODER_BATCH_SIZE": 32,
    # NOTE(review): not referenced by the visible cells -- the optimizer is
    # built with `initial_lr` instead. Confirm whether this should be used.
    "AUTO_ENCODER_LEARNING_RATE": 0.001,
    "MODEL_DIR": MODEL_DIR,
    "AUTOENCODER_MODEL_PATH": os.path.join(MODEL_DIR, "autoencoder_model.keras"),
    "RF_MODEL_PATH": os.path.join(MODEL_DIR, "rf_imputer.joblib"),
    # When False, an artefact already on disk is loaded instead of retrained.
    "OVERWRITE_AUTOENCODER": False,
    "OVERWRITE_RF": False,
    # Passed to `autoencoder.fit` as `verbose`.
    "VERBOSE": 1,
    # Seed shared by the NaN injection, the imputer and the supervised splits.
    "RANDOM_STATE": 100,
    # Probability with which each eligible cell is replaced by NaN.
    "RANDOM_SAMPLE_FRACTION": 0.2,
    # Hold-out fraction for the supervised Random Forest comparison.
    "TEST_SIZE": 0.2,
    "HISTORY_FILE_PATH": os.path.join(MODEL_DIR, "autoencoder_training_history.json"),
}
cfg = SimpleNamespace(**_settings)

# Make sure the artefact directory exists before anything tries to write to it.
os.makedirs(cfg.MODEL_DIR, exist_ok=True)

# Last expression of the cell: show the effective configuration.
vars(cfg)

In [None]:
@register_keras_serializable()  # Register the custom layer
class NonNegativeOutputLayer(tf.keras.layers.Layer):
    """Clamp the leading feature columns at zero and pass the rest through.

    `tf.nn.relu` is applied to columns `[:num_fixed_features]` of the input;
    columns `[num_fixed_features:]` are left untouched. Both parts are
    concatenated again along axis 1, so the output has the input's shape.

    @param num_fixed_features: int or None, number of leading columns to clamp.
        When None (the default, and what configs saved before this argument
        existed deserialize to) the module-level `num_fixed_features` variable
        is looked up at call time, which is the original behaviour.
    @param kwargs: forwarded unchanged to `tf.keras.layers.Layer`.
    """

    def __init__(self, num_fixed_features=None, **kwargs):
        super().__init__(**kwargs)
        self.num_fixed_features = num_fixed_features

    def _resolve_num_fixed(self):
        """Return the split column index, preferring the value stored on the layer."""
        if self.num_fixed_features is not None:
            return self.num_fixed_features
        # Backward-compatible fallback: rely on the notebook-level global. This
        # raises NameError if the defining cell has not been run yet.
        return num_fixed_features

    def call(self, inputs):
        split = self._resolve_num_fixed()
        fixed_outputs = tf.nn.relu(inputs[:, :split])  # Enforce non-negative values for fixed features
        unrestricted_outputs = inputs[:, split:]  # Allow unrestricted values for lat/lon
        return tf.concat([fixed_outputs, unrestricted_outputs], axis=1)

    def get_config(self):
        # Store the split index so a reloaded layer carries its own setting
        # instead of depending on notebook state.
        config = super().get_config()
        config["num_fixed_features"] = self.num_fixed_features
        return config

In [None]:
# --- Load the California housing features and keep an untouched copy ---------
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
original_df = df.copy()  # ground truth used later to score the imputations

# --- Inject NaNs at random; the coordinate columns are never blanked ---------
np.random.seed(cfg.RANDOM_STATE)
nan_mask = np.random.rand(*df.shape) < cfg.RANDOM_SAMPLE_FRACTION
for protected in ('Longitude', 'Latitude'):
    nan_mask[:, df.columns.get_loc(protected)] = False
df[nan_mask] = np.nan

# --- Complete rows go to training, rows with any gap form the test set -------
has_missing = df.isna().any(axis=1)
train_df = df[~has_missing]
test_df = df[has_missing]

# --- Standardise -------------------------------------------------------------
# The scaler is fitted on the sentinel-filled frame, so the NA_FILL values
# contribute to the per-column mean and variance it learns.
scaler = StandardScaler()
df_filled = df.fillna(cfg.NA_FILL)
df_scaled = pd.DataFrame(scaler.fit_transform(df_filled), columns=df.columns)

# Model inputs: gaps are replaced by the sentinel before scaling, so neither
# array contains NaN (train_df has no gaps to begin with).
X_train, X_test = (
    scaler.transform(part.fillna(cfg.NA_FILL)) for part in (train_df, test_df)
)

# --- Columns clamped by the output layer: everything except the coordinates --
fixed_features = [name for name in df.columns if name not in ("Longitude", "Latitude")]
num_fixed_features = len(fixed_features)

# Learning-rate warm-up: the optimizer starts at `initial_lr` and the schedule
# below nudges the rate upwards during the first epochs.
initial_lr = 1e-5
def warmup(epoch, lr):
    """
    Schedule function for `LearningRateScheduler`.

    @param epoch: int, zero-based epoch index.
    @param lr: float, learning rate currently in effect.

    @return: `lr` raised by 20% of `initial_lr` for epochs 0-4, `lr` unchanged afterwards.
    """
    if epoch >= 5:
        return lr
    return lr + (initial_lr * 0.2)

# Autoencoder: 256 -> 128 -> 64 -> 32 (bottleneck) -> 64 -> 128 -> 256 -> input_dim
def _dense_stage(tensor, units):
    """Append Dense(relu) -> LayerNormalization -> Dropout(0.2) to `tensor`."""
    tensor = Dense(units, activation="relu")(tensor)
    tensor = LayerNormalization()(tensor)
    return Dropout(0.2)(tensor)

input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))

# Encoder: three regularised stages, then the bare 32-unit bottleneck
encoded = input_layer
for units in (256, 128, 64):
    encoded = _dense_stage(encoded, units)
encoded = Dense(32, activation="relu")(encoded)

# Decoder: mirror image of the encoder stages
decoded = encoded
for units in (64, 128, 256):
    decoded = _dense_stage(decoded, units)

# Linear projection back to the input width, followed by the custom clamp.
# NOTE(review): the model is trained on standardised targets, so the ReLU
# clamps at zero in *scaled* space rather than at raw zero -- confirm intended.
output_layer = Dense(input_dim)(decoded)
output_layer = NonNegativeOutputLayer()(output_layer)

# Assemble and compile
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=initial_lr), loss="mse")

# Callbacks
history_file_path = cfg.HISTORY_FILE_PATH
lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=1e-6)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_warmup = LearningRateScheduler(warmup, verbose=1)

# Train from scratch when requested or when no saved model exists; otherwise
# reuse the model on disk. Either way `history_data` ends up as a plain
# dict of float lists holding the per-epoch metrics for the plot below.
if cfg.OVERWRITE_AUTOENCODER or not os.path.exists(cfg.AUTOENCODER_MODEL_PATH):
    print("Training autoencoder with enhanced configuration...")
    # NOTE(review): X_test holds the rows with injected gaps (filled with
    # NA_FILL before scaling), so val_loss -- and therefore early stopping --
    # also scores reconstruction of the sentinel values. Confirm intended.
    history = autoencoder.fit(
        X_train, X_train,
        epochs=cfg.AUTO_ENCODER_EPOCHS,
        batch_size=cfg.AUTO_ENCODER_BATCH_SIZE,
        validation_data=(X_test, X_test),
        verbose=cfg.VERBOSE,
        callbacks=[lr_warmup, lr_reduction, early_stopping]
    )
    autoencoder.save(cfg.AUTOENCODER_MODEL_PATH)

    # Convert to built-in floats once, so the in-memory history is exactly what
    # a later run will read back from the JSON file.
    history_data = {k: [float(v) for v in values] for k, values in history.history.items()}
    with open(history_file_path, 'w') as f:
        json.dump(history_data, f)
else:
    print("Loading pre-trained autoencoder...")
    autoencoder = load_model(
        cfg.AUTOENCODER_MODEL_PATH,
        custom_objects={"NonNegativeOutputLayer": NonNegativeOutputLayer},
        compile=False
    )
    autoencoder.compile(optimizer=Adam(learning_rate=initial_lr), loss="mse")

    # The history file is written after the model; it can be absent if an
    # earlier run was interrupted or the file was removed. Fall back to empty
    # curves instead of failing the whole notebook.
    if os.path.exists(history_file_path):
        with open(history_file_path, 'r') as f:
            history_data = json.load(f)
    else:
        print(f"Training history not found at {history_file_path}; loss curves will be empty.")
        history_data = {"loss": [], "val_loss": []}

# Loss curves (log-scaled y axis) for the training run stored in `history_data`
plt.figure(figsize=(10, 6))
for metric_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history_data[metric_key], label=curve_label)
plt.title('Autoencoder Training and Validation Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.yscale("log")
plt.grid(True)
plt.legend()
plt.show()

# Run the rows with gaps through the autoencoder and map the reconstruction
# back to the original feature units.
print("Imputing missing values using the autoencoder...")
X_test_pred = autoencoder.predict(X_test)
pred_values = scaler.inverse_transform(X_test_pred)
pred_df = pd.DataFrame(pred_values, index=test_df.index, columns=df.columns)

In [None]:
# Show the autoencoder's reconstruction (inverse-scaled) of the rows that had injected gaps.
display(pred_df)

In [None]:
def find_embedding_layer(model):
    """
    Locate the bottleneck (embedding) layer of an autoencoder-style model.

    Layers are scanned in order, skipping index 0 (the input layer). The width
    of a layer is the second entry of its `output_shape`. The first time a
    layer is wider than the previously measured one, that previously measured
    layer is returned as the embedding layer.

    @param model: object exposing `layers`, a sequence whose items have an
        `output_shape` attribute (a shape tuple, or a list of shape tuples of
        which the first is used).

    @return: the last layer measured before the width first increases.

    @raise ValueError: if the width never increases, i.e. no bottleneck exists.
    """
    previous_neurons = float('inf')
    previous_index = None  # index of the last layer whose width was measured

    # Start by skipping the input layer
    for i, layer in enumerate(model.layers[1:], start=1):  # Skip index 0
        output_shape = layer.output_shape[0] if isinstance(layer.output_shape, list) else layer.output_shape

        # Ignore layers without a usable feature dimension; they must not be
        # mistaken for the bottleneck when the next layer turns out wider.
        if len(output_shape) <= 1 or output_shape[1] is None:
            continue
        num_neurons = output_shape[1]

        # Width grows again: the previously measured layer ended the encoder.
        if num_neurons > previous_neurons:
            return model.layers[previous_index]

        previous_neurons = num_neurons
        previous_index = i

    raise ValueError(
        "Could not locate an embedding layer: layer widths never increase after the input layer."
    )


def get_embeddings(autoencoder, input_df, scaler):
    """
    Encode every row of a DataFrame into the autoencoder's embedding space.

    @param autoencoder: model whose embedding layer is located with `find_embedding_layer`.
    @param input_df: DataFrame of features; passed to `scaler.transform` as-is.
    @param scaler: fitted transformer exposing `transform`.

    @return: DataFrame of embedding vectors, indexed like `input_df`.
    """
    scaled_rows = scaler.transform(input_df)

    # Sub-model that stops at the bottleneck and shares the autoencoder's weights
    bottleneck = find_embedding_layer(autoencoder)
    encoder_model = Model(inputs=autoencoder.input, outputs=bottleneck.output)

    # verbose=0 keeps the progress bar out of the notebook output
    latent = encoder_model.predict(scaled_rows, verbose=0)
    return pd.DataFrame(latent, index=input_df.index)


# Sanity check: embed three rows of the untouched data and show the inputs
# next to their embedding vectors. The sample is seeded so the displayed rows
# are the same on every Restart & Run All.
# NOTE: this rebinds `data`, which previously held the fetched dataset object.
data = original_df.sample(3, random_state=cfg.RANDOM_STATE)
embedding_vectors_df = get_embeddings(autoencoder, data, scaler)

display(data)
display(embedding_vectors_df)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import joblib

# Fit the Random-Forest-based IterativeImputer (or reuse the one on disk),
# then impute the rows that contain gaps.
if cfg.OVERWRITE_RF or not os.path.exists(cfg.RF_MODEL_PATH):

    print("Imputing missing values using IterativeImputer with Random Forest...")

    # Fit the imputer on X_train (complete cases only)
    rf_imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=10, random_state=cfg.RANDOM_STATE)
    rf_imputer.fit(X_train)  # Train only on non-missing data

    # Save the fitted IterativeImputer with RandomForest
    joblib.dump(rf_imputer, cfg.RF_MODEL_PATH)
    print("Imputer model saved to disk.")

else:
    print('Loading RF imputation model from disk')
    # joblib files are pickles: only load artefacts this notebook wrote itself.
    rf_imputer = joblib.load(cfg.RF_MODEL_PATH)

# The imputer only fills cells that are NaN. X_test cannot be used here: its
# gaps were replaced by NA_FILL before scaling, so it contains no NaN and the
# imputer would return it unchanged (the "imputed" values would just be the
# sentinel). Scale the raw test rows instead; StandardScaler keeps NaN in place.
X_test_with_nan = scaler.transform(test_df)
rf_imputed_scaled = rf_imputer.transform(X_test_with_nan)
rf_imputed = pd.DataFrame(scaler.inverse_transform(rf_imputed_scaled), columns=df.columns, index=test_df.index)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Score both imputers: for every cell that was blanked in the test rows,
# record |imputed - original| together with the feature and the model name.
imputed_by_model = {'Autoencoder': pred_df, 'Random Forest': rf_imputed}
abs_diff_records = []

for col in test_df.columns:
    # Row labels whose value in this column was replaced by NaN
    missing_indices = test_df.loc[test_df[col].isna()].index
    original_values = original_df.loc[missing_indices, col]

    # Same order as the legend: autoencoder first, then random forest
    for model_name, imputed_frame in imputed_by_model.items():
        errors = (imputed_frame.loc[missing_indices, col] - original_values).abs()
        abs_diff_records.extend(
            {'Feature': col, 'Model': model_name, 'Abs Difference': diff}
            for diff in errors.dropna()  # skip anything that could not be compared
        )

# One row per scored cell, ready for grouping and plotting
abs_diff_df = pd.DataFrame(abs_diff_records)

In [None]:
# --- Global look and feel ----------------------------------------------------
sns.set_theme(style="darkgrid")
sns.set_context("talk", font_scale=1.1)
plt.rcParams['font.family'] = 'Avenir'

# --- Figure constants --------------------------------------------------------
custom_palette = {"Autoencoder": "cornflowerblue", "Random Forest": "#ff7f0e"}  # one colour per model
golden_ratio = 1.6180339887
fontsize=18

# Narrower boxes, thicker outlines and smaller outlier markers than the defaults
box_style = dict(palette=custom_palette, width=0.5, linewidth=1.5, fliersize=3)

# --- Box plot: one pair of boxes (AE / RF) per feature -----------------------
plt.figure(figsize=(14, 14/golden_ratio))
sns.boxplot(data=abs_diff_df, x='Feature', y='Abs Difference', hue='Model', **box_style)

# --- Axes: errors span orders of magnitude, hence the log scale --------------
plt.yscale("log")
plt.xticks(rotation=45, fontsize=fontsize)
plt.yticks(color="black")

# --- Titles, labels, legend --------------------------------------------------
plt.title(
    "Absolute Differences Between Imputed and Original Values by Feature and Model",
    fontsize=fontsize*1.2,
    fontweight='bold',
    loc='left',
)
plt.xlabel("")
plt.ylabel("absolute difference (log scale)", color="black", fontsize=fontsize)
plt.legend(title="", fontsize=14, title_fontsize=12, facecolor='white', framealpha=0.8)

plt.tight_layout()
plt.show()

In [None]:
# Mean absolute error per (feature, model)
mean_abs_diff_df = abs_diff_df.groupby(['Feature', 'Model']).mean().reset_index()

# One row per feature, one column per model
pivot_df = mean_abs_diff_df.pivot(index='Feature', columns='Model', values='Abs Difference')

# Ratio > 1 means the Random Forest error is larger than the autoencoder's
pivot_df['RF/AE'] = pivot_df['Random Forest'] / pivot_df['Autoencoder']

# Keep only the ratio, largest first
ratio_df = pivot_df[['RF/AE']].reset_index()
ratio_df.columns.name = None  # drop the leftover 'Model' label from the pivot
ratio_df = ratio_df.sort_values('RF/AE', ascending=False)

display(ratio_df)

# supervised learning

Use the encoder to get the embedding vectors related to the training data.
Train two models, one with the original data and one with the embedding vectors: is there a difference in efficacy?

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def train_and_evaluate_rf(df, target, test_size=0.2, random_state=42, n_estimators=100):
    """
    Fit a Random Forest regressor on a random split of the data and score it on the hold-out part.

    @param df: DataFrame containing feature data.
    @param target: Series or array-like object containing the target variable.
    @param test_size: float, proportion of data held out for evaluation.
    @param random_state: int, seed used for both the split and the forest.
    @param n_estimators: int, number of trees in the forest.

    @return: tuple (mse, r2, rf_regressor) containing Mean Squared Error, R-squared, and trained model.
    """
    # Hold out a test partition; the same seed also drives the forest below
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        df, target, test_size=test_size, random_state=random_state
    )

    rf_regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_regressor.fit(features_fit, target_fit)

    # Score on rows the forest has never seen
    predictions = rf_regressor.predict(features_eval)
    return (
        mean_squared_error(target_eval, predictions),
        r2_score(target_eval, predictions),
        rf_regressor,
    )



In [None]:
n_estimators=100

# Identical split and forest settings for all three experiments
rf_eval_kwargs = dict(test_size=cfg.TEST_SIZE, random_state=cfg.RANDOM_STATE, n_estimators=n_estimators)

def _print_scores(heading, mse_value, r2_value):
    """Print one experiment's heading followed by its MSE and R2, rounded to 2 decimals."""
    print(heading)
    print(f"Mean Squared Error (MSE): {round(mse_value, 2)}")
    print(f"R-squared (R2): {round(r2_value, 2)}")

# Load the California housing data (this rebinds `data` and `df` to the clean, gap-free dataset)
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
target = data.target

# 1) regression on regular data
mse, r2, rf_regressor = train_and_evaluate_rf(df, target, **rf_eval_kwargs)
_print_scores("REGULAR DATA", mse, r2)

# 2) regression on the autoencoder embeddings of the same rows
ae_embed_df = get_embeddings(autoencoder, df, scaler)
mse_ae, r2_ae, rf_regressor_ae = train_and_evaluate_rf(ae_embed_df, target, **rf_eval_kwargs)
_print_scores("\nAUTOENCODED EMBEDDINGS", mse_ae, r2_ae)

# 3) regression on regular data augmented with the embeddings
aug_ae_df = pd.concat([df, ae_embed_df], axis=1)
aug_ae_df.columns = [str(label) for label in aug_ae_df.columns]  # embedding columns are ints; make all labels strings
mse_aug_ae, r2_aug_ae, rf_aug_regressor_ae = train_and_evaluate_rf(aug_ae_df, target, **rf_eval_kwargs)
_print_scores("\nREGULAR DATA AUGMENTED WITH AUTOENCODED EMBEDDINGS", mse_aug_ae, r2_aug_ae)

The results indicate that the Random Forest regression model trained on the original data performs significantly better than the model trained on the autoencoder embeddings. Here’s what we can infer:

1. Performance Metrics:
   - The original data model has a lower Mean Squared Error (MSE) of 0.255, indicating that its predictions are closer to the actual target values compared to the model using the autoencoder embeddings, which has a higher MSE of 0.454.
   - The R-squared (R2) score for the original data model is 0.805, showing that it explains about 80.5% of the variance in the target variable. The AE model has a lower R-squared score of 0.654, explaining only 65.4% of the variance.
2. Interpretation of Autoencoder Embeddings:
   - The embeddings generated by the autoencoder capture compressed representations of the original data. While these embeddings may retain important structural information, they are inherently a reduced form of the input features. This reduction can lead to a loss of fine-grained details that are critical for a regression task, causing the model to underperform compared to using the original data.
   - The lower R-squared score of the AE model suggests that the embeddings might not capture all relevant information necessary for the Random Forest model to make precise predictions.
3. Implications:
   - The original data is better suited for training the Random Forest model in this case, as it retains all feature details needed for accurate prediction.
   - The autoencoder’s embeddings might still be useful for dimensionality reduction, noise reduction, or other tasks, but for this regression problem, they seem to compromise model performance.
4. Potential Improvements:
   - Fine-tuning the architecture of the autoencoder, adjusting its latent space size, or training it to retain more critical information could improve the model’s efficacy.
   - Alternatively, using different embedding techniques or combining original data features with embeddings may yield better results.

In summary, while the autoencoder-based model captures some structural information, the reduced dimensionality results in a loss of detail, leading to reduced performance compared to the model trained on the full original dataset.