In [None]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, Concatenate, ReLU
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers.legacy import Adam  # Use legacy optimizer for M1/M2 Macs
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, LearningRateScheduler
from sklearn.impute import SimpleImputer
from tensorflow.keras.layers import Masking
import joblib
from types import SimpleNamespace
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.saving import register_keras_serializable

In [None]:
# Directory (relative to the working directory) that holds every persisted artefact.
MODEL_DIR = os.path.abspath(os.path.expanduser("models"))


def _model_file(filename):
    """Return the absolute path of `filename` inside MODEL_DIR."""
    return os.path.join(MODEL_DIR, filename)


# Notebook-wide settings, exposed as attributes (cfg.NAME).
cfg = SimpleNamespace(**{
    # Placeholder written into missing cells before scaling
    "NA_FILL": -1,
    # Autoencoder training hyper-parameters
    "AUTO_ENCODER_EPOCHS": 100,
    "AUTO_ENCODER_BATCH_SIZE": 32,
    "AUTO_ENCODER_LEARNING_RATE": 0.001,
    # Persistence locations and overwrite switches
    "MODEL_DIR": MODEL_DIR,
    "AUTOENCODER_MODEL_PATH": _model_file("autoencoder_model.keras"),
    "RF_MODEL_PATH": _model_file("rf_imputer.joblib"),
    "OVERWRITE_AUTOENCODER": False,
    "OVERWRITE_RF": False,
    # Logging, seeding and the fraction of cells blanked out
    "VERBOSE": 1,
    "RANDOM_STATE": 100,
    "RANDOM_SAMPLE_FRACTION": 0.2,
    "HISTORY_FILE_PATH": _model_file("autoencoder_training_history.json"),
})

# Make sure the artefact directory exists before anything tries to write to it.
os.makedirs(cfg.MODEL_DIR, exist_ok=True)

# Show the effective configuration as the cell output.
vars(cfg)

In [None]:
@register_keras_serializable()  # Register the custom layer
class NonNegativeOutputLayer(tf.keras.layers.Layer):
    """Apply ReLU to the leading output columns and pass the rest through.

    The first ``num_fixed_features`` columns of the input are clamped to be
    >= 0; the remaining columns (lat/lon in this notebook) are left untouched.

    Parameters
    ----------
    num_fixed_features : int, optional
        Number of leading columns to clamp. When omitted, the notebook-level
        global ``num_fixed_features`` is used, which keeps the existing
        ``NonNegativeOutputLayer()`` call sites and previously saved models
        (whose config has no such key) working.

    NOTE(review): the clamp acts on whatever representation the model emits.
    The notebook trains on StandardScaler output, so "non-negative" here means
    "not below the scaled value 0", not "non-negative in raw units" — confirm
    this is the intended constraint.
    """

    def __init__(self, num_fixed_features=None, **kwargs):
        super().__init__(**kwargs)
        if num_fixed_features is None:
            # Capture the notebook global now (if it exists) so that
            # get_config() stores a concrete value in the saved model.
            num_fixed_features = globals().get("num_fixed_features")
        self.num_fixed_features = (
            None if num_fixed_features is None else int(num_fixed_features)
        )

    def call(self, inputs):
        split = self.num_fixed_features
        if split is None:
            # Last chance: the global may have been defined after construction.
            split = globals().get("num_fixed_features")
        if split is None:
            raise ValueError(
                "NonNegativeOutputLayer needs num_fixed_features: pass it to the "
                "constructor or define the global before calling the layer."
            )
        fixed_outputs = tf.nn.relu(inputs[:, :split])  # Clamp the leading (fixed) columns at 0
        unrestricted_outputs = inputs[:, split:]  # Allow unrestricted values for lat/lon
        return tf.concat([fixed_outputs, unrestricted_outputs], axis=1)

    def get_config(self):
        # Persist the split index so a saved model can be reloaded without
        # relying on a notebook global being defined first.
        config = super().get_config()
        config.update({"num_fixed_features": self.num_fixed_features})
        return config

In [None]:
# Data preparation, autoencoder definition, training (or loading) and
# autoencoder-based imputation of the rows that contain missing values.

# Load dataset and simulate missing values
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Save original values for comparison
original_df = df.copy()

# Seed NumPy (missing-value mask) and TensorFlow's global generator so that a
# Restart & Run All starts from the same random state each time.
np.random.seed(cfg.RANDOM_STATE)
tf.random.set_seed(cfg.RANDOM_STATE)

# Introduce NaNs randomly, excluding Longitude and Latitude
protected_cols = [df.columns.get_loc(name) for name in ('Longitude', 'Latitude')]
nan_mask = np.random.rand(*df.shape) < cfg.RANDOM_SAMPLE_FRACTION
nan_mask[:, protected_cols] = False
df[nan_mask] = np.nan

# Separate rows with and without missing values
train_df = df.dropna()
test_df = df[df.isna().any(axis=1)]

# Normalize and handle missing values (missing cells carry the NA_FILL placeholder)
df_filled = df.fillna(cfg.NA_FILL)
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_filled), columns=df.columns)

# Scale train and test sets individually for model input
X_train = scaler.transform(train_df.fillna(cfg.NA_FILL))
X_test = scaler.transform(test_df.fillna(cfg.NA_FILL))

# --- Denoising setup -------------------------------------------------------
# At imputation time the network receives rows in which missing cells hold the
# scaled placeholder. Training on clean rows as an identity map never shows it
# that situation, and validating against placeholder-filled targets rewards
# reproducing the placeholder. Instead: hide random cells of complete rows the
# same way the test rows are hidden, and ask the network for the clean row.
AE_VALIDATION_FRACTION = 0.2

# Scaled value of the placeholder for every column, shape (1, n_features).
sentinel_scaled = scaler.transform(
    pd.DataFrame([[cfg.NA_FILL] * df.shape[1]], columns=df.columns)
)


def corrupt_with_sentinel(X_clean, rng):
    """Return a copy of X_clean with random cells replaced by the scaled placeholder.

    Cells are hidden with probability cfg.RANDOM_SAMPLE_FRACTION; the
    Longitude/Latitude columns are never hidden, mirroring `nan_mask`.
    """
    hide = rng.random(X_clean.shape) < cfg.RANDOM_SAMPLE_FRACTION
    hide[:, protected_cols] = False
    return np.where(hide, sentinel_scaled, X_clean)


# Hold out part of the complete rows for validation so that early stopping is
# driven by reconstruction of known values, not by the rows evaluated later.
X_ae_fit, X_ae_val = train_test_split(
    X_train, test_size=AE_VALIDATION_FRACTION, random_state=cfg.RANDOM_STATE
)
corruption_rng = np.random.default_rng(cfg.RANDOM_STATE)
X_ae_fit_in = corrupt_with_sentinel(X_ae_fit, corruption_rng)
X_ae_val_in = corrupt_with_sentinel(X_ae_val, corruption_rng)

# Define fixed and unrestricted features
fixed_features = [col for col in df.columns if col not in ["Longitude", "Latitude"]]
num_fixed_features = len(fixed_features)

# Learning rate warm-up: ramp linearly from `initial_lr` to the configured
# cfg.AUTO_ENCODER_LEARNING_RATE over the first WARMUP_EPOCHS epochs, then hand
# control to ReduceLROnPlateau by returning the current rate unchanged.
WARMUP_EPOCHS = 5
initial_lr = 1e-5
def warmup(epoch, lr):
    if epoch < WARMUP_EPOCHS:
        target_lr = cfg.AUTO_ENCODER_LEARNING_RATE
        return initial_lr + (target_lr - initial_lr) * (epoch + 1) / WARMUP_EPOCHS
    return lr


def dense_block(tensor, units):
    """Dense(relu) -> LayerNormalization -> Dropout(0.2), applied to `tensor`."""
    tensor = Dense(units, activation="relu")(tensor)
    tensor = LayerNormalization()(tensor)
    return Dropout(0.2)(tensor)


# Define the autoencoder architecture
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))

# Encoder: 256 -> 128 -> 64 blocks followed by a 32-unit code layer
encoded = input_layer
for units in (256, 128, 64):
    encoded = dense_block(encoded, units)
encoded = Dense(32, activation="relu")(encoded)

# Decoder: mirror of the encoder
decoded = encoded
for units in (64, 128, 256):
    decoded = dense_block(decoded, units)

# Final output with custom non-negative layer
# NOTE(review): the targets are standardized, so the ReLU inside
# NonNegativeOutputLayer stops the fixed features from going below the scaled
# value 0 (the scaler mean in raw units), not below raw 0 — confirm intent.
output_layer = Dense(input_dim)(decoded)
output_layer = NonNegativeOutputLayer()(output_layer)  # Apply custom layer to enforce constraints

# Model definition
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=initial_lr), loss="mse")

# Callbacks
history_file_path = cfg.HISTORY_FILE_PATH
lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=1e-6)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_warmup = LearningRateScheduler(warmup, verbose=1)

# Training with saving and loading history.
# A model already on disk is reused as-is; set cfg.OVERWRITE_AUTOENCODER=True
# to retrain after changing the data or the training setup.
if cfg.OVERWRITE_AUTOENCODER or not os.path.exists(cfg.AUTOENCODER_MODEL_PATH):
    print("Training autoencoder with enhanced configuration...")
    history = autoencoder.fit(
        X_ae_fit_in, X_ae_fit,  # corrupted input -> clean target
        epochs=cfg.AUTO_ENCODER_EPOCHS,
        batch_size=cfg.AUTO_ENCODER_BATCH_SIZE,
        validation_data=(X_ae_val_in, X_ae_val),
        verbose=cfg.VERBOSE,
        callbacks=[lr_warmup, lr_reduction, early_stopping]
    )
    autoencoder.save(cfg.AUTOENCODER_MODEL_PATH)

    # Save training history to disk (cast to float so the values are JSON-serializable)
    with open(history_file_path, 'w') as f:
        json.dump({k: [float(v) for v in values] for k, values in history.history.items()}, f)
    history_data = history.history  # Save for plotting
else:
    print("Loading pre-trained autoencoder...")
    autoencoder = load_model(
        cfg.AUTOENCODER_MODEL_PATH,
        custom_objects={"NonNegativeOutputLayer": NonNegativeOutputLayer},
        compile=False
    )
    autoencoder.compile(optimizer=Adam(learning_rate=initial_lr), loss="mse")
    # The history file is optional: a model copied in without it must not
    # abort the notebook, it only means there is nothing to plot.
    if os.path.exists(history_file_path):
        with open(history_file_path, 'r') as f:
            history_data = json.load(f)
    else:
        history_data = None
        print(f"No training history found at {history_file_path}; skipping the loss plot.")

# Plot training progress
if history_data:
    plt.figure(figsize=(10, 6))
    plt.plot(history_data['loss'], label='Training Loss')
    plt.plot(history_data['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.title('Autoencoder Training and Validation Loss Over Epochs')
    plt.yscale("log")
    plt.legend()
    plt.grid(True)
    plt.show()

# Impute missing values using the autoencoder
print("Imputing missing values using the autoencoder...")
X_test_pred = autoencoder.predict(X_test)
pred_df = pd.DataFrame(scaler.inverse_transform(X_test_pred), columns=df.columns, index=test_df.index)

In [None]:
# Preview the autoencoder reconstructions; a bare last expression gives the
# rich table rendering and .head() keeps the output to a bounded slice.
pred_df.head(10)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

print("Imputing missing values using IterativeImputer with Random Forest...")

# Fit the imputer on X_train (complete cases only). Both the imputer and the
# forest are seeded so repeated runs give the same imputations.
rf_imputer = IterativeImputer(
    estimator=RandomForestRegressor(random_state=cfg.RANDOM_STATE),
    max_iter=10,
    random_state=cfg.RANDOM_STATE,
)
rf_imputer.fit(X_train)  # Train only on non-missing data

# IterativeImputer only fills cells that are NaN. X_test has its gaps replaced
# by the NA_FILL placeholder, so transforming it would return the placeholders
# untouched. Scale the NaN-bearing rows instead; StandardScaler keeps NaNs as NaN.
X_test_with_nan = scaler.transform(test_df)

# Use the trained imputer to fill the missing cells, then map back to raw units
rf_imputed_scaled = rf_imputer.transform(X_test_with_nan)
rf_imputed = pd.DataFrame(scaler.inverse_transform(rf_imputed_scaled), columns=df.columns, index=test_df.index)

In [None]:
import joblib

# Persist the fitted random-forest IterativeImputer next to the notebook.
# The load cell reads the same relative file name, so keep the two in sync.
imputer_file = 'rf_imputer_model.joblib'
joblib.dump(rf_imputer, imputer_file)
print("Imputer model saved to disk.")

In [None]:
# Load the IterativeImputer from disk
# NOTE(review): joblib files are pickles — only load files this notebook wrote.
rf_imputer_loaded = joblib.load('rf_imputer_model.joblib')
print("Imputer model loaded from disk.")

# The imputer only fills NaN cells, so feed it the scaled rows with their NaNs
# intact (StandardScaler passes NaN through). X_test has placeholders instead
# of NaN and would come back unchanged.
new_imputed_data = rf_imputer_loaded.transform(scaler.transform(test_df))
new_imputed_data

In [None]:
# Dump each frame under its label: rows with gaps, ground truth,
# autoencoder output and random-forest output.
for heading, frame in (
    ('test_df:', test_df),
    ('original_df:', original_df),
    ('pred_df:', pred_df),
    ('rf_imputed:', rf_imputed),
):
    print(heading)
    print(frame)
    print()

In [None]:
# Feature to inspect. It has to be set here: on a fresh kernel no earlier cell
# defines `col`.
col = 'MedInc'

# Index labels of the rows whose `col` value is missing. Select with the
# boolean mask; `.isna().index` would return every row label, missing or not.
missing_indices = test_df.index[test_df[col].isna()]

missing_indices

In [None]:
# Report the dimensions of every frame involved in the comparison.
for frame_name, frame in {
    'test_df': test_df,
    'original_df': original_df,
    'pred_df': pred_df,
    'rf_imputed': rf_imputed,
}.items():
    print(f'{frame_name}.shape = {frame.shape}')

In [None]:
# `col` must be defined in this cell for a fresh-kernel run; no earlier cell sets it.
col = 'MedInc'

# Row labels of test_df where `col` is missing
test_df.loc[test_df[col].isna(), col].index

In [None]:
# Feature under inspection
col = 'MedInc'

# Labels of the rows in which this feature was blanked out
missing_indices = test_df.index[test_df[col].isna()]
print(len(missing_indices))

# Ground-truth values at exactly those positions
original_values = original_df.loc[missing_indices, col]



In [None]:
# Autoencoder values at the positions where `col` was blanked out. The original
# line was an unterminated `pred_df.loc[`, a SyntaxError that stops Run All.
pred_df.loc[missing_indices, col]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Step 7: absolute error of each imputation method at the blanked-out cells.
# Output frames to score, keyed by the label used in the plots.
imputed_frames = {'Autoencoder': pred_df, 'Random Forest': rf_imputed}

abs_diff_records = []
for col in test_df.columns:
    # Rows of the test set in which this feature was missing
    missing_indices = test_df.index[test_df[col].isna()]

    # Ground truth at those rows
    original_values = original_df.loc[missing_indices, col]

    # One record per imputed cell, autoencoder first, then random forest
    for model_name, imputed_frame in imputed_frames.items():
        abs_errors = (imputed_frame.loc[missing_indices, col] - original_values).abs().dropna()
        abs_diff_records.extend(
            {'Feature': col, 'Model': model_name, 'Abs Difference': err}
            for err in abs_errors
        )

# Long-format table used for analysis and plotting
abs_diff_df = pd.DataFrame(abs_diff_records)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Global look: dark grid, large "talk" context, Avenir font, one colour per model
sns.set_theme(style="darkgrid")
sns.set_context("talk", font_scale=1.1)
plt.rcParams['font.family'] = 'Avenir'
custom_palette = {"Autoencoder": "cornflowerblue", "Random Forest": "#ff7f0e"}

# Figure sized to the golden ratio, drawn through an explicit Axes handle
golden_ratio = 1.6180339887
fontsize = 18
fig, ax = plt.subplots(figsize=(14, 14/golden_ratio))

boxplot_style = dict(
    width=0.5,      # narrower boxes
    linewidth=1.5,  # thicker lines for contrast
    fliersize=3,    # smaller outlier markers
)
sns.boxplot(
    data=abs_diff_df,
    x='Feature',
    y='Abs Difference',
    hue='Model',
    palette=custom_palette,
    ax=ax,
    **boxplot_style,
)

# Errors span several orders of magnitude, hence the log y-axis
ax.set_yscale("log")
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=fontsize)
plt.setp(ax.get_yticklabels(), color="black")

# Title, axis labels and legend
ax.set_title(
    "Absolute Differences Between Imputed and Original Values by Feature and Model",
    fontsize=fontsize*1.2,
    fontweight='bold',
    loc='left',
)
ax.set_xlabel("")
ax.set_ylabel("absolute difference (log scale)", color="black", fontsize=fontsize)
ax.legend(title="", fontsize=14, title_fontsize=12, facecolor='white', framealpha=0.8)

# Render
fig.tight_layout()
plt.show()

In [None]:
# Install the `graphviz` and `pydot` packages into the kernel's environment.
# NOTE(review): presumably these are what `plot_model` in the next cell needs;
# the pip `graphviz` package is a Python wrapper, so the Graphviz `dot`
# executable may still require a system-level install — confirm. The versions
# are unpinned; consider pinning them for reproducibility.
%pip install graphviz pydot

In [None]:
from tensorflow.keras.utils import plot_model

# Write the layer diagram to model_plot.png (with shapes and layer names) and
# show it as the cell output.
plot_model(
    autoencoder,
    to_file='model_plot.png',
    show_shapes=True,
    show_layer_names=True,
    dpi=96,
)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Small undirected demo graph drawn with custom styling.

# Initialize the graph
G = nx.Graph()

# Add nodes and edges
nodes = ["A", "B", "C", "D", "E"]
edges = [("A", "B"), ("A", "C"), ("B", "C"), ("C", "D"), ("D", "E")]
G.add_nodes_from(nodes)
G.add_edges_from(edges)

# Define position layout (spring layout, seeded so the picture is reproducible)
pos = nx.spring_layout(G, seed=42)

# Styling options for nodes and edges
node_color = 'cornflowerblue'
node_size = 1000
edge_color = 'gray'
edge_width = 1.5
font_size = 12
font_color = 'black'
font_family = 'Avenir'

# Draw the network graph with custom styling
plt.figure(figsize=(10, 7))
nx.draw_networkx(
    G, pos,
    # Undirected graphs are drawn with a LineCollection by default, which
    # ignores `connectionstyle`. arrows=True switches to FancyArrowPatch edges
    # (no arrowheads on an undirected graph), so the curve is actually applied.
    arrows=True,
    node_color=node_color,
    node_size=node_size,
    edge_color=edge_color,
    width=edge_width,
    font_size=font_size,
    font_color=font_color,
    font_family=font_family,
    with_labels=True,
    connectionstyle='arc3,rad=0.1'  # Adds a slight curve to edges for visual appeal
)

# Additional styling options
plt.title("Professional Network Graph", loc='left', fontsize=16, fontweight='bold')
plt.axis('off')  # Turn off the axis for a cleaner look
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

def plot_autoencoder_structure(model, seed=42):
    """Draw the layers of `model` as a chain of labelled nodes.

    Each node shows the layer name and, on a second line, its output shape.
    Layers are linked in the order of ``model.layers``, so the drawing is only
    faithful for a purely sequential topology.

    Parameters
    ----------
    model : object exposing ``.layers`` (a Keras model)
        Every layer must provide ``name`` and an output shape.
    seed : int, optional
        Seed for the spring layout so repeated runs give the same picture.
    """
    def _node_label(layer):
        # Older Keras exposes `layer.output_shape`; newer releases dropped it,
        # in which case the shape is read from the layer's output tensor.
        shape = getattr(layer, "output_shape", None)
        if shape is None:
            shape = tuple(layer.output.shape)
        return f"{layer.name}\n{shape}"

    # Build every label once, then link consecutive layers.
    labels = [_node_label(layer) for layer in model.layers]
    graph = nx.DiGraph()
    nx.add_path(graph, labels)

    # Draw the graph
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(graph, seed=seed)
    nx.draw(graph, pos, with_labels=True, node_size=3000, node_color="skyblue", font_size=10, font_weight="bold")
    plt.title("Autoencoder Architecture")
    plt.show()

# Draw the layer chain of the `autoencoder` model built (or loaded) earlier in the notebook
plot_autoencoder_structure(autoencoder)