In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import GradScaler, autocast
from torch.nn import MSELoss
from torch import nn, device, save
from torch.optim import Adam, AdamW
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import os
import timeit
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from typing import Optional

def prepare_and_save_data(data, dep_var, drop_columns, test_size=0.15, random_state=42):
    """
    Split a DataFrame into train/validation/test sets and return them as tensors.

    The test set is carved off first; the remainder is then split again into
    training and validation sets using the same ``test_size`` fraction and the
    same ``random_state``. Features are standardised with a ``StandardScaler``
    fitted on the training split only. The target is not scaled.

    Note: nothing is written to disk here; the function only returns tensors.

    Parameters:
        data (pandas DataFrame): Dataframe containing features and target.
        dep_var (str): Name of the target column.
        drop_columns (list): Columns removed before splitting (may be empty/None).
        test_size (float): Fraction used for both the test split and the
            subsequent validation split.
        random_state (int): Seed forwarded to both ``train_test_split`` calls.

    Returns:
        tuple of torch.Tensor: ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``,
        all of dtype ``torch.float``.
    """
    frame = data.drop(columns=drop_columns) if drop_columns else data

    features = frame.drop(columns=[dep_var])
    target = frame[dep_var]

    # Hold out the test set, then divide what is left into train and validation.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on training rows only so no statistics leak from valid/test.
    fitted_scaler = StandardScaler().fit(X_train)

    def _scaled_features(part):
        return torch.tensor(fitted_scaler.transform(part), dtype=torch.float)

    def _target_values(part):
        return torch.tensor(part.values, dtype=torch.float)

    return (
        _scaled_features(X_train),
        _target_values(y_train),
        _scaled_features(X_valid),
        _target_values(y_valid),
        _scaled_features(X_test),
        _target_values(y_test),
    )

class ANNModel(nn.Module):
    """
    Fully connected feed-forward regressor with a single linear output unit.

    Architecture (built in ``__init__``):
      * ``Linear(n_features, neurons_per_layer)`` + activation
      * ``num_hidden_layers - 1`` repetitions of
        ``Linear(neurons_per_layer, neurons_per_layer)`` + activation + ``Dropout``
      * ``Linear(neurons_per_layer, 1)`` with no activation

    Parameters:
        n_features (int): Number of input features.
        num_hidden_layers (int): Number of hidden Linear layers.
        neurons_per_layer (int): Width of every hidden layer.
        activation_function (nn.Module): Activation module placed after each
            hidden Linear layer. The same module instance is reused at every
            position (and the default instance is shared between models).
        dropout_rate (float): Dropout probability used after the 2nd+ hidden layers.
        l1_regularization (float): Coefficient of the L1 penalty.
        l2_regularization (float): Coefficient of the squared-L2 penalty.
    """
    def __init__(self, n_features, num_hidden_layers=2, neurons_per_layer=64, activation_function=torch.nn.ReLU(), 
                 dropout_rate=0.1, l1_regularization=0, l2_regularization=0):
        super(ANNModel, self).__init__()

        self.activation_function = activation_function
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization

        # Input block: no dropout directly after the first hidden layer.
        layers = [nn.Linear(n_features, neurons_per_layer), activation_function]
        for _ in range(num_hidden_layers - 1):
            layers += [
                nn.Linear(neurons_per_layer, neurons_per_layer),
                activation_function,
                nn.Dropout(dropout_rate)
            ]

        # Output head: one unit, no activation (regression).
        layers.append(nn.Linear(neurons_per_layer, 1))

        self.layers = nn.Sequential(*layers)

        self.init_weights()

    def init_weights(self):
        """Initialise every Linear layer: Xavier-uniform weights, zero biases."""
        for layer in self.layers:
            if isinstance(layer, nn.Linear):
                # Draw weights from the Xavier/Glorot uniform distribution.
                nn.init.xavier_uniform_(layer.weight)
                # Start all biases at zero.
                nn.init.zeros_(layer.bias)

    def regularization_loss(self):
        """
        Return ``l1 * sum(|p|) + l2 * sum(p**2)`` over all model parameters.

        Both weights and biases are included because the sum runs over
        ``self.parameters()``.
        """
        l1_loss = 0  # running sum of the L1 norms of all parameters
        l2_loss = 0  # running sum of the squared L2 norms of all parameters

        for param in self.parameters():
            l1_loss += torch.norm(param, 1)
            l2_loss += torch.norm(param, 2) ** 2

        return self.l1_regularization * l1_loss + self.l2_regularization * l2_loss

    def forward(self, x):
        """
        Run ``x`` through the layer stack and return the raw output of the head.

        The activation modules are already part of ``self.layers``, so each one
        is applied exactly once. (Applying ``self.activation_function`` again
        after every activation layer, as an earlier version did, squares the
        negative slope of LeakyReLU and distorts Tanh/Sigmoid/ELU outputs.)
        """
        return self.layers(x)

def train_ANN_model(model, X_train, y_train, X_valid, y_valid, n_epochs, optimizer=torch.optim.Adam, criterion=torch.nn.MSELoss, batch_size=32, learning_rate=1e-3, 
                    patience=10, min_delta=0.0001, max_norm=1.0, num_workers=0, pin_memory=False, validation_frequency=1, load_directory=None, save_directory=None, trial=1):

    """
    Train a model with mini-batch gradient descent, gradient clipping, early stopping and checkpointing.

    Parameters:
    - model: Network to train. Must expose ``regularization_loss()``; its value is added to the batch loss.
    - X_train, y_train: Training features and targets (torch tensors).
    - X_valid, y_valid: Validation features and targets (torch tensors).
    - n_epochs: Maximum number of epochs.
    - optimizer: Optimizer class, instantiated as ``optimizer(model.parameters(), lr=learning_rate)``.
    - criterion: Loss class, instantiated with no arguments.
    - batch_size: Batch size for both loaders (default: 32).
    - learning_rate: Learning rate passed to the optimizer (default: 1e-3).
    - patience: Number of consecutive validation cycles without improvement before stopping (default: 10).
    - min_delta: Validation loss must drop by more than this to count as an improvement (default: 0.0001).
    - max_norm: Maximum gradient norm used for clipping (default: 1.0).
    - num_workers, pin_memory: Forwarded to both DataLoaders. When either is enabled the
      datasets are kept on the CPU and each batch is moved to the compute device.
    - validation_frequency: Validate every this many epochs; must be >= 1 (default: 1).
    - load_directory: Directory the best model is written to whenever ``save_directory`` is set.
      If it is None, ``save_directory`` is used for the checkpoint instead.
    - save_directory: Enables checkpointing when truthy (default: None).
    - trial: Identifier used in the checkpoint file name ``ANN_model_trial_{trial}.pt`` (default: 1).

    Returns:
    A dictionary containing:
    - 'train_losses': Average training loss (including the regularization term) per epoch.
    - 'val_losses': Average validation loss per validation cycle.
    - 'best_val_loss': Lowest validation loss that counted as an improvement.
    - 'early_stopping_counter': Validation cycles since the last improvement.

    Raises:
    - ValueError: If ``validation_frequency`` is smaller than 1.
    """
    if validation_frequency < 1:
        raise ValueError("validation_frequency must be a positive integer.")

    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(run_device)

    # Mixed precision only makes sense on CUDA; disabling it explicitly on CPU
    # avoids the "CUDA is not available" warnings while behaving the same.
    use_amp = run_device.type == 'cuda'

    # CUDA tensors cannot be pinned or safely shared with worker processes, so
    # only pre-load the data onto the compute device in the simple loader setup.
    preload_on_device = num_workers == 0 and not pin_memory
    data_device = run_device if preload_on_device else torch.device('cpu')

    def _as_float(tensor):
        # Work on a detached float copy so the caller's tensors are never modified.
        return tensor.clone().detach().to(data_device).float()

    train_loader = DataLoader(TensorDataset(_as_float(X_train), _as_float(y_train)), batch_size=batch_size,
                              shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
    valid_loader = DataLoader(TensorDataset(_as_float(X_valid), _as_float(y_valid)), batch_size=batch_size,
                              shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    optimizer = optimizer(model.parameters(), lr=learning_rate)
    criterion = criterion().to(run_device)
    scaler = GradScaler(enabled=use_amp)

    best_val_loss = float('inf')
    early_stopping_counter = 0

    train_losses = []
    val_losses = []

    for epoch in range(n_epochs):
        model.train()
        total_train_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(run_device, non_blocking=pin_memory)
            y_batch = y_batch.to(run_device, non_blocking=pin_memory)

            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                y_pred = model(X_batch)
                # Targets are reshaped to the prediction shape to avoid silent broadcasting.
                loss = criterion(y_pred, y_batch.view_as(y_pred))
                loss += model.regularization_loss()
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)  # gradients must be unscaled before clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            scaler.step(optimizer)
            scaler.update()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation (and early stopping) only run on every validation_frequency-th epoch.
        if epoch % validation_frequency == 0:
            model.eval()
            total_val_loss = 0
            with torch.no_grad():
                for X_batch, y_batch in valid_loader:
                    X_batch = X_batch.to(run_device, non_blocking=pin_memory)
                    y_batch = y_batch.to(run_device, non_blocking=pin_memory)
                    y_pred = model(X_batch)
                    loss = criterion(y_pred, y_batch.view_as(y_pred))
                    total_val_loss += loss.item()

            avg_val_loss = total_val_loss / len(valid_loader)
            val_losses.append(avg_val_loss)

            print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.5f}, Val Loss = {avg_val_loss:.5f}")

            if avg_val_loss < best_val_loss - min_delta:
                best_val_loss = avg_val_loss
                early_stopping_counter = 0
                if save_directory:
                    # Checkpoints go to load_directory (where the evaluation step reads
                    # them); fall back to save_directory instead of crashing on None.
                    model_directory = load_directory or save_directory
                    os.makedirs(model_directory, exist_ok=True)
                    save_path = os.path.join(model_directory, f"ANN_model_trial_{trial}.pt")
                    torch.save(model, save_path)
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= patience:
                    print(f"Early stopping at epoch {epoch} due to no improvement in validation loss.")
                    break

    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "best_val_loss": best_val_loss,
        "early_stopping_counter": early_stopping_counter
    }

def plot_results(train_losses, val_losses, trial, save_directory=None):
    """
    Draw the training and validation loss curves of one trial and optionally save the figure.

    Parameters:
    - train_losses (list of float): Average training loss per epoch.
    - val_losses (list of float): Average validation loss per validation cycle.
    - trial (int): Zero-based trial index; ``trial + 1`` appears in the title and file name.
    - save_directory (str, optional): If given, the figure is written there as
      ``loss_plot_trial_{trial+1}.png`` (the directory is created when missing).

    Both curves are plotted against their list index, then the figure is shown.
    """
    plt.figure(figsize=(10, 6))

    # One line per loss series, same width for both.
    for loss_values, curve_label in ((train_losses, 'Train Loss'), (val_losses, 'Valid Loss')):
        plt.plot(loss_values, label=curve_label, linewidth=2)

    plt.xlabel('Epoch', fontsize=14)
    plt.ylabel('Loss', fontsize=14)
    plt.legend(fontsize=14)
    plt.title(f'Train and Valid Losses (Trial {trial+1})', fontsize=16)
    plt.grid(True)

    if save_directory:
        os.makedirs(save_directory, exist_ok=True)
        save_path = os.path.join(save_directory, f"loss_plot_trial_{trial+1}.png")
        plt.savefig(save_path)
        print(f"Plot saved to {save_path}")

    plt.show()

import os
import timeit
import random
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

def model_save_ANN(X_train, y_train, X_valid, y_valid, n_trials=1, load_directory=None, save_directory=None, plot_loss=True):
    
    """
    Run a random hyperparameter search: train one ANN per trial and record its settings and final losses.

    Parameters:
    - X_train, y_train: Training data and labels.
    - X_valid, y_valid: Validation data and labels.
    - n_trials (int): Number of trials; each draws a fresh random hyperparameter set.
    - load_directory (str, optional): Forwarded unchanged to ``train_ANN_model``.
    - save_directory (str, optional): Directory for the results CSV and plots; also forwarded
      to ``train_ANN_model``. Created if it does not exist.
    - plot_loss (bool): If True, plots the loss curves after each trial.

    Returns:
    - pandas.DataFrame with one row per trial if ``save_directory`` is given,
      otherwise the underlying list of dictionaries. With ``n_trials == 0`` the
      result is an empty DataFrame / empty list.

    Hyperparameters are drawn from Python's ``random`` module, so call
    ``random.seed(...)`` beforehand for a reproducible search.
    """
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)

    all_results_params = []

    for trial in range(n_trials):
        print(f"Trial {trial + 1} of {n_trials}")

        start = timeit.default_timer()

        # Randomly generate hyperparameters (draw order is kept stable so that a
        # seeded `random` reproduces earlier searches).
        num_hidden_layers = random.choice(range(2, 16))
        neurons_per_layer = random.choice(range(16, 128))
        dropout_rate = random.choice([0.1, 0.2, 0.3])
        batch_size = random.choice(range(16, 128))
        learning_rate = 0.0001 * random.choice(range(1, 16))
        activation_function = random.choice([torch.nn.ReLU(), torch.nn.LeakyReLU(negative_slope=0.01)])
        optimizer = random.choice([torch.optim.Adam, torch.optim.AdamW])
        criterion = random.choice([torch.nn.SmoothL1Loss])
        n_epochs = random.choice(range(100, 201))
        patience = random.choice(range(5, 11))
        # range(1, 2) contains only 1, so the next two values are effectively constants.
        min_delta = 0.0001 * random.choice(range(1, 2))
        l1_regularization = 0.00000001 * random.choice(range(1, 2))
        l2_regularization = l1_regularization

        model = ANNModel(n_features=X_train.shape[1], num_hidden_layers=num_hidden_layers,
                         neurons_per_layer=neurons_per_layer, activation_function=activation_function,
                         dropout_rate=dropout_rate, l1_regularization=l1_regularization, l2_regularization = l2_regularization)

        # NOTE: the zero-based `trial` is forwarded here, whereas plots and the
        # results table below use `trial + 1`.
        training_results = train_ANN_model(model, X_train, y_train, X_valid, y_valid, n_epochs, 
                                           batch_size=batch_size, learning_rate=learning_rate,
                                           patience=patience, min_delta=min_delta, optimizer=optimizer, criterion=criterion, load_directory=load_directory,
                                           save_directory=save_directory, trial=trial)

        train_losses, val_losses = training_results['train_losses'], training_results['val_losses']
        
        if plot_loss:
            plot_results(train_losses, val_losses, trial, save_directory=save_directory)

        params = {
            "num_hidden_layers": num_hidden_layers,
            "neurons_per_layer": neurons_per_layer,
            "dropout_rate": dropout_rate,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "activation_function": activation_function.__class__.__name__,
            "optimizer": optimizer.__name__,
            "criterion": criterion.__name__,
            "n_epochs": n_epochs,
            "patience": patience,
            "min_delta": min_delta,
            "l1_regularization": l1_regularization,
            "l2_regularization": l2_regularization
        }

        # The recorded losses are those of the last epoch / validation cycle,
        # not necessarily the best ones.
        results_params = {**params, "trial": trial + 1, "train_loss": train_losses[-1], "val_loss": val_losses[-1]}
        all_results_params.append(results_params)

        # Rewrite the CSV after every trial so partial results survive an interruption.
        if save_directory:
            pd.DataFrame(all_results_params).to_csv(os.path.join(save_directory, "all_results_params.csv"), index=False)

        end = timeit.default_timer()
        print(f"Execution Time for Trial {trial + 1}: {end - start} seconds")

    # Build the frame here rather than relying on a loop-local variable, which
    # would be undefined when no trial ran.
    return pd.DataFrame(all_results_params) if save_directory else all_results_params

import os
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

def evaluate_models_and_save_results(X_test, y_test, load_directory=None, save_directory=None, results_file_name='evaluation_results.csv'):
    """
    Load every saved model in a directory, score it on the test set and collect the metrics.

    Parameters:
    - X_test (torch.Tensor): Test features.
    - y_test (torch.Tensor): Test targets.
    - load_directory (str): Directory containing the saved models. Only regular files ending in
      ``.pt`` or ``.pth`` are loaded; anything else (CSVs, plots, sub-directories) is ignored.
    - save_directory (str, optional): Directory for the results CSV. If None, nothing is written.
    - results_file_name (str): Name of the CSV file (default is 'evaluation_results.csv').

    Returns:
    - results_df (pandas.DataFrame): Columns ``Model``, ``MAE``, ``MSE``, ``R2``; one row per model,
      ordered by file name. The columns are present even when no model was found.

    Raises:
    - ValueError: If ``load_directory`` is None.
    """
    if load_directory is None:
        # os.listdir(None) would silently scan the current working directory.
        raise ValueError("load_directory must be provided to evaluate saved models.")

    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The test data does not change between models, so prepare it once.
    X_test_tensor = X_test.float().to(run_device)
    y_true = y_test.float().cpu().numpy().reshape(-1)

    model_files = sorted(
        name for name in os.listdir(load_directory)
        if name.endswith(('.pt', '.pth')) and os.path.isfile(os.path.join(load_directory, name))
    )

    results = []
    for file_name in model_files:
        model_path = os.path.join(load_directory, file_name)

        # SECURITY: these files hold fully pickled modules, so loading executes
        # arbitrary code from the file - only point this at directories you trust.
        # Newer PyTorch releases default to weights_only=True, which rejects
        # pickled modules; older releases do not know the argument at all.
        try:
            loaded_model = torch.load(model_path, map_location=run_device, weights_only=False)
        except TypeError:
            loaded_model = torch.load(model_path, map_location=run_device)
        loaded_model.to(run_device)
        loaded_model.eval()

        with torch.no_grad():
            y_pred_tensor = loaded_model(X_test_tensor)

        # Flatten predictions so they line up one-to-one with the flattened targets.
        y_pred = y_pred_tensor.cpu().numpy().reshape(-1)

        results.append({
            'Model': file_name,
            'MAE': mean_absolute_error(y_true, y_pred),
            'MSE': mean_squared_error(y_true, y_pred),
            'R2': r2_score(y_true, y_pred),
        })

    results_df = pd.DataFrame(results, columns=['Model', 'MAE', 'MSE', 'R2'])

    if save_directory:
        os.makedirs(save_directory, exist_ok=True)
        results_path = os.path.join(save_directory, results_file_name)
        results_df.to_csv(results_path, index=False)
        print(f"Results saved to {results_path}")

    return results_df


import os
import pandas as pd

def execute_and_evaluate_ANN_models(data, dep_var, drop_columns, n_trials, top_k_models, save_directory, load_directory):
    """
    End-to-end pipeline: prepare the data, run a random hyperparameter search, evaluate and rank the models.

    Parameters:
    - data (pandas.DataFrame): The dataset containing features and the target variable.
    - dep_var (str): The name of the target variable in the dataset.
    - drop_columns (list of str): Columns dropped from the dataset before training.
    - n_trials (int): Number of training trials with randomly selected hyperparameters.
    - top_k_models (int): Number of best models (lowest MAE) to keep.
    - save_directory (str): Directory for training results, plots and evaluation CSVs.
    - load_directory (str): Directory passed to the training and evaluation steps as the
      model location. Every model file found there is evaluated, including files left
      over from earlier runs.

    Returns:
    - top_models_df (pandas.DataFrame): Evaluation rows of the ``top_k_models`` models with the
      lowest MAE (empty if the evaluation produced no MAE column).

    The data is split with ``test_size=0.2`` and ``random_state=42``.
    """
    os.makedirs(save_directory, exist_ok=True)
    if load_directory:
        # Make sure the evaluation step has a directory to list even if training
        # never wrote a checkpoint.
        os.makedirs(load_directory, exist_ok=True)

    X_train, y_train, X_valid, y_valid, X_test, y_test = prepare_and_save_data(data, dep_var, drop_columns, test_size=0.2, random_state=42)

    model_save_ANN(X_train, y_train, X_valid, y_valid, n_trials=n_trials, load_directory=load_directory, save_directory=save_directory, plot_loss=True)

    evaluate_results_df = evaluate_models_and_save_results(X_test, y_test, load_directory=load_directory, save_directory=save_directory, results_file_name='evaluation_results.csv')

    if 'MAE' in evaluate_results_df.columns:
        top_models_df = evaluate_results_df.nsmallest(top_k_models, 'MAE')
    else:
        # Without an MAE column there is nothing to rank; continue with an empty
        # selection instead of failing on an undefined variable.
        print("MAE column not found in evaluate_results_df.")
        top_models_df = evaluate_results_df.iloc[0:0]

    print("Top models based on MAE score:")
    print(top_models_df)

    top_models_df.to_csv(os.path.join(save_directory, "top_models_evaluation.csv"), index=False)

    return top_models_df


In [None]:
# Data Preparation

def impute_nulls(df):
    """
    Fill missing values in every column of ``df`` in place, choosing the strategy by dtype.

    - datetime columns: back-fill, then forward-fill any trailing gaps.
    - timedelta columns: mode.
    - float and complex columns: mean.
    - integer columns (only nullable integer dtypes can hold nulls): median,
      rounded to the nearest integer so the dtype is preserved.
    - everything else (object, string, bool, category, ...): mode.

    Columns without nulls are left untouched. A column that is entirely null
    has no mode and is skipped by the mode-based strategies.

    Parameters:
        df (pandas.DataFrame): Frame to impute; it is modified in place.

    Returns:
        pandas.DataFrame: The same ``df`` object, for convenience.
    """
    dtype_checks = pd.api.types

    for col in df.columns:
        series = df[col]
        if not series.isnull().any():
            continue

        dtype = series.dtype

        # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
        # the latter operates on a temporary and is a no-op under copy-on-write.
        if dtype_checks.is_datetime64_any_dtype(dtype):
            df[col] = series.bfill().ffill()
        elif dtype_checks.is_float_dtype(dtype) or dtype_checks.is_complex_dtype(dtype):
            df[col] = series.fillna(series.mean())
        elif dtype_checks.is_integer_dtype(dtype):
            median = series.median()
            if pd.isna(median):
                continue  # all values missing - nothing to derive a median from
            df[col] = series.fillna(int(round(float(median))))
        else:
            # timedelta, object, string, bool and category columns share the mode strategy.
            modes = series.mode()
            if modes.empty:
                continue  # all values missing - no mode available
            df[col] = series.fillna(modes.iloc[0])

    return df

from scipy import stats

def drop_outliers_by_zscores(data, column, lower_zscore, upper_zscore, inplace=False):
    """
    Drops rows from a Pandas DataFrame based on z-scores of a given column.

    A row is dropped when its z-score is ``<= lower_zscore`` or ``>= upper_zscore``
    (both boundaries are inclusive). Missing values in ``column`` are ignored when
    computing the mean and standard deviation, and rows whose value is missing
    are kept, because their z-score is undefined.

    Parameters:
    data (pandas.DataFrame): The input data.
    column (str): The name of the column to use for computing z-scores.
    lower_zscore (float): The lower z-score boundary.
    upper_zscore (float): The upper z-score boundary.
    inplace (bool): If True, updates the DataFrame directly. If False, returns a new DataFrame with outliers dropped.

    Returns:
    pandas.DataFrame or None: The modified DataFrame with outliers dropped, if inplace is False;
                              None, if inplace is True.

    Raises:
    ValueError: If ``column`` is not in ``data`` or a boundary is not finite.
    """
    if column not in data.columns:
        raise ValueError("Column '%s' not found in data." % column)
    if not np.isfinite(lower_zscore):
        raise ValueError("Lower z-score boundary must be finite.")
    if not np.isfinite(upper_zscore):
        raise ValueError("Upper z-score boundary must be finite.")

    # nan_policy='omit' keeps a single missing value from turning every z-score
    # into NaN, which would silently disable the outlier filter.
    z_scores = pd.Series(stats.zscore(data[column], nan_policy='omit'), index=data.index)

    # Comparisons with NaN are False, so rows with a missing value are never flagged.
    outlier_mask = (z_scores >= upper_zscore) | (z_scores <= lower_zscore)

    if inplace:
        data.drop(data[outlier_mask].index, inplace=True)
        return None
    return data.loc[~outlier_mask]
    
from sklearn.preprocessing import LabelEncoder
from typing import Optional

def encode_categorical_column(data: pd.DataFrame, column: str, mapping: Optional[dict] = None) -> pd.DataFrame:
    """
    Return a copy of ``data`` in which ``column`` has been replaced by numeric codes.

    :param data: input dataframe (left unmodified)
    :param column: name of the column to encode
    :param mapping: optional value -> code dictionary, intended for ordinal variables;
                    when omitted, a ``LabelEncoder`` fitted on the column assigns the codes
    :return: a new dataframe with the specified column encoded
    """
    encoded_frame = data.copy()

    if mapping is None:
        # Nominal case: let LabelEncoder derive the codes from the column itself.
        encoded_frame[column] = LabelEncoder().fit_transform(encoded_frame[column])
        return encoded_frame

    # Ordinal case: apply the caller-supplied mapping element-wise.
    encoded_frame[column] = encoded_frame[column].map(mapping)
    return encoded_frame

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from pandas.api.types import is_categorical_dtype

def one_hot_encode(df, columns):
    """
    One-hot encode categorical columns of a DataFrame using OneHotEncoder.

    The first category of every column is dropped (``drop='first'``). The encoded
    columns are appended to the frame and the original columns are removed.
    The input frame is not modified.

    Parameters:
        df (pandas.DataFrame): The DataFrame to preprocess.
        columns (list of str): The names of the categorical columns to encode.

    Returns:
        pandas.DataFrame: A new DataFrame with the categorical columns replaced by
            their one-hot encoded counterparts; the row index of ``df`` is preserved.

    Raises:
        ValueError: If any of the specified columns do not exist in the DataFrame.
        ValueError: If any of the specified columns do not contain categorical data.
    """
    missing_columns = set(columns) - set(df.columns)
    if missing_columns:
        raise ValueError(f"Columns {missing_columns} not found in DataFrame")

    def _is_categorical_like(series):
        # Accept category, object and string dtypes.
        dtype = series.dtype
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    non_categorical_columns = [col for col in columns if not _is_categorical_like(df[col])]
    if non_categorical_columns:
        raise ValueError(f"Columns {non_categorical_columns} do not contain categorical data")

    # `sparse` was renamed to `sparse_output` in scikit-learn; try the current
    # name first and fall back for older installations.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)
    encoded_array = encoder.fit_transform(df[columns])

    # Reuse the caller's index so the concat below aligns row-by-row even when
    # rows were filtered earlier and the index is no longer 0..n-1.
    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(columns), index=df.index)

    combined = pd.concat([df, encoded_df], axis=1)

    return combined.drop(columns=columns)

In [None]:
# Melbourne housing data: load, fix the misspelled longitude column name,
# impute missing values and remove rows whose Price z-score is beyond +/-3.5.
# NOTE(review): the path is an absolute, machine-specific location.
df = pd.read_csv(
    '/home/young78703/Data_Science_Project/data/melb_data.csv'
)
df.rename(
    columns={'Longtitude':'Longitude'},
    inplace=True,
)
impute_nulls(df)  # fills nulls on df itself; the returned frame is the same object
drop_outliers_by_zscores(
    df,
    'Price',
    -3.5,
    3.5,
    inplace=True,
)


In [None]:
# Random search on the Melbourne data with 'Price' as the target.
n_trials = 3
top_k_models = 1
load_directory = '/home/young78703/Data_Science_Project/output/ANN_Regression/load_directory'
save_directory = '/home/young78703/Data_Science_Project/output/ANN_Regression/save_directory'
top_models_df = execute_and_evaluate_ANN_models(
    df,
    'Price',
    drop_columns=[
        'Suburb', 'Address', 'Type', 'Method', 'Bedroom2', 'SellerG',
        'Date', 'Postcode', 'CouncilArea', 'Lattitude', 'Longitude',
        'Regionname',
    ],
    n_trials=n_trials,
    top_k_models=top_k_models,
    load_directory=load_directory,
    save_directory=save_directory,
)
print(top_models_df)

In [None]:
# German credit data: label-encode the two account columns and one-hot encode
# the remaining categorical columns.
# NOTE(review): 'Saving accounts' is encoded before impute_nulls runs while
# 'Checking account' is encoded after it - confirm the differing treatment of
# missing values in the two columns is intended.
df = pd.read_csv(
    '/home/young78703/Data_Science_Project/data/german_credit_data.csv'
)
df = encode_categorical_column(df, 'Saving accounts')
impute_nulls(df)  # operates on df itself
df = encode_categorical_column(df, 'Checking account')
columns = ['Sex', 'Housing', 'Purpose']
df = one_hot_encode(df, columns)

In [None]:
# Random search on the German credit data with 'Credit amount' as the target.
# NOTE(review): the same load/save directories as the Melbourne run are used.
n_trials = 10
top_k_models = 3
load_directory = '/home/young78703/Data_Science_Project/output/ANN_Regression/load_directory'
save_directory = '/home/young78703/Data_Science_Project/output/ANN_Regression/save_directory'
top_models_df = execute_and_evaluate_ANN_models(
    df,
    'Credit amount',
    drop_columns=[],
    n_trials=n_trials,
    top_k_models=top_k_models,
    load_directory=load_directory,
    save_directory=save_directory,
)
print(top_models_df)