In [1]:
import numpy as np
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt


# Data Preprocessing Functions

In [None]:
# Load in the train csv data set via numpy
def load_data(file_path):
    
    """Load data from a CSV file into a pandas DataFrame.

    Returns:
        pd.DataFrame: DataFrame containing the data from the CSV file.
    """
    
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        sys.exit(1)
    data = pd.read_csv(file_path, delimiter=' ', header=None)
    return data


def plot_histgrams(df, display_corr_matrix=False):
    
    """
    Plot histograms of the DataFrame columns and optionally display a correlation matrix.
    """
    
    df.hist(bins=30, figsize=(8,6), color='b')
    plt.tight_layout()
    plt.show()
    
    if display_corr_matrix:
        corr_matrix = df.corr()
        plt.figure(figsize=(10, 8))
        plt.matshow(corr_matrix, fignum=1)
        plt.colorbar()
        plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=90)
        plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
        plt.title('Correlation Matrix', pad=20)
        plt.show()
    
def delete_bad_sensors(df):
    
    """Identify and remove sensors with NaN or zero correlation in the DataFrame.

    Returns:
        pd.DataFrame: DataFrame with bad sensors removed.
        list: List of labels of the removed sensors.
    """
    
    # Build the correlation matrix from columns 1 through -2 of train_df
    correlation_matrix = df.iloc[:, 1:-1].corr()

    # Select the 2nd column of that matrix (0-based index 1)
    col = correlation_matrix.iloc[:, 1]

    # Build your mask: NaNs or exact zeros
    mask = col.isna() | (col == 0)

    # 1) If you want the actual **column names** (labels) whose correlation is bad:
    bad_labels = col.index[mask].tolist()
    print("Columns with NaN or 0 correlation:", bad_labels)

    # 2) If you instead want their **integer positions** within this correlation matrix:
    bad_positions = np.where(mask)[0]         # 0-based positions
    bad_positions_one_based = bad_positions + 1
    print("Bad Sensors 1-based positions:", bad_positions_one_based)

    # Drop the bad sensors from the training data
    df.drop(columns=bad_labels, inplace=True)
    
    return df, bad_labels

def normalize_data(train_df):
    
    """Normalize the training data using Min-Max scaling.

    Returns:
        pd.DataFrame: Normalized DataFrame with values scaled to the range [-1, 1].
    """
    
    from sklearn.preprocessing import MinMaxScaler
    features=train_df.columns
    min_max_scaler = MinMaxScaler(feature_range=(-1,1))
    train_df[features] = min_max_scaler.fit_transform(train_df[features])

    return train_df


# Function to calculate the remaining flights for each engine
def calculate_remaining_flights(data):
    # Group by engine ID and find the maximum cycle for each engine
    max_cycles = data.groupby(0)[1].max().reset_index()
    max_cycles.columns = [0, 'max_cycle']
    
    # Merge to get the remaining flights
    data = data.merge(max_cycles, on=0)
    data['remaining_flights'] = data['max_cycle'] - data[1]
    
    return data

# Gaussian Process Functions

In [3]:
import tensorflow as tf
import gpflow as gpf
from gpflow.utilities import print_summary

def get_gpus():
    """
    Returns a list of available GPU devices.
    """
    gpus = tf.config.list_physical_devices('GPU')
    print("GPUs:", gpus)
    return gpus


def additive_kernel():
    """Create an additive kernel composed of a linear and RBF kernel.
    """
    linear_kernel = gpf.kernels.Linear()
    rbf_kernel = gpf.kernels.RBF()
    additive_kernel = linear_kernel + rbf_kernel
    
    return additive_kernel





def prepare_data(train_df, targets_df, num_points=1000):
    """
    Extracts and formats training inputs and targets from DataFrames.
    - train_df: pandas.DataFrame of features
    - targets_df: pandas.DataFrame with a 'remaining_flights' column
    - num_points: number of rows to select
    Returns: (train_x, train_y) as float64 numpy arrays with shape [N, D] and [N, 1]
    """
    x = train_df.to_numpy().astype(np.float64)[:num_points, :]
    y = targets_df['remaining_flights'].to_numpy().astype(np.float64)[:num_points]

    # Ensure 2D arrays
    if x.ndim == 1:
        x = x[:, None]
    if y.ndim == 1:
        y = y[:, None]
    return x, y


def build_gpr_model(train_x, train_y, kernel=gpf.kernels.RBF() ,noise_variance=1e-2):
    """
    Builds and returns a GPFlow GPR model with an additive linear + RBF kernel.
    Also initializes the likelihood variance.
    Returns: (model)
    """
    
    mean_func = gpf.mean_functions.Constant()

    model = gpf.models.GPR(data=(train_x, train_y), kernel=kernel, mean_function=mean_func)
    model.likelihood.variance.assign(noise_variance)
    return model


@tf.function
def optimization_step(model, optimizer):
    with tf.GradientTape() as tape:
        loss = model.training_loss()
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss


def train_model(model, num_epochs=1000, learning_rate=0.01):
    """
    Trains the GPFlow model using the Adam optimizer.
    Returns: (model)
    """
    
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    
    for epoch in range(num_epochs):
        loss = optimization_step(model, optimizer)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss.numpy()}")
    
    print_summary(model)
    return model



# model = build_gpr_model(train_x, train_y, kernel=kernel)

# # List all trainable variables:
# for v in model.trainable_variables:
#     print(v.name, v.shape)






In [8]:
train_df = load_data('train_FD001.csv')

train_df.head()
train_df.drop(labels=[26, 27], axis=1, inplace=True)

# Remove the bad sensors
train_df, bad_sensors = delete_bad_sensors(train_df)
# Calculate remaining flights
train_df = calculate_remaining_flights(train_df)

# Select just the sensor data
train_sensors = train_df.iloc[:, 2:-1]
# Normalize the sensor data
train_sensors = normalize_data(train_sensors)

# Prepare the targets DataFrame
targets_df = train_df[[0, 1, 'remaining_flights']].copy()



Columns with NaN or 0 correlation: [4, 5, 9, 14, 20, 22, 23]
Bad Sensors 1-based positions: [ 4  5  9 14 20 22 23]


TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.