# The issue

As @ambrosm has shown in [this discussion](https://www.kaggle.com/competitions/ariel-data-challenge-2024/discussion/527167), the transit time for the planets to pass the stars is not the same for each sample. However, calculations and feature engineering on the light curves require exact knowledge of these cutoffs.

![](https://www.googleapis.com/download/storage/v1/b/kaggle-forum-message-attachments/o/inbox%2F7917824%2F9b51760603a8f72d3604305ed76b8841%2Fsl.png?generation=1723307476416972&alt=media)

# The solution

Here, I present a very robust solution for identifying the transit zone that works on ALL training samples, for both sensor types. The solution is discussed here: https://www.kaggle.com/competitions/ariel-data-challenge-2024/discussion/529533

![](https://www.googleapis.com/download/storage/v1/b/kaggle-forum-message-attachments/o/inbox%2F2675447%2Fc8dcc7b6800036ce45a352d298c99249%2F__results___6_1.png?generation=1724246305869502&alt=media)

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Competition input directory; note that it already ends with a slash.
ROOT = "/kaggle/input/ariel-data-challenge-2024/"

# File names are appended without a leading "/" so the resulting paths do not
# contain a doubled separator. Both tables are indexed by planet_id.
train_adc_info = pd.read_csv(ROOT + "train_adc_info.csv", index_col="planet_id")
train_labels = pd.read_csv(ROOT + "train_labels.csv", index_col="planet_id")

In [None]:
# Notebook display: render the labels table read from train_labels.csv.
train_labels

In [None]:
# Load the pre-processed FGS and AIRS arrays from the ariel24-data-prep input
# directory. Later cells index both arrays along the first axis to obtain the
# signal of a single sample.
f_raw_train = np.load("/kaggle/input/ariel24-data-prep/fgs_v4.npy")
a_raw_train = np.load("/kaggle/input/ariel24-data-prep/airs_v4.npy")

In [None]:
# Notebook display: show the shape of the loaded FGS array.
f_raw_train.shape

In [None]:
from scipy.signal import savgol_filter


def smooth_data(data, window_size, polyorder=3):
    """Smooth a signal with a Savitzky-Golay filter.

    Args:
        data: Array-like sequence of samples to smooth.
        window_size: Length of the filter window; forwarded to
            ``savgol_filter`` as its window length.
        polyorder: Order of the polynomial fitted inside each window.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        The filtered signal as returned by ``savgol_filter``.
    """
    return savgol_filter(data, window_size, polyorder)


def optimize_breakpoint(data, initial_breakpoint, window_size=500, buffer_size=50, smooth_window=121):
    """Search around an initial guess for the best-scoring breakpoint.

    A candidate breakpoint ``bp`` implies a second, mirrored breakpoint at
    ``len(data) - bp``. Together they split the signal into three regions
    (before, between, after), with a margin of ``buffer_size`` samples on
    both sides of each breakpoint left out of the regions. Each candidate
    is scored as::

        |mean(r1) - mean(r2)| + |mean(r2) - mean(r3)|
        - 0.5 * (var(r1) + var(r2) + var(r3))
        + mean over both margins of (max - min) of the smoothed signal

    Args:
        data: Sequence of samples; sliced positionally and passed to
            ``smooth_data``.
        initial_breakpoint: Index around which candidates are tried.
        window_size: Candidates run over
            ``[initial_breakpoint - window_size, initial_breakpoint + window_size)``.
        buffer_size: Half-width of the margin excluded around each
            breakpoint. Must be positive.
        smooth_window: Window length handed to ``smooth_data``.

    Returns:
        The candidate with the highest score; the earliest one wins ties.
        ``initial_breakpoint`` is returned when no candidate is valid.

    Raises:
        ValueError: If ``buffer_size`` is not positive (the margins would be
            empty and their range undefined).
    """
    if buffer_size <= 0:
        raise ValueError("buffer_size must be a positive number of samples")

    n_samples = len(data)
    smoothed_data = smooth_data(data, smooth_window)

    best_breakpoint = initial_breakpoint
    best_score = float("-inf")

    for offset in range(-window_size, window_size):
        candidate = initial_breakpoint + offset

        # Margin around the first breakpoint and around its mirror image.
        left_start = candidate - buffer_size
        left_stop = candidate + buffer_size
        right_start = n_samples - candidate - buffer_size
        right_stop = n_samples - candidate + buffer_size

        # Skip candidates whose regions would be empty or whose indices
        # would go negative and wrap around to the other end of the array;
        # scoring those would either fail or rank unrelated data.
        if left_start <= 0 or left_stop >= right_start:
            continue

        region1 = data[:left_start]
        region2 = data[left_stop:right_start]
        region3 = data[right_stop:]

        # The margins are evaluated on the smoothed signal so that noise
        # does not dominate the max-min range.
        transition1 = smoothed_data[left_start:left_stop]
        transition2 = smoothed_data[right_start:right_stop]

        mean1, mean2, mean3 = np.mean(region1), np.mean(region2), np.mean(region3)
        mean_diff = abs(mean1 - mean2) + abs(mean2 - mean3)
        var_sum = np.var(region1) + np.var(region2) + np.var(region3)

        range_at_breakpoint1 = np.max(transition1) - np.min(transition1)
        range_at_breakpoint2 = np.max(transition2) - np.min(transition2)
        mean_range_at_breakpoint = (range_at_breakpoint1 + range_at_breakpoint2) / 2

        score = mean_diff - 0.5 * var_sum + mean_range_at_breakpoint

        # Strict comparison keeps the first of several equally good candidates.
        if score > best_score:
            best_score = score
            best_breakpoint = candidate

    return best_breakpoint



def find_and_plot_breakpoints(f_raw, a_raw, IDX, verbose=False):
    """Locate the transit breakpoints of one sample and plot them.

    For each sensor the breakpoint is optimised with ``optimize_breakpoint``
    using sensor-specific settings, then the raw signal, the smoothed signal
    and the excluded margins around both breakpoints are drawn. AIRS-CH0 goes
    on the left axis and FGS1 on the right one.

    Args:
        f_raw: FGS1 signal of the sample (expected to be a 1-D series —
            TODO confirm against the data-prep step).
        a_raw: AIRS-CH0 signal of the sample (same expectation).
        IDX: Positional index of the sample; used to look up the planet id
            in the module-level ``train_labels`` table for the figure title.
        verbose: When true, print the initial and the optimised breakpoint
            for each sensor.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))
    planet_id = train_labels.index[IDX]

    # Per-sensor signal, target axis and search settings. Insertion order
    # determines the processing order (AIRS-CH0 first, then FGS1).
    sensor_settings = {
        "AIRS-CH0": {
            "data": a_raw,
            "ax": ax1,
            "buffer_size": 120,
            "smooth_window": 121,
            "window_size": 500,
            "default_breakpoint": 1821,
        },
        "FGS1": {
            "data": f_raw,
            "ax": ax2,
            "buffer_size": 1500,
            "smooth_window": 1001,
            "window_size": 3500,
            "default_breakpoint": 21500,
        },
    }

    for sensor_name, settings in sensor_settings.items():
        data = settings["data"]
        ax = settings["ax"]
        buffer_size = settings["buffer_size"]
        smooth_window = settings["smooth_window"]
        initial_breakpoint = settings["default_breakpoint"]

        if verbose:
            print(f"Initial breakpoint: {initial_breakpoint}")
        optimized_breakpoint = optimize_breakpoint(
            data,
            initial_breakpoint,
            window_size=settings["window_size"],
            buffer_size=buffer_size,
            smooth_window=smooth_window,
        )
        if verbose:
            print(f"Optimized breakpoint: {optimized_breakpoint}")

        # Mirror the breakpoint the same way optimize_breakpoint does
        # (len(data) - bp); 2 * (len(data) // 2) - bp would be off by one
        # for signals with an odd number of samples.
        breakpoints = [optimized_breakpoint, len(data) - optimized_breakpoint]

        # Plot the results
        ax.plot(data, color='#4E79A7', alpha=0.7, label="Original Data")
        ax.plot(smooth_data(data, smooth_window), label="Smoothed Data", color='#F28E2B')

        for bp in breakpoints:
            ax.axvline(x=bp - buffer_size, color="r", linestyle="--")
            ax.axvline(x=bp + buffer_size, color="r", linestyle="--")
            ax.axvspan(bp - buffer_size, bp + buffer_size, color="gray", alpha=0.3)

        ax.set_title(f"{sensor_name}")
        ax.set_xlabel("Time step")
        ax.set_ylabel("Value")
        ax.legend()

    # Set the figure title before the layout pass so tight_layout can take it
    # into account instead of letting it overlap the subplot titles.
    fig.suptitle(f"Planet ID: {planet_id}, IDX {IDX}", fontsize=16)
    fig.tight_layout()
    plt.show()

In [None]:
# Visual sanity check: run the breakpoint search and plot for the first 20
# samples. Both arrays are indexed with the same position, so row IDX of each
# is treated as belonging to the same sample.
for IDX in range(20):
    f_raw = f_raw_train[IDX]  # FGS signal of sample IDX
    a_raw = a_raw_train[IDX]  # AIRS signal of sample IDX
    find_and_plot_breakpoints(f_raw, a_raw, IDX)