In [1]:
%pip install lightkurve astroquery astropy tensorflow scikit-learn pandas numpy matplotlib requests tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

import tensorflow as tf
from tensorflow.keras import layers, models

from lightkurve import search_lightcurvefile
from astropy.timeseries import BoxLeastSquares



In [3]:
import requests, io
import pandas as pd

base = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI"
# Request the cumulative KOI table
params = {
    "table": "cumulative",
    "select": "kepid,koi_disposition,koi_period,koi_time0bk,koi_duration,koi_depth",
    "format": "csv"
}
resp = requests.get(base, params=params)
print("Status:", resp.status_code)
text = resp.text
# If error, print first few lines
print(text[:500])
koi_df = pd.read_csv(io.StringIO(text))
print("Columns:", koi_df.columns.tolist())
print("Rows:", len(koi_df))

Status: 200
kepid,koi_disposition,koi_period,koi_time0bk,koi_duration,koi_depth
10797460,CONFIRMED,9.48803557,170.53875,2.9575,615.8
10797460,CONFIRMED,54.4183827,162.51384,4.507,874.8
10811496,CANDIDATE,19.89913995,175.850252,1.7822,10829
10848459,FALSE POSITIVE,1.736952453,170.307565,2.40641,8079.2
10854555,CONFIRMED,2.525591777,171.59555,1.6545,603.3
10872983,CONFIRMED,11.09432054,171.20116,4.5945,1517.5
10872983,CONFIRMED,4.13443512,172.97937,3.1402,686
10872983,CONFIRMED,2.56658897,179.55437,2.429,226.
Columns: ['kepid', 'koi_disposition', 'koi_period', 'koi_time0bk', 'koi_duration', 'koi_depth']
Rows: 9564


In [4]:
# --- 4) Filter KOI table ---
koi_df = koi_df.dropna(subset=["kepid", "koi_disposition"])

# Map dispositions to binary labels: CONFIRMED/CANDIDATE => planet(1), FALSE POSITIVE => non-planet(0)
koi_df['label'] = koi_df['koi_disposition'].apply(lambda s: 0 if 'FALSE POSITIVE' in str(s).upper() or 'FALSE_POSITIVE' in str(s).upper() else 1)

In [9]:
# --- 5) Balanced sample for quick run ---
pos = koi_df[koi_df.label == 1].sample(10, random_state=42)
neg = koi_df[koi_df.label == 0].sample(10, random_state=42)
sample_df = pd.concat([pos, neg]).reset_index(drop=True)

In [10]:
from lightkurve import search_lightcurve
from lightkurve.periodogram import BoxLeastSquares
from tqdm import tqdm

def fetch_and_preprocess(kepid, period=None, t0=None, duration=None, use_median_norm=True):
    try:
        # Use new Lightkurve API
        search = search_lightcurve(f'Kepler {int(kepid)}', mission='Kepler')
        if len(search) == 0:
            return None
        lcf = search.download_all()
        lc = lcf.stitch().remove_nans()
    except Exception as e:
        return None

    flux, time = lc.flux.value, lc.time.value
    if np.all(np.isnan(flux)):
        return None

    if use_median_norm:
        flux = flux / np.nanmedian(flux) - 1

    # Use BLS if no period provided
    if period is None or pd.isna(period):
        try:
            mask = np.isfinite(time) & np.isfinite(flux)
            t, y = time[mask], flux[mask]
            if len(t) < 50:
                return None
            durations = np.linspace(0.02, 0.3, 10)
            bls = BoxLeastSquares(t, y)
            res = bls.autopower(durations)
            best = np.argmax(res.power)
            period, t0 = res.period[best], res.transit_time[best]
        except Exception:
            return None

    # Phase fold
    phase = ((time - t0 + 0.5 * period) % period) / period - 0.5
    idx = np.argsort(phase)
    phase, flux = phase[idx], flux[idx]

    # Global view
    grid = np.linspace(-0.5, 0.5, 501)
    flux_interp = np.interp(grid, phase, flux, left=np.nanmedian(flux), right=np.nanmedian(flux))

    # Local view
    dur = (duration / 24.0) if duration and not pd.isna(duration) else 0.05 * period
    local_grid = np.linspace(-0.05, 0.05, 101)
    local_flux = np.interp(local_grid, phase, flux, left=np.nanmedian(flux), right=np.nanmedian(flux))

    def norm(arr):
        arr = np.array(arr)
        return (arr - np.nanmedian(arr)) / (np.nanpercentile(arr, 95) - np.nanpercentile(arr, 5) + 1e-8)

    return norm(flux_interp), norm(local_flux)


# Step 7: build dataset
X_global, X_local, y = [], [], []
for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    kepid = int(row.kepid)
    p = row.koi_period if not pd.isna(row.koi_period) else None
    t0 = row.koi_time0bk if not pd.isna(row.koi_time0bk) else None  # <-- FIXED
    dur = row.koi_duration if not pd.isna(row.koi_duration) else None

    out = fetch_and_preprocess(kepid, period=p, t0=t0, duration=dur)
    if out is None:
        continue
    g, l = out
    X_global.append(g)
    X_local.append(l)
    y.append(row.label)

X_global, X_local, y = np.array(X_global), np.array(X_local), np.array(y)
print("Built dataset:", X_global.shape, X_local.shape, y.shape)

  arr_common = np.array([arr[0] for arr in arrs])
  arr_common = np.array([arr[0] for arr in arrs])
100%|██████████| 20/20 [17:59<00:00, 53.99s/it] 

Built dataset: (18, 501) (18, 101) (18,)





In [11]:
# --- 8) Train/test split ---
Xg_train, Xg_test, Xl_train, Xl_test, y_train, y_test = train_test_split(
    X_global, X_local, y, test_size=0.2, random_state=42, stratify=y
)

In [12]:
# --- 9) 1D CNN model ---
def build_model(global_len=501, local_len=101):
    in_g = layers.Input(shape=(global_len, 1), name='global_in')
    in_l = layers.Input(shape=(local_len, 1), name='local_in')

    def small_cnn(x):
        x = layers.Conv1D(16, 5, activation='relu', padding='same')(x)
        x = layers.MaxPool1D(2)(x)
        x = layers.Conv1D(32, 5, activation='relu', padding='same')(x)
        x = layers.GlobalMaxPool1D()(x)
        return x

    xg = small_cnn(in_g)
    xl = small_cnn(in_l)
    x = layers.Concatenate()([xg, xl])
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    out = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model([in_g, in_l], out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

model = build_model(global_len=Xg_train.shape[1], local_len=Xl_train.shape[1])
model.summary()

In [None]:
# --- 10) Train ---
Xg_train_r = Xg_train[..., None]; Xl_train_r = Xl_train[..., None]
Xg_test_r = Xg_test[..., None];  Xl_test_r = Xl_test[..., None]

history = model.fit([Xg_train_r, Xl_train_r], y_train,
                    validation_split=0.2, epochs=12, batch_size=8)

Epoch 1/12




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 204ms/step - AUC: 0.4250 - loss: 0.6980 - val_AUC: 1.0000 - val_loss: 0.6699
Epoch 2/12
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - AUC: 0.7778 - loss: 0.6457 - val_AUC: 1.0000 - val_loss: 0.6734
Epoch 3/12
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - AUC: 0.6667 - loss: 0.6494 - val_AUC: 1.0000 - val_loss: 0.6726
Epoch 4/12
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - AUC: 0.6889 - loss: 0.6351 - val_AUC: 1.0000 - val_loss: 0.6742
Epoch 5/12
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - AUC: 0.9556 - loss: 0.6114 - val_AUC: 0.7500 - val_loss: 0.6773
Epoch 6/12
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - AUC: 0.9556 - loss: 0.5889 - val_AUC: 1.0000 - val_loss: 0.6803
Epoch 7/12
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - AUC: 1.0000 - loss: 0.5537 - va

In [14]:
from tensorflow.keras import layers, models, regularizers
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping

# --- 9) Tiny 1D CNN for very small dataset ---
def build_model(global_len=501, local_len=101):
    in_g = layers.Input(shape=(global_len, 1), name='global_in')
    in_l = layers.Input(shape=(local_len, 1), name='local_in')

    def tiny_cnn(x):
        x = layers.Conv1D(8, 3, activation='relu', padding='same',
                          kernel_regularizer=regularizers.l2(0.01))(x)
        x = layers.MaxPool1D(2)(x)
        x = layers.GlobalMaxPool1D()(x)
        return x

    xg = tiny_cnn(in_g)
    xl = tiny_cnn(in_l)
    x = layers.Concatenate()([xg, xl])
    x = layers.Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model([in_g, in_l], out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

model = build_model(global_len=Xg_train.shape[1], local_len=Xl_train.shape[1])
model.summary()


In [15]:
# --- 10) Train with tiny batch size and early stopping ---
Xg_train_r = Xg_train[..., None]; Xl_train_r = Xl_train[..., None]
Xg_test_r  = Xg_test[..., None];  Xl_test_r  = Xl_test[..., None]

# Compute class weights to handle imbalance
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    {"global_in": Xg_train_r, "local_in": Xl_train_r},
    y_train,
    validation_data=({"global_in": Xg_test_r, "local_in": Xl_test_r}, y_test),
    epochs=50,
    batch_size=2,
    class_weight=class_weights,
    callbacks=[early_stop]
)

Epoch 1/50




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - AUC: 0.2632 - loss: 1.4397 - val_AUC: 0.5000 - val_loss: 0.8813
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.7444 - loss: 0.8571 - val_AUC: 0.5000 - val_loss: 0.8664
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.4747 - loss: 1.1528 - val_AUC: 0.5000 - val_loss: 0.8611
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.4288 - loss: 0.8175 - val_AUC: 0.5000 - val_loss: 0.8546
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.4849 - loss: 1.0738 - val_AUC: 0.5000 - val_loss: 0.8563
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.6225 - loss: 0.9372 - val_AUC: 0.5000 - val_loss: 0.8691
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - AUC: 0.3456 - loss: 0.9740 - val_AUC: 

In [16]:
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_auc_score, classification_report

# --- 7) Augment small dataset ---
def augment_lightcurve(global_lc, local_lc, n_aug=5):
    """Create synthetic variations of one sample"""
    augmented = []
    for _ in range(n_aug):
        # global view
        g = global_lc + np.random.normal(0, 0.02, size=global_lc.shape)  # small Gaussian noise
        g = g * np.random.uniform(0.95, 1.05)  # slight scaling
        # local view
        l = local_lc + np.random.normal(0, 0.02, size=local_lc.shape)
        l = l * np.random.uniform(0.95, 1.05)
        augmented.append((g, l))
    return augmented

Xg_aug, Xl_aug, y_aug = [], [], []

for i in range(len(X_global)):
    # original
    Xg_aug.append(X_global[i])
    Xl_aug.append(X_local[i])
    y_aug.append(y[i])
    # synthetic
    syn = augment_lightcurve(X_global[i], X_local[i], n_aug=5)
    for g, l in syn:
        Xg_aug.append(g)
        Xl_aug.append(l)
        y_aug.append(y[i])

Xg_aug = np.array(Xg_aug)[..., None]  # add channel dim
Xl_aug = np.array(Xl_aug)[..., None]
y_aug = np.array(y_aug)

# --- 8) Train/test split ---
Xg_train, Xg_test, Xl_train, Xl_test, y_train, y_test = train_test_split(
    Xg_aug, Xl_aug, y_aug, test_size=0.2, random_state=42, stratify=y_aug
)

# Compute class weights
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))

# --- 9) Tiny CNN for small dataset ---
def build_model(global_len=501, local_len=101):
    in_g = layers.Input(shape=(global_len,1), name='global_in')
    in_l = layers.Input(shape=(local_len,1), name='local_in')

    def tiny_cnn(x):
        x = layers.Conv1D(8, 3, activation='relu', padding='same',
                          kernel_regularizer=regularizers.l2(0.01))(x)
        x = layers.MaxPool1D(2)(x)
        x = layers.GlobalMaxPool1D()(x)
        return x

    xg = tiny_cnn(in_g)
    xl = tiny_cnn(in_l)
    x = layers.Concatenate()([xg, xl])
    x = layers.Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model([in_g, in_l], out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

model = build_model(global_len=Xg_train.shape[1], local_len=Xl_train.shape[1])
model.summary()

# --- 10) Train with early stopping ---
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    {"global_in": Xg_train, "local_in": Xl_train},
    y_train,
    validation_data=({"global_in": Xg_test, "local_in": Xl_test}, y_test),
    epochs=50,
    batch_size=8,
    class_weight=class_weights,
    callbacks=[early_stop]
)

# --- 11) Evaluate ---
preds = model.predict({"global_in": Xg_test, "local_in": Xl_test}).ravel()
print("ROC-AUC:", roc_auc_score(y_test, preds))
print(classification_report(y_test, (preds>0.5).astype(int)))


Epoch 1/50




[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - AUC: 0.3421 - loss: 1.6510 - val_AUC: 0.6198 - val_loss: 1.0444
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.4490 - loss: 1.5584 - val_AUC: 0.7686 - val_loss: 0.8821
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.4571 - loss: 1.4557 - val_AUC: 0.8182 - val_loss: 0.7763
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.5837 - loss: 1.1087 - val_AUC: 0.8182 - val_loss: 0.7037
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.5668 - loss: 0.9942 - val_AUC: 0.8182 - val_loss: 0.6517
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.5496 - loss: 1.0240 - val_AUC: 0.8182 - val_loss: 0.6369
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.4649 - loss: 0.96