# Fully Pipelined Colab Notebook — Data → ~900 TSFresh Features → Baseline → Siamese → Outputs
This notebook runs end-to-end with minimal interaction. Configure the parameters in the next cell, then run all cells from top to bottom.

In [None]:
# ==== Parameters (edit as needed) =====================================
# Notebook-wide settings; the later cells read these names directly.
REPO_URL = 'https://github.com/umed-indulkar/Exoplanet-Detection-Project.git'  # GitHub repo URL

# Input glob for your light curves (set to your folder of 5000 NPZ files)
# NOTE(review): none of the cells shown in this notebook reads INPUT_GLOB; the
# extraction cell passes the literal "data/*" instead — confirm which is intended.
INPUT_GLOB = 'data/lightcurves_ultraclean_5000/*.npz'

# Data ingestion mode: one of 'drive', 'upload', 'urls', 'demo'
# If your data is already committed in the repo under data/, leave 'upload' and skip the upload.
# NOTE: 'demo' is not a no-op — in that mode the ingestion cell writes a synthetic
# data/demo_curve.npz. Any value outside the four listed makes the ingestion cell
# print a notice and ingest nothing.
DATA_MODE = 'upload'
# If DATA_MODE == 'drive': folder or glob on Drive to copy from (e.g., '/content/drive/MyDrive/curves/*.npz')
DRIVE_DATA_GLOB = '/content/drive/MyDrive/curves/*.npz'
# If DATA_MODE == 'urls': list of URLs to download
DATA_URLS = []  # e.g., ['https://example.com/file1.npz']

# Labels ingestion: one of 'none', 'drive', 'upload', 'url'
# The labels cell merges the labels CSV with the features on a 'source' column,
# and the training cells use 'label' as the target column.
LABELS_MODE = 'none'
LABELS_DRIVE_PATH = '/content/drive/MyDrive/labels/labels.csv'
LABELS_URL = ''  # e.g., 'https://example.com/labels.csv'

# TSFresh preset: 'efficient' or 'comprehensive' (~900+)
# Lower-cased by the extraction cell and forwarded as --tsfresh-params.
TSFRESH_PRESET = 'comprehensive'
# Forwarded to the extraction CLI as --workers.
WORKERS = 4

# Training toggles
# Each training cell also requires outputs/features_labeled.csv to exist.
TRAIN_BASELINE = True
TRAIN_SIAMESE = True
# Forwarded to the train-siamese CLI as --epochs and --embedding respectively.
EPOCHS_SIAMESE = 10
EMBEDDING_DIM = 32
# Forwarded as --device to both Siamese CLI commands.
DEVICE = 'auto'  # 'auto'|'cpu'|'cuda'
# =======================================================================

import os, shutil, sys, subprocess, textwrap
from pathlib import Path
# Created relative to the working directory at the time this cell runs.
# NOTE(review): the clone cell later does `%cd exocode`, so these two directories
# sit outside the clone that subsequent cells write into — confirm this is intended.
Path('outputs').mkdir(exist_ok=True); Path('runs').mkdir(exist_ok=True)
print('Parameters loaded.')

In [None]:
# 1) Install dependencies
# Quiet (-q) install of the data/ML packages listed on the command line below.
!pip -q install numpy pandas scipy pyyaml matplotlib seaborn tsfresh statsmodels scikit-learn optuna streamlit
# Torch CPU wheel (works anywhere). If Colab GPU already has torch, you can skip this safely.
# --index-url points pip at the PyTorch CPU wheel index for this one install.
!pip -q install torch --index-url https://download.pytorch.org/whl/cpu

In [None]:
# 2) Clone repository
import os, shutil
if os.path.exists('exocode'):
    shutil.rmtree('exocode')
!git clone $REPO_URL exocode
%cd exocode
!python simple_test.py || true

In [None]:
# 3) Ingest data (to exocode/data)
from pathlib import Path
Path('data').mkdir(exist_ok=True)
print('DATA_MODE =', DATA_MODE)
if DATA_MODE == 'drive':
    from google.colab import drive
    drive.mount('/content/drive')
    !cp -v $DRIVE_DATA_GLOB data/ || true
elif DATA_MODE == 'urls':
    import urllib.request
    for url in DATA_URLS:
        name = url.split('/')[-1]
        print('Downloading', url)
        urllib.request.urlretrieve(url, f'data/{name}')
elif DATA_MODE == 'upload':
    from google.colab import files
    print('Upload .npz/.csv/.fits files...')
    uploaded = files.upload()
    for name in uploaded:
        shutil.move(name, f'data/{name}')
elif DATA_MODE == 'demo':
    # generate a small synthetic NPZ to demonstrate the pipeline
    import numpy as np
    t = np.linspace(0, 30, 3000); f = np.ones_like(t) + np.random.normal(0, 0.003, len(t))
    for t0 in np.arange(2.5, 30, 5.0):
        mask = (np.abs(t - t0) < 0.1)
        f[mask] -= 0.01
    np.savez('data/demo_curve.npz', time=t, flux=f)
else:
    print('Unknown DATA_MODE; no data ingested.')
print('Data files:', list(Path('data').glob('*'))[:5])

In [None]:
# 4) Extract TSFresh features (efficient or comprehensive)
preset = TSFRESH_PRESET.lower()
assert preset in ('efficient','comprehensive'), 'TSFRESH_PRESET must be efficient or comprehensive'
!python -m exodet.cli extract --input "data/*" --output outputs/features_tsfresh.csv --tier tsfresh --tsfresh-params $preset --workers $WORKERS
import pandas as pd
df = pd.read_csv('outputs/features_tsfresh.csv'); print('Extracted:', df.shape)
df.head(3)

In [None]:
# 5) Labels ingestion and merge (optional)
# Resolves a labels CSV according to LABELS_MODE, inner-joins it with the
# extracted features on the 'source' column, and writes
# outputs/features_labeled.csv, which the training cells look for.
import shutil, urllib.request
from pathlib import Path
import pandas as pd

labels_path = None
if LABELS_MODE == 'drive':
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    labels_path = LABELS_DRIVE_PATH
elif LABELS_MODE == 'upload':
    from google.colab import files
    print('Upload labels.csv with columns: source,label')
    up = files.upload()
    if up:
        # Every uploaded file is moved onto labels.csv, so if several are
        # uploaded the last one wins.
        for name in up:
            shutil.move(name, 'labels.csv')
        labels_path = 'labels.csv'
elif LABELS_MODE == 'url':
    if LABELS_URL:
        urllib.request.urlretrieve(LABELS_URL, 'labels.csv')
        labels_path = 'labels.csv'
    else:
        # Report the actual misconfiguration instead of claiming the mode is 'none'.
        print("LABELS_MODE = 'url' but LABELS_URL is empty (supervised steps will be skipped)")
elif LABELS_MODE == 'none':
    print('LABELS_MODE = none (supervised steps will be skipped)')
else:
    print(f'Unknown LABELS_MODE {LABELS_MODE!r} (supervised steps will be skipped)')

if labels_path and Path(labels_path).exists():
    feats = pd.read_csv('outputs/features_tsfresh.csv')
    lbls = pd.read_csv(labels_path)
    # Fail with a message that names the offending table rather than a bare KeyError.
    for table_name, table in (('features', feats), ('labels', lbls)):
        if 'source' not in table.columns:
            raise KeyError(f"{table_name} table has no 'source' column to merge on")
    merged = feats.merge(lbls, on='source', how='inner')
    if merged.empty:
        # An empty labeled file would send the training cells onto zero rows.
        print('No feature rows matched a label on source; outputs/features_labeled.csv not written.')
    else:
        merged.to_csv('outputs/features_labeled.csv', index=False)
        print('Labeled features:', merged.shape)
else:
    print('No labels available; continuing without supervised steps.')

In [None]:
# 6) Train/Evaluate Baseline (RandomForest)
import os
if TRAIN_BASELINE and os.path.exists('outputs/features_labeled.csv'):
    !python -m exodet.cli train --features outputs/features_labeled.csv --target label --model rf --output runs/rf.joblib
    !python -m exodet.cli evaluate --model runs/rf.joblib --features outputs/features_labeled.csv --target label
else:
    print('Skipping baseline (no labels or TRAIN_BASELINE=False).')

In [None]:
# 7) Train/Evaluate Siamese (deep)
import os
if TRAIN_SIAMESE and os.path.exists('outputs/features_labeled.csv'):
    !python -m exodet.cli train-siamese --features outputs/features_labeled.csv --target label --epochs $EPOCHS_SIAMESE --embedding $EMBEDDING_DIM --device $DEVICE --output runs/siamese.pt
    !python -m exodet.cli evaluate-siamese --model runs/siamese.pt --features outputs/features_labeled.csv --target label --device $DEVICE
else:
    print('Skipping Siamese (no labels or TRAIN_SIAMESE=False).')

In [None]:
# 8) Download outputs
# Offers each pipeline artifact that exists as a browser download and reports
# the ones that were never produced.
from google.colab import files
artifacts = (
    'outputs/features_tsfresh.csv',
    'outputs/features_labeled.csv',
    'runs/rf.joblib',
    'runs/siamese.pt',
)
for artifact in artifacts:
    if not Path(artifact).exists():
        print('Not found (skipped):', artifact)
        continue
    files.download(artifact)