In [1]:
import datetime

# Stamp the notebook with the wall-clock time of this run so that stale
# outputs further down are easy to recognise.
now = datetime.datetime.now()
print(f"Last executed: {now:%Y-%m-%d %H:%M:%S}")

Last executed: 2025-12-28 12:23:10


In [None]:
# initial imports
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

def find_project_root(start, marker="src"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Parameters
    ----------
    start : Path
        Directory where the upward search begins (pass an absolute path so
        that every ancestor up to the filesystem root is visited).
    marker : str, default "src"
        Name of the sub-directory that identifies the project root.

    Returns
    -------
    Path or None
        The first directory that has a ``marker`` sub-directory, or ``None``
        when no ancestor has one.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return None


# Find project root and add to path
PROJECT_ROOT = find_project_root(Path.cwd())
if PROJECT_ROOT is None:
    # Without this guard an unsuccessful search would leave PROJECT_ROOT at the
    # filesystem root, and anything written relative to it would land there.
    # Fall back to the working directory and say so.
    PROJECT_ROOT = Path.cwd()
    print(
        f"WARNING: no 'src' directory found at or above {PROJECT_ROOT}; "
        "project-local imports will fail.",
        file=sys.stderr,
    )

SRC_DIR = PROJECT_ROOT / "src"
# Only expose src/ to the import system when it really exists, and only once.
if SRC_DIR.is_dir() and str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Fix the global random seed through Keras' helper so repeated runs of the
# notebook start from the same random state.
RANDOM_SEED = 639256
tf.keras.utils.set_random_seed(RANDOM_SEED)

In [None]:
# Import utilities (now that sys.path is set)
from data.g2net import find_dataset_dir, load_labels, load_sample
from models.base_model import LogRegBaseline, compute_features

# Resolve the dataset location through the project helper and echo it so a
# wrong directory is noticed immediately. (A saved run of this notebook found
# the data under D:\Programming\g2net-gravitational-wave-detection.)
DATASET_DIR = find_dataset_dir()
print("DATASET_DIR:", DATASET_DIR)

# Archive holding the precomputed feature matrix and labels for the baseline.
FEAT_PATH = DATASET_DIR.joinpath("features_logreg_full.npz")

# Display names of the three detectors.
# NOTE(review): presumably listed in the channel order of each sample - confirm.
DETECTORS = [
    "Hanford (H1)",
    "Livingston (L1)",
    "Virgo (V1)",
]

In [None]:
# Load labels
df = load_labels(DATASET_DIR)  # (id, target)
y_df = df["target"].astype(int).to_numpy()

# Load precomputed features.
# np.load on an .npz returns a lazy NpzFile that keeps the archive open; the
# context manager closes the file handle as soon as both arrays are in memory.
with np.load(FEAT_PATH) as z:
    X_all = z["X"].astype(np.float32)
    y_all = z["y"].astype(np.int64)

print("X_all:", X_all.shape, X_all.dtype)
print("y_all:", y_all.shape, y_all.dtype)

# Sanity checks: the cached features must line up row-for-row with the label
# table, otherwise every downstream metric would be computed on shuffled labels.
assert len(df) == X_all.shape[0] == y_all.shape[0], "Mismatch between labels and feature rows"
assert np.array_equal(y_df, y_all), "Label order mismatch: y from CSV != y saved in npz"

# Split into 80% train, 20% validation.
# Row indices are split (rather than the arrays themselves) so the same
# partition can be reused to look up ids in `df`; stratifying keeps the class
# balance equal in both parts.
idx = np.arange(len(y_all))
idx_tr, idx_va = train_test_split(
    idx, test_size=0.2, random_state=42, stratify=y_all
)

X_tr, y_tr = X_all[idx_tr], y_all[idx_tr]
X_va, y_va = X_all[idx_va], y_all[idx_va]

X_all: (560000, 30) float32
y_all: (560000,) int64
Validation ROC-AUC: 0.5163768033194686


In [None]:
# Baseline classifier: standardise the features, then fit a logistic
# regression on the training split only.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver="lbfgs", max_iter=2000),
).fit(X_tr, y_tr)

# predict_proba yields one column per class; column index 1 belongs to
# pipe.classes_[1] (target == 1 when the labels are 0/1).
va_proba = pipe.predict_proba(X_va)
p_va = va_proba[:, 1]

# NOTE(review): a saved output in this notebook reports ~0.516 here, i.e. barely
# above chance - worth confirming that the cached features carry any signal.
auc = roc_auc_score(y_true=y_va, y_score=p_va)
print("Validation ROC-AUC:", auc)


In [5]:
# Fit the final model on every labelled sample.
# A fresh clone is fitted instead of re-fitting `pipe` in place: `pipe` stays
# the train-split model, so re-running the validation cell after this one
# still scores a model that has never seen the validation rows.
final_pipe = clone(pipe).fit(X_all, y_all)

# Load sample submission to get test IDs (ids are kept as strings so they are
# written back exactly as read).
sub = pd.read_csv(DATASET_DIR / "sample_submission.csv", dtype={"id": str})
test_ids = sub["id"].tolist()
print("Num test samples:", len(test_ids))

# Compute features for test: one row per test sample, same width as the
# training feature matrix.
X_test = np.empty((len(test_ids), X_all.shape[1]), dtype=np.float32)

for i, sample_id in enumerate(tqdm(test_ids)):
    w = load_sample(sample_id, split="test", dataset_dir=DATASET_DIR)
    X_test[i] = compute_features(w)

# Predict + write submission (column 1 of predict_proba is the probability of
# final_pipe.classes_[1]).
sub["target"] = final_pipe.predict_proba(X_test)[:, 1]
out_path = PROJECT_ROOT / "submission_logreg_baseline.csv"
sub.to_csv(out_path, index=False)

print("Wrote:", out_path)
sub.head()


Num test samples: 226000


  0%|          | 0/226000 [00:00<?, ?it/s]

Wrote: c:\Users\jose\OneDrive\Ambiente de Trabalho\lisa_gravitational_wave_detector\submission_logreg_baseline.csv


Unnamed: 0,id,target
0,00005bced6,0.495248
1,0000806717,0.481935
2,0000ef4fe1,0.472083
3,00020de251,0.499208
4,00024887b5,0.51583


In [6]:
# Diagnostic: list the .npz archives found anywhere under the dataset
# directory, together with their size in bytes (first 50 only).
d = Path(find_dataset_dir())
npzs = list(d.rglob("*.npz"))
npzs.sort()
print("Found", len(npzs), ".npz files")
for p in npzs[:50]:
    size_bytes = p.stat().st_size
    print(p, size_bytes)

Found 1 .npz files
D:\Programming\g2net-gravitational-wave-detection\features_logreg_full.npz 71680490
