# Data Exploration

This notebook helps visualize the data that was used to train and validate the model.

In [None]:
import datetime

# Record when this notebook was last run (naive timestamp from datetime.now()).
now = datetime.datetime.now()
print(f"Last executed: {now:%Y-%m-%d %H:%M:%S}")

In [None]:
# Initial imports
from __future__ import annotations
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import spectrogram

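Some fool-proofing: the next cell locates the project root, makes `src/` importable, and resolves the dataset directory automatically, so the notebook does not depend on where it is launched from.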

In [None]:
# Locate the project root: the closest directory, starting at the current
# working directory and moving upward, that contains a "src" folder. If no
# ancestor has one, the search ends at the filesystem root.
_start_dir = Path.cwd()
_search_path = (_start_dir, *_start_dir.parents)
PROJECT_ROOT = next(
    (folder for folder in _search_path if (folder / "src").exists()),
    _search_path[-1],
)

# Put the source folder at the front of the import path (only once).
SRC_DIR = PROJECT_ROOT / "src"
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

# Project helpers; this import relies on SRC_DIR being on sys.path
from data.g2net import find_dataset_dir, load_labels, load_sample

# Plotting / spectrogram constants
FS = 2048                          # sampling rate (Hz)
N = 4096                           # samples per detector
T = np.arange(N) / FS              # time axis (s)
DETECTORS = ["Hanford (H1)", "Livingston (L1)", "Virgo (V1)"]

# Dataset directory is resolved by the project helper, then the labels are read
DATASET_DIR = find_dataset_dir(PROJECT_ROOT)
train_labels_df = load_labels(DATASET_DIR)

# Sanity check: show what was resolved and the overall mean of the target column
_target_mean = float(train_labels_df["target"].mean())
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATASET_DIR:  ", DATASET_DIR)
print("Labels file:  ", (DATASET_DIR / "training_labels.csv"))
print("train_df:     ", train_labels_df.shape, "| target mean:", _target_mean)

### Load Training Labels

The labels file gives the ground-truth `target` for each training sample `id`; the model's output is trained against these labels. The first 5 rows are shown.

In [None]:
# Load the training labels from the dataset directory resolved during setup
# rather than from a path relative to the current working directory, so this
# cell works regardless of where the notebook is launched from.
train_labels_df = pd.read_csv(DATASET_DIR / "training_labels.csv")
train_labels_df.head()


We now take the first row of the `train_labels_df` **DataFrame** and read its `id` and `target` values. We then use `load_sample` to load the training sample that this id points to.

In [None]:
# Take the first labelled sample: its id, its target, and its raw data.
sample_id = train_labels_df.iloc[0]['id']
# .item() requires exactly one row to match this id and raises otherwise.
target_value = train_labels_df.loc[train_labels_df['id'] == sample_id, 'target'].item()
sample = load_sample(sample_id)

print(f"Sample ID: \n {sample_id}")
print(f"Sample: \n {sample} -> notice it has 3 arrays, one for each detector")
print(f"Sample shape: \n {sample.shape} -> 3 detectors each with a 2s time series at 2048 Hz (4096 points)")
# Test the comparison directly. Wrapping it in braces would build a non-empty
# set, which is always truthy, so "Yes!" would be printed for every sample.
if int(target_value) == 1:
    print("Is black hole? Yes!")
else:
    print("Is black hole? No :(")

# Time-Series Evolution



In [None]:
#visualize time series evolution

def plot_timeseries(sample, sample_id, target):
    """
    Plot the strain series of the three detectors as stacked panels.

    sample: indexable by detector; entries 0, 1 and 2 are plotted
    sample_id: identifier shown in each panel title
    target: label value shown in each panel title
    """
    plt.figure(figsize=(14, 8))

    # Panels are numbered from 1 by matplotlib; detectors are indexed from 0.
    for panel in (1, 2, 3):
        det_idx = panel - 1
        heading = f"{DETECTORS[det_idx]} - sample {sample_id} - target={target}"

        plt.subplot(3, 1, panel)
        plt.plot(sample[det_idx], linewidth=0.8)
        plt.title(heading)
        plt.xlabel("time (samples)")
        plt.ylabel("strain")

    plt.tight_layout()
    plt.show()

# Label of the first row of the labels table, shown in the plot titles.
first_row = train_labels_df.iloc[0]
target = first_row['target']
plot_timeseries(sample, sample_id, target)


# Spectrograms

In [None]:
# spectrograms

def plot_spectrogram(x, det=0, fs=FS, nperseg=256, noverlap=192, fmax=512, title=None):
    """
    Plot the spectrogram of one detector's signal on a dB scale.

    x: np.ndarray shape (3, 4096) or (4096,)
    det: detector index if x is (3, 4096); also selects the default title
    fs: sampling rate in Hz
    nperseg, noverlap: segment length and overlap passed to scipy's spectrogram
    fmax: highest frequency (Hz) to display; None keeps every frequency bin
    title: figure title; defaults to "Spectrogram - <detector name>" (en dash)
    """
    x = np.asarray(x)
    sig = x[det] if x.ndim == 2 else x

    f, t, Sxx = spectrogram(sig, fs=fs, nperseg=nperseg, noverlap=noverlap, scaling="density", mode="magnitude")
    if fmax is not None:
        keep = f <= fmax
        f, Sxx = f[keep], Sxx[keep, :]

    # mode="magnitude" yields amplitudes, so power in dB is 20*log10(|S|).
    # The floor that protects log10 from zeros is set relative to the peak
    # (240 dB below it): a fixed absolute offset would swamp any signal whose
    # magnitude is far smaller than the offset and flatten the whole plot.
    peak = Sxx.max(initial=0.0)
    floor = max(peak * 1e-12, np.finfo(Sxx.dtype).tiny)
    power_db = 20 * np.log10(np.maximum(Sxx, floor))

    plt.figure(figsize=(10, 4))
    plt.pcolormesh(t, f, power_db, shading="auto")
    plt.ylabel("Frequency [Hz]")
    plt.xlabel("Time [s]")
    # \u2013 is an en dash, written as an escape so it survives re-encoding.
    plt.title(title or f"Spectrogram \u2013 {DETECTORS[det]}")
    plt.colorbar(label="Power (dB)")
    plt.tight_layout()
    plt.show()

# One spectrogram per detector for the sample loaded earlier in the notebook.
for det in range(3):
    heading = f"ID: {sample_id} | Target: {target} | {DETECTORS[det]}"
    plot_spectrogram(sample, det=det, title=heading)


In [None]:
# explore random samples

import random

def random_sample_vis():
    """Pick one random labelled sample and plot its time series and spectrograms."""
    row = train_labels_df.sample(1).iloc[0]
    sample_id = row["id"]
    target = row["target"]
    data = load_sample(sample_id)

    print(f"Sample: {sample_id}, Target: {target}")
    plot_timeseries(data, sample_id, target)
    # plot_spectrogram's second and third positional parameters are the
    # detector index and the sampling rate, so the id and target belong in
    # the title; one figure is drawn per detector.
    for det, detector_name in enumerate(DETECTORS):
        plot_spectrogram(data, det=det, title=f"ID: {sample_id} | Target: {target} | {detector_name}")

random_sample_vis()
