In [5]:
# Let's start with necessary imports
import os
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

In [6]:
# Load the three datasets, scale every channel to unit standard deviation and
# move the channel axis last.
def scale_and_reorder(raw):
    """Scale each row along the last axis to unit std, then swap axes 1 and 2.

    Parameters
    ----------
    raw : np.ndarray
        3-D array. Assumed layout is (events, detector channels, time samples)
        -- TODO confirm against the challenge data description.

    Returns
    -------
    np.ndarray
        ``raw / std`` with axes 1 and 2 exchanged, i.e. (events, time samples,
        detector channels) under the assumption above.
    """
    # keepdims keeps a trailing length-1 axis so the division broadcasts over
    # the last axis, exactly like indexing the std with [:, :, np.newaxis].
    per_row_std = np.std(raw, axis=-1, keepdims=True)
    return np.swapaxes(raw / per_row_std, 1, 2)


# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the context manager closes it once the 'data' array has been read.
with np.load('background.npz') as background_archive:
    background = scale_and_reorder(background_archive['data'])

bbh = scale_and_reorder(np.load('bbh_for_challenge.npy'))

sglf = scale_and_reorder(np.load('sglf_for_challenge.npy'))

In [None]:
from scipy.fft import rfft, rfftfreq

# Sampling rate for the data; the feature functions below use it to convert
# FFT bin indices into frequencies in Hz.
sampling_rate = 4096  # Hz

# Define feature extraction functions
def get_peak_frequency(signal, sampling_rate):
    """Return the frequency of the strongest bin in the one-sided spectrum.

    Parameters
    ----------
    signal : 1-D array-like
        Real-valued time series.
    sampling_rate : float
        Samples per second; sets the spacing of the frequency bins.

    Returns
    -------
    float
        Frequency (in the units implied by ``sampling_rate``) of the bin with
        the largest FFT magnitude. The zero-frequency bin is not excluded.
    """
    magnitudes = np.abs(rfft(signal))
    strongest_bin = magnitudes.argmax()
    bin_frequencies = rfftfreq(len(signal), d=1 / sampling_rate)
    return bin_frequencies[strongest_bin]

def get_snr(signal):
    """Return the mean squared amplitude of ``signal`` over a unit noise power.

    The noise power is fixed at 1 (whitened data is assumed to have unit noise
    power), so the result equals ``mean(signal ** 2)``.

    NOTE(review): the loading cell divides every channel by its own standard
    deviation, so for a correctly sliced channel this value is
    ``1 + mean(signal) ** 2`` and will sit very close to 1 for every event --
    confirm that this feature is actually informative.
    """
    unit_noise_power = 1  # whitened data: noise power taken to be exactly 1
    mean_square = np.mean(signal ** 2)
    return mean_square / unit_noise_power

def get_central_frequency(signal, sampling_rate):
    """Return the magnitude-weighted mean frequency of the one-sided spectrum.

    Parameters
    ----------
    signal : 1-D array-like
        Real-valued time series.
    sampling_rate : float
        Samples per second; sets the spacing of the frequency bins.

    Returns
    -------
    float
        ``sum(f * |X(f)|) / sum(|X(f)|)`` over all rfft bins. The weights are
        FFT magnitudes, not powers.
    """
    magnitudes = np.abs(rfft(signal))
    bin_frequencies = rfftfreq(len(signal), d=1 / sampling_rate)
    weighted_total = (bin_frequencies * magnitudes).sum()
    return weighted_total / magnitudes.sum()

def get_bandwidth(signal, sampling_rate, threshold=0.1):
    """Return the frequency span of the bins whose magnitude exceeds a threshold.

    Parameters
    ----------
    signal : 1-D array-like
        Real-valued time series.
    sampling_rate : float
        Samples per second; sets the spacing of the frequency bins.
    threshold : float, default 0.1
        Fraction of the peak FFT magnitude a bin must exceed to be counted.

    Returns
    -------
    float
        Highest minus lowest frequency among the bins above the threshold.
        Returns 0.0 when no bin exceeds it (e.g. an all-zero signal) instead
        of raising on an empty selection.
    """
    magnitudes = np.abs(rfft(signal))
    bin_frequencies = rfftfreq(len(signal), d=1 / sampling_rate)
    # The cut is applied to FFT magnitudes, not to power.
    magnitude_threshold = threshold * np.max(magnitudes)
    occupied = bin_frequencies[magnitudes > magnitude_threshold]
    if occupied.size == 0:
        return 0.0
    # np.ptp, not ndarray.ptp(): the method form was removed in NumPy 2.0.
    return np.ptp(occupied)

# Duration is constant for all segments (200 samples at 4096 Hz).
# The 200 is hard-coded rather than read from the arrays -- assumes every
# segment has exactly 200 time samples; verify against the loaded data.
duration = 200 / sampling_rate  # ~48.8 ms

# Extract features for each dataset
def extract_features(data, label):
    """Compute per-detector spectral features for every event in ``data``.

    Parameters
    ----------
    data : array-like, 3-D
        Strain segments, either channels-first ``(events, channels, time)`` or
        channels-last ``(events, time, 2)``. Channel 0 is treated as Hanford
        (H1) and channel 1 as Livingston (L1). When both trailing axes have
        length 2 the layout is ambiguous and channels-first is assumed.
    label : int
        Class label stored with every row: 0 = background, 1 = BBH, 2 = SGLF.

    Returns
    -------
    list of dict
        One record per event with keys ``peakFreq_*``, ``snr_*``,
        ``centralFreq_*`` and ``bandwidth_*`` for H1 then L1, followed by
        ``duration`` and ``label``.
    """
    detectors = ("H1", "L1")
    events = np.asarray(data)

    # The loading cell swaps the arrays to (events, time, channels). Indexing
    # such a sample with [0] / [1] would pick two-element time slices instead
    # of the detector time series, so bring the channel axis back to
    # position 1 before slicing.
    if events.ndim == 3 and events.shape[1] != 2 and events.shape[2] == 2:
        events = np.swapaxes(events, 1, 2)

    features = []
    for sample in events:
        record = {}
        for channel, detector in enumerate(detectors):
            signal = sample[channel]  # full time series of one detector
            record[f"peakFreq_{detector}"] = get_peak_frequency(signal, sampling_rate)
            record[f"snr_{detector}"] = get_snr(signal)
            record[f"centralFreq_{detector}"] = get_central_frequency(signal, sampling_rate)
            record[f"bandwidth_{detector}"] = get_bandwidth(signal, sampling_rate)
        record["duration"] = duration
        record["label"] = label  # 0 = background, 1 = BBH, 2 = SGLF
        features.append(record)
    return features

# Apply feature extraction to all datasets
background_features = extract_features(background, label=0)
bbh_features = extract_features(bbh, label=1)
sglf_features = extract_features(sglf, label=2)

# Combine into a single DataFrame: one row per event, ordered
# background -> BBH -> SGLF, with the class stored in the "label" column.
features_df = pd.DataFrame(background_features + bbh_features + sglf_features)

# Display the first few rows of the feature DataFrame
features_df.head()


In [None]:
# Show only the rows extracted from the BBH dataset (label 1).
features_df.loc[features_df["label"] == 1]

(100000, 200, 2)