In [1]:

import numpy as np
import glob 
import pandas as pd
import h5py # import to read hdf5
from pathlib import Path
import pyfstat
from scipy import stats
import os
from joblib import Parallel, delayed
import shutil
from tqdm import tqdm
import sys 

22-12-28 17:49:14.715 pyfstat INFO    : Running PyFstat version 1.18.1+1.73ad1acd.clean


In [2]:
PREPROC_TRAIN_DATA = True

In [3]:


# root = "/Volumes/T7/gravitational-waves/kaggle-data"
root = "/media/viktor/T7/gravitational-waves-kaggle-2022"

def load_trained_files(train=True):
   if train:
      df = pd.read_csv(f'{root}/kaggle-data/train_labels.csv')
      df['filename'] = f'{root}/kaggle-data/train/' + df['id'].astype(str) + ".hdf5"
   else:
      test_files = glob.glob(f'{root}/kaggle-data/test/*.hdf5')
      df = pd.DataFrame({'filename': test_files})
      
   return df
print("[INFO] Loading files ...")
df = load_trained_files(train=PREPROC_TRAIN_DATA)
df


[INFO] Loading files ...


Unnamed: 0,id,target,filename
0,001121a05,1,/media/viktor/T7/gravitational-waves-kaggle-20...
1,004f23b2d,1,/media/viktor/T7/gravitational-waves-kaggle-20...
2,00a6db666,1,/media/viktor/T7/gravitational-waves-kaggle-20...
3,00f36a6ac,1,/media/viktor/T7/gravitational-waves-kaggle-20...
4,010a387db,1,/media/viktor/T7/gravitational-waves-kaggle-20...
...,...,...,...
598,fe38dbe64,1,/media/viktor/T7/gravitational-waves-kaggle-20...
599,feafd0d16,1,/media/viktor/T7/gravitational-waves-kaggle-20...
600,feeca844e,1,/media/viktor/T7/gravitational-waves-kaggle-20...
601,ff5ad023f,1,/media/viktor/T7/gravitational-waves-kaggle-20...


In [4]:
if PREPROC_TRAIN_DATA:
    df = df[df["target"] != -1].reset_index(drop=True)

In [5]:
df["name"] = df["filename"].apply(lambda x: x.split("/")[-1].split(".")[0])
df

Unnamed: 0,id,target,filename,name
0,001121a05,1,/media/viktor/T7/gravitational-waves-kaggle-20...,001121a05
1,004f23b2d,1,/media/viktor/T7/gravitational-waves-kaggle-20...,004f23b2d
2,00a6db666,1,/media/viktor/T7/gravitational-waves-kaggle-20...,00a6db666
3,00f36a6ac,1,/media/viktor/T7/gravitational-waves-kaggle-20...,00f36a6ac
4,010a387db,1,/media/viktor/T7/gravitational-waves-kaggle-20...,010a387db
...,...,...,...,...
595,fe38dbe64,1,/media/viktor/T7/gravitational-waves-kaggle-20...,fe38dbe64
596,feafd0d16,1,/media/viktor/T7/gravitational-waves-kaggle-20...,feafd0d16
597,feeca844e,1,/media/viktor/T7/gravitational-waves-kaggle-20...,feeca844e
598,ff5ad023f,1,/media/viktor/T7/gravitational-waves-kaggle-20...,ff5ad023f


In [6]:

# Idea from this function takes from this notebook (ðŸ˜‡): https://www.kaggle.com/code/ayuraj/g2net-understand-the-data
def read_data(file):
    file = Path(file)
    with h5py.File(file, "r") as f:
        filename = file.stem
        f = f[filename]
        h1 = f["H1"]
        l1 = f["L1"]
        freq_hz = list(f["frequency_Hz"])
        
        h1_stft = h1["SFTs"][()]
        h1_timestamp = h1["timestamps_GPS"][()]
        # H2 data
        l1_stft = l1["SFTs"][()]
        l1_timestamp = l1["timestamps_GPS"][()]
        
        return h1_stft, l1_stft

def preprocess_file(file):
    
    h1, l1 = read_data(file)
    
    amplitudes = {}
    amplitudes["H1"] = h1
    amplitudes["L1"] = l1

    def preprocess_amplitude(x):        
        
        x = x[:, 0:4096] 
        
        # Make x.real go from -1 to 1
        x.real = x.real / 1.E-25
        x.imag = x.imag / 1.E-25
        
        # IF X is too short, we pad it randomly gneerated values using np.random.randn of mu = np.mean(x) and sigma = np.std(x)
        if x.shape[1] < 4096:
            x = np.pad(x, ((0,0),(0,4096-x.shape[1])), 'constant')
                
        
        
        # # if the signal is too short, we pad it with zeros
        # if x.shape[1] < 4096:
        #     x = np.pad(x, ((0,0),(0,4096-x.shape[1])), 'constant')
        
        # x is currently of shape (360, 4096)
        x = x.reshape((360, 256,16))
        
        x = np.abs(x)
        
        # max pooling along the axis = 2
        x = np.mean(x, axis=2)
        # x is currently of shape (360, 256)
        
        
        
        # print("max of x", np.max(x))
        
        return np.squeeze(255 * x / np.max(x))
    
    amplitudes["H1"] = preprocess_amplitude(amplitudes["H1"])
    amplitudes["L1"] = preprocess_amplitude(amplitudes["L1"])
    
    # make amplitudes between 0 and 1
    amplitudes_h1 = amplitudes["H1"]
    amplitudes_l1 = amplitudes["L1"]
    
    # stack amplitudes from both detectors
    amplitudes = np.stack((amplitudes_h1, amplitudes_l1), axis=2)
    amplitudes = np.squeeze(amplitudes)
    
    return amplitudes  


In [7]:
# #Loop over all files and preprocess them. Save them to train/ if PREPROC_TRAIN_DATA is True, otherwise save them to test/
# if PREPROC_TRAIN_DATA:
#     output_dir = f"train"
# else:
#     output_dir = f"test"
    
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# for i, file in enumerate(tqdm(df["filename"])):
#     amplitudes = preprocess_file(file)
#     np.save(f"{output_dir}/{df['name'][i]}.npy", amplitudes)


# # Run the above for loop in parallel    
# # Parallel(n_jobs=8)(delayed(preprocess_file)(file) for file in tqdm(df["filename"]))

In [8]:
#Loop over all files and preprocess them. Save them to train/ if PREPROC_TRAIN_DATA is True, otherwise save them to test/
if PREPROC_TRAIN_DATA:
    output_dir = f"train"
else:
    output_dir = f"test"
    
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

def preproc_and_save(file, name):
    amplitudes = preprocess_file(file)
    np.save(f"{output_dir}/{name}.npy", amplitudes)


# Run the above for loop in parallel    
Parallel(n_jobs=14)(delayed(preproc_and_save)(file, name) for file, name in tqdm(zip(df["filename"].tolist(), df["name"].tolist())))

600it [00:38, 15.71it/s]


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,