In [None]:
import numpy as np
import pandas as pd

In [None]:
from gwpy.table import GravitySpyTable

In [None]:
# Read the H1 and L1 O3b glitch CSV files into one GravitySpyTable.
# Both paths are handed to a single read() call as a list.
O3_data = GravitySpyTable.read([
    './datasets/GlitchesO3/H1_O3b.csv',
    './datasets/GlitchesO3/L1_O3b.csv',
])

In [None]:
# Sanity check on the loaded table: its length must equal the sum of the two
# expected per-file row counts.
# NOTE(review): the confident-subset sizes displayed further down (H1 < 99568 < L1)
# suggest 192693 is the L1 count and 99568 the H1 count — TODO confirm.
EXPECTED_N_ROWS = 192693 + 99568
assert len(O3_data) == EXPECTED_N_ROWS, (
    f'expected {EXPECTED_N_ROWS} rows in O3_data, found {len(O3_data)}'
)

In [None]:
# Lower bound applied to the `ml_confidence` column when selecting rows.
ml_confidence = 0.90
confidence_cut = f"ml_confidence>={ml_confidence}"

# One pandas DataFrame per detector, restricted to rows that pass the cut.
H1_confident = O3_data.filter("ifo==H1", confidence_cut).to_pandas()
L1_confident = O3_data.filter("ifo==L1", confidence_cut).to_pandas()

# Show how many rows survive for (H1, L1).
(len(H1_confident), len(L1_confident))

(82324, 137938)

In [None]:
# Distinct `ml_label` values found in the full (unfiltered) table, as a sorted list.
unique_labels = np.unique(O3_data['ml_label'])
labels = sorted(unique_labels)
labels

['1080Lines',
 '1400Ripples',
 'Air_Compressor',
 'Blip',
 'Blip_Low_Frequency',
 'Chirp',
 'Extremely_Loud',
 'Fast_Scattering',
 'Helix',
 'Koi_Fish',
 'Light_Modulation',
 'Low_Frequency_Burst',
 'Low_Frequency_Lines',
 'No_Glitch',
 'Paired_Doves',
 'Power_Line',
 'Repeating_Blips',
 'Scattered_Light',
 'Scratchy',
 'Tomte',
 'Violin_Mode',
 'Wandering_Line',
 'Whistle']

In [None]:
# Print, for every label, how many confident samples each detector has.
# NOTE(review): total_samples is created here but never filled in this cell;
# presumably a later cell uses it — TODO confirm or remove.
total_samples = {}

# Tally every label once per detector instead of building a boolean mask over
# the whole frame for each (label, detector) pair.
h1_counts = H1_confident['ml_label'].value_counts()
l1_counts = L1_confident['ml_label'].value_counts()

for idx, label in enumerate(labels):
    # `labels` comes from the unfiltered table, so a label may have no rows left
    # after the confidence cut; report 0 in that case.
    n_samples_H1 = int(h1_counts.get(label, 0))
    n_samples_L1 = int(l1_counts.get(label, 0))
    print(f'{idx:<2}: {label:<20}: {n_samples_L1:>5} : {n_samples_H1:>5} (L1/H1) samples')

0 : 1080Lines           :   518 :    80 (L1/H1) samples
1 : 1400Ripples         :    34 :    85 (L1/H1) samples
2 : Air_Compressor      :   978 :    39 (L1/H1) samples
3 : Blip                :  2564 :  3410 (L1/H1) samples
4 : Blip_Low_Frequency  :  8728 :  1928 (L1/H1) samples
5 : Chirp               :     6 :     4 (L1/H1) samples
6 : Extremely_Loud      :  3461 :  5604 (L1/H1) samples
7 : Fast_Scattering     : 36918 :  1304 (L1/H1) samples
8 : Helix               :    16 :     7 (L1/H1) samples
9 : Koi_Fish            :  3010 :  3710 (L1/H1) samples
10: Light_Modulation    :   123 :    20 (L1/H1) samples
11: Low_Frequency_Burst :  2918 :  1885 (L1/H1) samples
12: Low_Frequency_Lines :  3190 :  1018 (L1/H1) samples
13: No_Glitch           :  2085 :   573 (L1/H1) samples
14: Paired_Doves        :   130 :    22 (L1/H1) samples
15: Power_Line          :   254 :    87 (L1/H1) samples
16: Repeating_Blips     :   308 :   594 (L1/H1) samples
17: Scattered_Light     : 44682 : 58739 (L1/H1) 

In [None]:
# Build `meta_data`: up to N_IDS_PER_GROUP randomly chosen `gravityspy_id`s for
# every (label, detector) pair, one row per id with columns id / label / detector.
N_IDS_PER_GROUP = 5   # maximum number of ids drawn per (label, detector)
SAMPLE_SEED = 0       # fixed seed so a fresh Restart & Run All draws the same ids
sample_rng = np.random.default_rng(SAMPLE_SEED)

detectors = ['H1', 'L1']
detector_dfs = {'H1': H1_confident, 'L1': L1_confident}

# Collect rows in a plain list and build the DataFrame once at the end, rather
# than appending to the frame row by row.
records = []
for label in labels:
    for detector in detectors:
        detector_df = detector_dfs[detector]
        candidate_ids = detector_df.loc[
            detector_df['ml_label'] == label, 'gravityspy_id'
        ].to_numpy()
        if len(candidate_ids) > N_IDS_PER_GROUP:
            # Enough candidates: draw without replacement.
            ids = sample_rng.choice(candidate_ids, size=N_IDS_PER_GROUP, replace=False)
        else:
            # Too few (possibly zero) candidates: keep them all. This is an explicit
            # branch instead of catching the ValueError raised by the sampler.
            ids = candidate_ids
        records.extend(
            {'id': id_, 'label': label, 'detector': detector} for id_ in ids
        )

# Passing `columns` keeps the three expected columns even when no row was collected.
meta_data = pd.DataFrame(records, columns=['id', 'label', 'detector'])
meta_data

Unnamed: 0,id,label,detector
0,4OtG8h8wwc,1080Lines,H1
1,ZV2x9fosuY,1080Lines,H1
2,sHNdaM7ELO,1080Lines,H1
3,diwmOJX3qG,1080Lines,H1
4,rjvZI2ebvG,1080Lines,H1
...,...,...,...
220,H23FqDLgaQ,Whistle,L1
221,HKJ9M8VytN,Whistle,L1
222,IAjGDHqPDT,Whistle,L1
223,dkIL7Jw0bC,Whistle,L1
