In [2]:
!mamba env create --quiet -f '/home/jovyan/ak-retraining/environment.yml'

Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m

done
Installing pip dependencies: ...working... done


In [1]:
import seisbench
import seisbench.data as sbd
import seisbench.generate as sbg
import seisbench.models as sbm
from seisbench.util import worker_seeding

import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from obspy.clients.fdsn import Client
from obspy import UTCDateTime

In [2]:
# Inspect the local SeisBench cache: where it lives, what is at its top level,
# and which datasets have already been downloaded into it.
import os

cache_dir = seisbench.cache_root
print("Cache root:", cache_dir)
print("Contents:", os.listdir(cache_dir))
print("datasets:", os.listdir(cache_dir / "datasets"))

Cache root: /home/jovyan/.seisbench
Contents: ['datasets', '.seisbench', 'models', 'config.json']
datasets: ['iquique', 'dummydataset', 'stead']


In [3]:
# Load the Iquique benchmark dataset, requesting a 100 Hz sampling rate.
# NOTE(review): the next cell rebinds `data` to the dummy dataset, so only one of the two is active at a time.
data = sbd.Iquique(sampling_rate=100)



In [3]:
# Use the small dummy dataset (already present in the cache) for a quick look at the data layout.
data = sbd.DummyDataset()

#### We see that each entry contains the information for one P/S pick pair on one instrument (3 components), including the event origin information.

#### If there is no S pick, those columns are simply left as NaN.

In [4]:
# Display the metadata table: one row per trace, with source, station and trace columns.
data.metadata

Unnamed: 0,trace_start_time,source_latitude_deg,source_longitude_deg,source_depth_km,source_event_category,source_magnitude,source_magnitude_uncertainty,source_magnitude2,source_magnitude_uncertainty2,trace_name,...,station_latitude_deg,station_longitude_deg,station_elevation_m,source_magnitude_type,source_magnitude_type2,split,trace_name_original,trace_chunk,trace_sampling_rate_hz,trace_component_order
0,2007/01/01 01:42:45.08,-20.43802,-69.27681,83.18,ID,1.353,0.014,1.426,0.011,"bucket0$0,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,train,2007_01_01 01_42_45_08,,20,ZNE
1,2007/01/01 02:41:13.75,-21.64059,-68.41443,118.38,ID,1.981,0.020,2.027,0.023,"bucket0$1,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,train,2007_01_01 02_41_13_75,,20,ZNE
2,2007/01/01 03:50:29.27,-21.84637,-68.53904,111.82,ID,2.719,0.024,2.811,0.026,"bucket0$2,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,train,2007_01_01 03_50_29_27,,20,ZNE
3,2007/01/01 05:40:02.55,-21.23950,-70.05063,34.95,UP,2.169,0.020,2.269,0.020,"bucket0$3,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,train,2007_01_01 05_40_02_55,,20,ZNE
4,2007/01/01 05:52:18.43,-21.81511,-68.65773,106.69,ID,2.028,0.021,2.080,0.024,"bucket0$4,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,train,2007_01_01 05_52_18_43,,20,ZNE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2007/01/07 05:19:53.63,-21.02248,-68.96970,109.05,ID,1.464,0.016,1.513,0.021,"bucket2$25,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,test,2007_01_07 05_19_53_63,,20,ZNE
96,2007/01/07 05:25:44.43,-20.06507,-69.15500,93.64,ID,1.606,0.024,1.660,0.026,"bucket2$26,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,test,2007_01_07 05_25_44_43,,20,ZNE
97,2007/01/07 06:37:03.33,-21.13000,-68.90223,110.59,ID,1.530,0.012,1.574,0.013,"bucket2$27,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,test,2007_01_07 06_37_03_33,,20,ZNE
98,2007/01/07 06:39:20.08,-21.26966,-68.73751,120.06,ID,3.514,0.035,3.584,0.032,"bucket2$28,:3,:1200",...,-21.04323,-69.4874,900.0,MA,ML,test,2007_01_07 06_39_20_08,,20,ZNE


In [5]:
# Print every column of the first metadata row (the table view above elides the middle columns).
first_entry = data.metadata.iloc[0]
print(first_entry)

trace_start_time                 2007/01/01 01:42:45.08
source_latitude_deg                           -20.43802
source_longitude_deg                          -69.27681
source_depth_km                                   83.18
source_event_category                                ID
source_magnitude                                  1.353
source_magnitude_uncertainty                      0.014
source_magnitude2                                 1.426
source_magnitude_uncertainty2                     0.011
trace_name                           bucket0$0,:3,:1200
station_network_code                                 CX
station_code                                       PB01
station_type                                         BH
station_latitude_deg                          -21.04323
station_longitude_deg                          -69.4874
station_elevation_m                               900.0
source_magnitude_type                                MA
source_magnitude_type2                          

### So that's the metadata. How are the waveforms stored? In HDF5 (h5) files!

In [6]:
import h5py

# Build the waveform path from the SeisBench cache root instead of hard-coding a
# user-specific home directory, so the notebook also runs on other machines.
# Kept as a plain string because later cells pass `filename` to xarray as well.
filename = str(seisbench.cache_root / "datasets" / "dummydataset" / "waveforms.hdf5")

# Open read-only. The handle is deliberately left open because the following
# cells read from `f`; call `f.close()` once you are done exploring.
f = h5py.File(filename, 'r')

# Top-level groups in the file.
list(f.keys())

['data', 'data_format']

In [7]:
# The `data` group holds the waveform arrays, split across several "bucket" datasets.
# (Possibly grouped by trace length -- TODO confirm.)
dset = f['data']
for bucket in dset.values():
    print(bucket)

<HDF5 dataset "bucket0": shape (60, 3, 1200), type "<f8">
<HDF5 dataset "bucket1": shape (10, 3, 1200), type "<f8">
<HDF5 dataset "bucket2": shape (30, 3, 1200), type "<f8">


### We can grab a waveform by using the `trace_name` from the metadata. For example, `bucket0$0,:3,:1200` means dataset `bucket0`, indexed with `[0, :3, :1200]`:

In [8]:
# The first metadata row has trace_name 'bucket0$0,:3,:1200': the part before '$'
# is the dataset name inside the `data` group, the part after it is the index.
# Index from f['data'] directly rather than via `dset`, which a later cell rebinds
# to the `data_format` group (re-running this cell afterwards would otherwise fail).
tryit = f['data']['bucket0'][0, :3, :1200]
print(tryit)

[[  34.   47.   51. ...,  667.  695.  662.]
 [-100.  -99.  -98. ..., -229. -208. -201.]
 [ 645.  636.  625. ...,  420.  416.  374.]]


In [9]:
# Now look at the `data_format` group, which describes how the waveforms are stored.
# Note: this rebinds `dset` from the `data` group to the `data_format` group.
dset = f['data_format']
for key in dset.keys():
    print(dset[key])

<HDF5 dataset "component_order": shape (), type "|O">
<HDF5 dataset "dimension_order": shape (), type "|O">
<HDF5 dataset "instrument_response": shape (), type "|O">
<HDF5 dataset "measurement": shape (), type "|O">
<HDF5 dataset "sampling_rate": shape (), type "<i8">
<HDF5 dataset "unit": shape (), type "|O">


### There are some command-line tools that can help you get a first look at what's in an h5 file.

From the command line, run `h5dump -H /home/jovyan/.seisbench/datasets/iquique/waveforms.hdf5`.
This lists the header of the file, showing how its contents are split into groups, and then into datasets within those groups.

### Also, xarray is a really helpful module to sift through h5 files and see what's in there!

In [20]:
import xarray
import h5netcdf

### If the h5 file has multiple groups, we can't just open the whole thing; we have to open one group at a time. In the seisbench case, there are two groups: `data_format` and `data`.

In [26]:
# Open only the `data_format` group as an xarray Dataset, then show the whole
# dataset followed by its `measurement` variable.
ds = xarray.open_dataset(filename, group='data_format')
print(ds)
print(ds["measurement"])

<xarray.Dataset>
Dimensions:              ()
Data variables:
    component_order      object ...
    dimension_order      object ...
    instrument_response  object ...
    measurement          object ...
    sampling_rate        int64 ...
    unit                 object ...
<xarray.DataArray 'measurement' ()>
array('velocity',
      dtype='<U8')


In [21]:
# The `data` group carries no dimension names, so ask the backend to make up
# placeholder ("phony") names in order to open it.
ds = xarray.open_dataset(
    filename,
    group='data',
    phony_dims='access',
)
print(ds)

<xarray.Dataset>
Dimensions:  (phony_dim_0: 60, phony_dim_1: 3, phony_dim_2: 1200,
              phony_dim_3: 10, phony_dim_4: 30)
Dimensions without coordinates: phony_dim_0, phony_dim_1, phony_dim_2,
                                phony_dim_3, phony_dim_4
Data variables:
    bucket0  (phony_dim_0, phony_dim_1, phony_dim_2) float64 ...
    bucket1  (phony_dim_3, phony_dim_1, phony_dim_2) float64 ...
    bucket2  (phony_dim_4, phony_dim_1, phony_dim_2) float64 ...


#### We see that the stored waveform is very long. For the Iquique dataset, it looks like they started the waveform 2000 samples before the P pick, and ended it 12000 samples after the S arrival.
#### For a 100 Hz sampling rate, this is 20 seconds before P and 2 minutes (120 seconds) after S.
#### Note that not all windows have an S pick; for these traces, it looks like they cut the trace 12000 samples (2 minutes) after the P.