In [None]:
import os

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from obspy import UTCDateTime
from obspy.clients.fdsn import Client

In [None]:
# Base folder of the STEAD data and the number of chunk files to reference.
root = 'STEAD/'
num_chunks = 2

# Chunks are numbered from 1; chunk k lives at <root>chunk<k>/chunk<k>.hdf5.
chunk_ids = range(1, num_chunks + 1)
chunks = [root + f'chunk{chunk}/chunk{chunk}.hdf5' for chunk in chunk_ids]

In [None]:
from torch.utils.data import Dataset, DataLoader
class SteadDataset(Dataset):
    """Map-style dataset over one or more STEAD HDF5 chunk files.

    Every chunk is an HDF5 file with a sibling CSV (same path, ``.csv``
    extension) whose ``trace_name`` column lists the datasets stored under the
    ``data/`` group. Items are addressed through one global index that runs
    through the chunks in the order they were given.

    Parameters
    ----------
    chunk_files : list of str
        Paths of the HDF5 chunk files.
    channel_first : bool
        When True the stored 2-D trace is transposed with ``transpose(1, 0)``
        before being returned (presumably (time, channel) -> (channel, time);
        confirm against the stored layout).

    Attributes
    ----------
    files : list
        Open ``h5py.File`` handles, one per chunk. Call ``close()`` when done.
    event_lists : list of list of str
        Trace names of each chunk, in CSV order.
    stopping_indices : numpy.ndarray
        Cumulative number of traces; entry ``k`` is the first global index that
        no longer belongs to chunk ``k``.
    """

    def __init__(self, chunk_files, channel_first):
        self.files = []
        self.event_lists = []
        counts = []
        for chunk in chunk_files:
            # Swap only the file extension; a plain str.replace would also
            # rewrite any 'hdf5' that appears in a directory name.
            metadata = pd.read_csv(os.path.splitext(chunk)[0] + '.csv')
            ev_list = metadata['trace_name'].astype('str').to_list()
            self.files.append(h5py.File(chunk, 'r'))
            self.event_lists.append(ev_list)
            counts.append(len(ev_list))
        # Always an integer array, even when no chunk was given.
        self.stopping_indices = np.cumsum(counts, dtype=np.int64)
        self.channel_first = channel_first

    def __len__(self):
        """Total number of traces across all chunks."""
        if len(self.stopping_indices) == 0:
            return 0
        return int(self.stopping_indices[-1])

    def close(self):
        """Close every HDF5 handle opened by the constructor."""
        for file in self.files:
            file.close()

    @staticmethod
    def _attr_to_float(value):
        """Convert a pick attribute to a Python float.

        Empty strings (the value compared against ``''`` for missing picks)
        become NaN; numeric scalars, size-1 arrays and numeric strings are
        converted to their float value.
        """
        if isinstance(value, bytes):
            value = value.decode()
        if isinstance(value, str) and value.strip() == '':
            return float('nan')
        return float(np.asarray(value).item())

    def __getitem__(self, idx):
        """Return ``(trace, p_arrival, s_arrival, coda_end)`` for a global index.

        Negative indices count from the end. The three pick values are Python
        floats, NaN when the corresponding attribute is empty.

        Raises
        ------
        IndexError
            If ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError(f"index {idx} out of range for dataset of size {total}")

        # First chunk whose cumulative count exceeds idx.
        chunk_idx = int(np.searchsorted(self.stopping_indices, idx, side='right'))
        chunk_start = int(self.stopping_indices[chunk_idx - 1]) if chunk_idx > 0 else 0
        event_name = self.event_lists[chunk_idx][idx - chunk_start]

        entry = self.files[chunk_idx].get('data/' + event_name)
        trace = np.array(entry)
        p_arrival = self._attr_to_float(entry.attrs['p_arrival_sample'])
        s_arrival = self._attr_to_float(entry.attrs['s_arrival_sample'])
        coda_end = self._attr_to_float(entry.attrs['coda_end_sample'])

        if self.channel_first:
            trace = trace.transpose(1, 0)
        return trace, p_arrival, s_arrival, coda_end

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#time is the last dimension
class ConvFeatureEncoder(nn.Module):
    """Sequence of ``Conv1d`` + ``GELU`` stages.

    One stage is created per ``(kernel_size, stride, padding)`` triple. The
    first convolution maps ``in_ch`` channels to ``dim``; every later one maps
    ``dim`` to ``dim``. Stages are registered in ``self.net`` under the names
    ``conv_<i>`` and ``gelu_<i>``.
    """

    def __init__(self, in_ch, dim, kernel_sizes, strides, paddings):
        super().__init__()
        self.net = nn.Sequential()
        stage_channels = in_ch
        stage_specs = zip(kernel_sizes, strides, paddings)
        for index, (kernel, stride, pad) in enumerate(stage_specs):
            self.net.add_module(
                f"conv_{index}",
                nn.Conv1d(stage_channels, dim, kernel_size=kernel, stride=stride, padding=pad),
            )
            self.net.add_module(f"gelu_{index}", nn.GELU())
            # Every stage after the first consumes the previous stage's output.
            stage_channels = dim

    def forward(self, x):
        """Run ``x`` through all stages (time is the last dimension)."""
        encoded = self.net(x)
        return encoded

#time is the last dimension
class ConvPositionalEncoding(nn.Module):
    """Residual depthwise-convolution positional encoding.

    A per-channel (``groups=channels``) ``Conv1d`` is applied along the last
    dimension and its output is added to the input.

    Parameters
    ----------
    channels : int
        Number of input (and output) channels.
    kernel_size : int, default 3
        Width of the depthwise kernel. Odd and even sizes are both supported.
    """

    def __init__(self, channels, kernel_size=3):
        super().__init__()
        self.conv = nn.Conv1d(
            channels, channels,
            kernel_size=kernel_size,
            groups=channels,
            padding=kernel_size // 2,
            bias=True
        )

    def forward(self, x):
        """Return ``x + conv(x)`` with the same shape as ``x``."""
        positional = self.conv(x)
        # With padding = kernel_size // 2 an even kernel yields one extra step;
        # drop it so the residual sum is defined. Odd kernels are unaffected.
        positional = positional[..., : x.size(-1)]
        return x + positional
# channel is the last dimension
class ContextEncoder(nn.Module):
    """Thin wrapper around ``nn.TransformerEncoder``.

    The encoder stacks ``n_layers`` ``nn.TransformerEncoderLayer`` blocks built
    with GELU activation and ``batch_first=True``, so inputs carry the channel
    (model) dimension last.
    """

    def __init__(self, dim, n_layers, n_heads, ffn_dim, dropout):
        super().__init__()
        layer_template = nn.TransformerEncoderLayer(
            d_model=dim,
            nhead=n_heads,
            dim_feedforward=ffn_dim,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer_template, num_layers=n_layers)

    def forward(self, x):
        """Apply the transformer stack to ``x`` and return its output."""
        return self.transformer(x)
    
class MaskedEncoderModel(nn.Module):
    """Convolutional feature encoder plus transformer with a masked-prediction mode.

    The input is encoded by ``ConvFeatureEncoder``, optionally passed through a
    ``ConvPositionalEncoding`` (``use_cpe``), transposed so that channels are
    last, and fed to a ``ContextEncoder``. In masked mode one randomly chosen
    time step per sample is replaced by a learned ``mask_embedding`` before the
    transformer, and a two-layer head produces predictions for every step.
    """

    def __init__(
        self,
        in_ch,
        dim,
        feature_enc_kernel_sizes,
        feature_enc_strides,
        feature_enc_paddings,
        context_n_layers,
        context_n_heads,
        context_ffn_dim,
        context_dropout=0.1,
        use_cpe=True
    ):
        super().__init__()

        # Sub-modules are created in a fixed order so parameter initialisation
        # and registration order stay stable.
        self.feature_encoder = ConvFeatureEncoder(
            in_ch=in_ch,
            dim=dim,
            kernel_sizes=feature_enc_kernel_sizes,
            strides=feature_enc_strides,
            paddings=feature_enc_paddings,
        )

        self.use_cpe = use_cpe
        if self.use_cpe:
            self.cpe = ConvPositionalEncoding(dim, kernel_size=3)

        self.context_encoder = ContextEncoder(
            dim=dim,
            n_layers=context_n_layers,
            n_heads=context_n_heads,
            ffn_dim=context_ffn_dim,
            dropout=context_dropout,
        )

        # Learned vector substituted for masked time steps, drawn from N(0, 0.02^2).
        mask_vector = torch.zeros(dim)
        nn.init.normal_(mask_vector, mean=0.0, std=0.02)
        self.mask_embedding = nn.Parameter(mask_vector)

        self.pred_head = nn.Sequential(
            nn.Linear(dim, dim),
            nn.GELU(),
            nn.Linear(dim, dim),
        )

    def forward(self, x, run_with_mask):
        """Encode ``x``; optionally mask one step per sample and predict it.

        Parameters
        ----------
        x : torch.Tensor
            Input of shape (B, in_ch, T_raw).
        run_with_mask : bool
            Selects masked-prediction mode (True) or plain encoding (False).

        Returns
        -------
        tuple or torch.Tensor
            With ``run_with_mask``: ``(preds, feats_t, mask_bool)`` where
            ``preds`` and ``feats_t`` are (B, T_enc, dim) and ``mask_bool`` is a
            (B, T_enc) boolean tensor with exactly one True per row. Otherwise
            the transformer output of shape (B, T_enc, dim).
        """
        # (B, in_ch, T_raw) -> (B, dim, T_enc), plus optional positional term.
        encoded = self.feature_encoder(x)
        if self.use_cpe:
            encoded = self.cpe(encoded)

        # Transformer expects (B, T_enc, dim); these are also the targets.
        feats_t = encoded.transpose(1, 2)

        if not run_with_mask:
            return self.context_encoder(feats_t)

        n_samples, n_steps = feats_t.size(0), feats_t.size(1)
        masked_input = feats_t.clone()
        mask_bool = torch.zeros_like(feats_t[:, :, 0], dtype=torch.bool)  # (B, T_enc)

        # Pick one random time step per sample and mark it as masked.
        sample_rows = torch.arange(0, n_samples)
        masked_steps = torch.randint(0, n_steps, (n_samples,))
        mask_bool[sample_rows, masked_steps] = True

        masked_input[mask_bool] = self.mask_embedding
        context = self.context_encoder(masked_input)  # (B, T_enc, dim)
        preds = self.pred_head(context)                # (B, T_enc, dim)
        return preds, feats_t, mask_bool
        

class EncodedSegmentation(nn.Module):
    """Upsample encoded features back to sample resolution and emit one logit per step.

    The upsampler is a chain of ``ConvTranspose1d`` + ``GELU`` stages (one per
    ``(kernel_size, stride, padding)`` triple, each with
    ``output_padding = stride - 1``). The length after these stages is tracked
    from ``input_timesteps``; if it differs from ``output_timesteps`` the
    sequence is either shortened by a 2-tap ``Conv1d`` whose dilation equals
    the surplus (followed by ``GELU``) or right-padded with zeros. A final 1x1
    convolution reduces the channels to a single output.
    """

    def __init__(self, input_dim, input_timesteps,output_timesteps, kernel_sizes, strides, paddings):
        super().__init__()
        self.input_dim = input_dim
        self.input_timesteps = input_timesteps
        self.output_timesteps = output_timesteps

        self.up_sampler = nn.Sequential()
        length = input_timesteps
        for stage, (kernel, stride, pad) in enumerate(zip(kernel_sizes, strides, paddings)):
            extra = stride - 1
            self.up_sampler.add_module(
                f"conv_transpose_{stage}",
                nn.ConvTranspose1d(
                    in_channels=input_dim,
                    out_channels=input_dim,
                    kernel_size=kernel,
                    stride=stride,
                    padding=pad,
                    output_padding=extra,
                ),
            )
            self.up_sampler.add_module(f"gelu_{stage}", nn.GELU())
            # Transposed-convolution output length for dilation 1.
            length = (length - 1) * stride - 2 * pad + (kernel - 1) + extra + 1

        surplus = length - output_timesteps
        if surplus > 0:
            # A kernel-2 convolution with dilation d removes exactly d steps.
            self.up_sampler.add_module(
                "crop_conv",
                nn.Conv1d(
                    in_channels=input_dim,
                    out_channels=input_dim,
                    kernel_size=2,
                    stride=1,
                    padding=0,
                    dilation=surplus,
                ),
            )
            self.up_sampler.add_module("crop_gelu", nn.GELU())
        elif surplus < 0:
            # Too short: append zeros on the right up to the requested length.
            self.up_sampler.add_module("pad_layer", nn.ConstantPad1d((0, -surplus), 0.0))

        self.segmentation_head = nn.Sequential(
            nn.Conv1d(
                in_channels=input_dim,
                out_channels=1,
                kernel_size=1,
                stride=1,
                padding=0,
            )
        )

    def forward(self, x):
        """Return the per-step output with the singleton channel axis squeezed out."""
        logits = self.segmentation_head(self.up_sampler(x))
        return logits.squeeze(1)

In [None]:
# Build the dataset from every chunk file except the first one (chunks[1:]),
# returning traces with the channel axis first.
# NOTE(review): presumably the first chunk is skipped on purpose (e.g. because
# its traces carry no picks) — confirm.
dataset = SteadDataset(chunks[1:] , channel_first=True)

In [None]:
# --- Training hyper-parameters -------------------------------------------
input_dim = 3
num_epochs = 1
batch_size = 32
learning_rate = 0.001
p_mask = 0.01

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0) # must be 0 for hdf5

# --- Convolutional front-end geometry (one entry per stage) --------------
feature_enc_kernel_sizes = [10, 8, 4]
feature_enc_strides = [5, 4, 2]
feature_enc_paddings = [5, 4, 2]

# --- Encoder model --------------------------------------------------------
encoder_settings = dict(
    in_ch=input_dim,
    dim=256,
    feature_enc_kernel_sizes=feature_enc_kernel_sizes,
    feature_enc_strides=feature_enc_strides,
    feature_enc_paddings=feature_enc_paddings,
    context_n_layers=4,
    context_n_heads=8,
    context_ffn_dim=512,
    context_dropout=0.1,
    use_cpe=True,
)
model = MaskedEncoderModel(**encoder_settings)

In [None]:
# Self-supervised pretraining: predict the encoded feature of the masked time
# step from its context and regress it with an MSE loss.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Select training mode explicitly so dropout is active even when this cell is
# re-run after the model has been switched to eval mode elsewhere.
model.train()

# NOTE(review): the regression targets (feats_t) come from the same encoder
# that is being optimised and are not detached, so gradients also flow through
# the targets. Confirm this is intended; a stop-gradient on the targets is the
# usual guard against trivial constant solutions.
for epoch in range(num_epochs):
    for i, (traces, p_arrivals, s_arrivals, coda_ends) in enumerate(dataloader):
        # Per-trace, per-channel standardisation along the time axis; the small
        # constant keeps the division finite for constant channels.
        traces_mean = traces.mean(dim=2, keepdim=True)
        traces_std = traces.std(dim=2, keepdim=True) + 1e-9
        normalized_traces = (traces - traces_mean) / traces_std  # normalize input traces

        optimizer.zero_grad()
        preds, feats_t, mask_bool = model(normalized_traces, run_with_mask=True)

        # Only the masked positions contribute to the loss.
        masked_preds = preds[mask_bool]
        masked_targets = feats_t[mask_bool]
        loss = F.mse_loss(masked_preds, masked_targets)
        loss.backward()
        optimizer.step()
        print(loss.item())

In [None]:
# Train a segmentation head on top of the frozen encoder: the target is 1 for
# samples between the P arrival (floored) and the S arrival (ceiled), else 0.
seg_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0) # must be 0 for hdf5

# The upsampler mirrors the encoder stages in reverse order. Slicing keeps the
# values as plain Python ints instead of NumPy scalars.
segmentation_model = EncodedSegmentation(
    input_dim=256,
    input_timesteps=151,
    output_timesteps=6000,
    kernel_sizes=feature_enc_kernel_sizes[::-1],
    strides=feature_enc_strides[::-1],
    paddings=feature_enc_paddings[::-1],
)
optimizer_seg = optim.Adam(segmentation_model.parameters(), lr=learning_rate, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()

# The encoder is only used for feature extraction here: eval mode disables its
# dropout so the features are deterministic.
model.eval()
segmentation_model.train()

for epoch in range(num_epochs):
    # Iterate the loader created for this stage (previously the pretraining
    # loader was used and seg_dataloader was left unused).
    for i, (traces, p_arrivals, s_arrivals, coda_ends) in enumerate(seg_dataloader):
        traces_mean = traces.mean(dim=2, keepdim=True)
        traces_std = traces.std(dim=2, keepdim=True) + 1e-9
        normalized_traces = (traces - traces_mean) / traces_std  # normalize input traces

        with torch.no_grad():
            encoded_feats = model(normalized_traces, run_with_mask=False)  # (B, T_enc, dim)
            encoded_feats_t = encoded_feats.transpose(1, 2)  # (B, dim, T_enc)

        optimizer_seg.zero_grad()
        seg_outputs = segmentation_model(encoded_feats_t)  # (B, T_out)

        # Missing picks (NaN) must not be cast to integers; replace both ends
        # by 0 so the resulting window is empty and the label is all zeros.
        p_samples = torch.as_tensor(p_arrivals, dtype=torch.float64)
        s_samples = torch.as_tensor(s_arrivals, dtype=torch.float64)
        has_picks = torch.isfinite(p_samples) & torch.isfinite(s_samples)
        no_pick = torch.zeros_like(p_samples)
        initial_indices = torch.floor(torch.where(has_picks, p_samples, no_pick)).long()
        final_indices = torch.ceil(torch.where(has_picks, s_samples, no_pick)).long()

        positions = torch.arange(seg_outputs.size(1)).unsqueeze(0).expand(seg_outputs.size(0), -1)  # [0..T-1]
        labels = (positions >= initial_indices.unsqueeze(1)) & (positions < final_indices.unsqueeze(1))

        loss = criterion(seg_outputs, labels.float())
        loss.backward()
        optimizer_seg.step()
        print(loss.item())

# OBSPY

In [None]:
# Station selection: longitude/latitude bounding box, BHZ channel only, with
# metadata requested down to the response level.
station_query = {
    "minlongitude": 6.0,
    "maxlongitude": 19.0,
    "minlatitude": 35.0,
    "maxlatitude": 47.0,
    "channel": "BHZ",
    "level": "response",
}

# FDSN client for the IRIS data centre; reused later to download waveforms.
client = Client("IRIS")
inventory = client.get_stations(**station_query)

In [None]:
from obspy import UTCDateTime
def extract_traces(inventory, t0, duration_seconds, sampling_rate):
    """Download one BHZ waveform per station and return the pre-processed samples.

    For every station of ``inventory`` whose operating period covers the
    window ``[t0, t0 + duration_seconds]`` a waveform is requested from the
    module-level FDSN ``client`` (any location code). The first trace of the
    resulting stream is linearly detrended, tapered (5 % Hann), band-pass
    filtered between 2 Hz and ``min(10 Hz, 0.9 * Nyquist)`` and resampled to
    ``sampling_rate``. The instrument response is not removed.

    This is a best-effort download: stations whose request fails, or that
    return no trace, are skipped silently.

    Parameters
    ----------
    inventory : iterable of networks
        Each network exposes ``code`` and iterates over stations exposing
        ``code``, ``start_date`` and ``end_date``.
    t0 : UTCDateTime
        Start of the time window.
    duration_seconds : float
        Length of the time window, added to ``t0``.
    sampling_rate : float
        Target sampling rate passed to ``Trace.resample``.

    Returns
    -------
    list of numpy.ndarray
        One float32 array per station that yielded data, in inventory order.
        The arrays are not guaranteed to share the same length.
    """
    t1 = t0 + duration_seconds
    traces = []
    for net in inventory:
        for sta in net:

            # Operating-period check: skip stations that started after the
            # window begins or stopped before it ends.
            if sta.start_date and sta.start_date > t0:
                continue
            if sta.end_date and sta.end_date < t1:
                continue

            try:
                # Wildcard on the location code.
                st = client.get_waveforms(
                    net.code,
                    sta.code,
                    "*",
                    "BHZ",
                    t0,
                    t1,
                    attach_response=True
                )

                # If several location codes come back keep a single one,
                # preferring "00" when it is available.
                locations = {tr.stats.location for tr in st}
                if len(locations) > 1:
                    if "00" in locations:
                        st = st.select(location="00")
                    else:
                        st = st[:1]  # just take the first one

            except Exception:
                # Best effort: a failed request only drops this station.
                continue

            # A successful request can still yield an empty stream; without
            # this guard st[0] below would raise IndexError.
            if len(st) == 0:
                continue

            trp = st[0].copy()

            trp.detrend("linear")
            trp.taper(max_percentage=0.05, type="hann")  # 5% cosine taper at both ends

            # Keep the upper corner below Nyquist for low-rate channels.
            f_nyq = trp.stats.sampling_rate / 2
            freqmax = min(10, 0.9 * f_nyq)  # 90% of Nyquist
            trp.filter("bandpass", freqmin=2.0, freqmax=freqmax, corners=4, zerophase=True)
            trp.resample(sampling_rate)

            x = trp.data.astype(np.float32)
            traces.append(x)
    return traces

In [None]:
# Download window: `duration` is passed to extract_traces as its
# duration_seconds argument, so it is expressed in seconds.
duration = 60*10
sampling_rate = 50.0  # Hz
t0 = UTCDateTime("2016-08-24T01:35:00")
# End of the window; extract_traces recomputes it from t0 and duration, so
# this value is not used by the call below.
t1 = t0 + duration
# One pre-processed array per station that returned data.
traces = extract_traces(inventory, t0, duration, sampling_rate)

In [None]:
def two_timeseries_lagged_infos(x1, x2, window_size, lag):
    """Sliding-window covariance between ``x1`` and a lagged copy of ``x2``.

    For every window start ``s`` the window ``x1[s : s + window_size]`` is
    paired with ``x2[s + lag : s + lag + window_size]``. Only starts for which
    both windows lie entirely inside the series are used, for positive and
    negative lags alike.

    Parameters
    ----------
    x1, x2 : 1D numpy arrays of the same length
    window_size : int
        Size of the moving window (in samples).
    lag : int
        Lag applied to ``x2`` relative to ``x1`` (in samples); may be negative.

    Returns
    -------
    list of numpy.ndarray
        One 2x2 covariance matrix (``np.cov``) per valid window, ordered by
        window start. Empty when no window fits.
    """
    n = len(x1)
    assert len(x2) == n, "Time series must have the same length"
    assert abs(lag) < n, "Lag must be less than the length of the time series"

    # Valid starts keep both slices in bounds. Clamping the lower bound matters
    # for negative lags: a negative slice index would otherwise wrap around and
    # silently pair the window with samples from the end of x2.
    first_start = max(0, -lag)
    last_start = min(n - window_size, n - window_size - lag)

    infos = []
    for start in range(first_start, last_start + 1):
        end = start + window_size
        segment1 = x1[start:end]
        segment2 = x2[start + lag:end + lag]
        infos.append(np.cov(segment1, segment2))
    return infos

In [None]:
# Sliding-window statistics between two station traces: 10 s windows, with the
# second trace lagged by 3 s.
window_size = int(10.0 * sampling_rate)
lag = int(3.0 * sampling_rate)

# two_timeseries_lagged_infos asserts equal lengths, which extract_traces does
# not guarantee; align both traces on their common leading part.
series_a, series_b = traces[0], traces[3]
common_length = min(len(series_a), len(series_b))
series_a, series_b = series_a[:common_length], series_b[:common_length]

results= []
variance_of_mutuals = []
mutuals = []
for info in two_timeseries_lagged_infos(series_a, series_b, window_size, lag):
    # Squared correlation coefficient: cov^2 / (var_1 * var_2).
    variance_of_mutuals.append(info[0,1]**2 / (info[0,0]*info[1,1]))
    # Mutual information of a bivariate Gaussian with that correlation.
    mutuals.append(-0.5*np.log(1-variance_of_mutuals[-1]))
    # Eigenvalues of the 2x2 covariance matrix, in ascending order.
    vals = np.linalg.eigvalsh(info)
    results.append(vals)
results = np.array(results)
variance_of_mutuals = np.array(variance_of_mutuals)
mutuals = np.array(mutuals)

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))

# Left: ratio of the squared correlation to the Gaussian mutual information,
# one value per sliding window.
axs[0].plot(variance_of_mutuals/mutuals)
axs[0].set(
    title="Squared correlation / mutual information",
    xlabel="window index",
    ylabel="ratio",
)

# Right: the two traces the statistics were computed from (traces[0] and
# traces[3]; the second curve previously showed traces[1]).
axs[1].plot(traces[0], color='red', alpha=0.5, label='traces[0]')
axs[1].plot(traces[3], color='blue', alpha=0.5, label='traces[3]')
axs[1].set(title="Pre-processed traces", xlabel="sample", ylabel="amplitude")
axs[1].legend()

fig.tight_layout()
plt.show()