# Download, interpolate and dump GPS data

e.g. will build a file  `velocity_cat=custom_ASlag=45_MS=5.5.npy`

## Basic Utils (MANDATORY)

In [None]:
import seismicutils
from seismicutils import SeismicUtils
import numpy as np
import pandas as pd
from importlib import reload
import matplotlib.pyplot as plt
if(True):
    reload(seismicutils)
from sklearn.metrics.pairwise import haversine_distances
import os
import networkx as nx
from mpl_toolkits.axes_grid1 import make_axes_locatable
# import warnings

from numba import njit

In [None]:
# Show every row when a DataFrame is displayed (no truncation).
# To show every column as well: pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Specify the folder where to put csv files of GPS data

In [None]:
# Folder holding one cached CSV file of GPS data per station.
# Alternative locations used on other machines:
#rootpath="/home/flandes/csv/"
#rootpath="D:/gps/"
rootpath="/Users/vincenzo/gps_csv/"
# makedirs (unlike mkdir) also creates missing parent folders, and
# exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs(rootpath, exist_ok=True)

## Main body

### Catalog loading

In [None]:
## Geometry (lengths in km)
R0 = 6371        # Earth's radius
R_search = 300   # radius around a mainshock in which we look for GPS stations

## Catalog selection
min_mainshock_mag = 6  # minimal mainshock (MS) magnitude
# we predict aftershocks up to 45 days after the MS
aftershocks_time_window = np.timedelta64(45, 'D')

In [None]:
# Catalog file to read (loaded with pandas as a space-separated CSV).
catalog_filename = "custom_catalog.csv"
# Tag used only to build the output file name.
# The original note lists 'custom' or 'giuseppe' as valid values -- presumably for this tag.
catalog_type = "custom"

In [None]:
## GPS window around the mainshock (MS) day
n_days_backward = np.timedelta64(8, 'D')  # days of GPS data used before the MS
n_days_forward = np.timedelta64(1, 'D')   # days of GPS data used after the MS
## number of daily samples per station: days before + the MS day + days after
## (8 + 1 + 1 = 10 with the values above)
n_total_days = n_days_backward.astype('int') + 1 + n_days_forward.astype('int')
n_total_days

In [None]:
# Region table returned by the project helper module.
# Further down, regions['japan'] is indexed with entries 0-1 compared against
# latitude and entries 2-3 compared against longitude.
regions = seismicutils.return_regions() 

In [None]:
# Read the space-separated event catalog; 'datetime' is parsed as timestamps.
fit_catalog = pd.read_csv(catalog_filename, sep=" ", parse_dates=['datetime'])
# Event time truncated to day resolution.
fit_catalog['day'] = fit_catalog['datetime'].values.astype('datetime64[D]')
# Chronological order, with rows renumbered from 0.
fit_catalog = fit_catalog.sort_values(by='day').reset_index(drop=True)

In [None]:
#mainshocks = fit_catalog[fit_catalog.type==1]
#mainshocks.reset_index(inplace=True, drop=True)
#drop_ids = set()
#for i in range(0, len(mainshocks)):
#    if(mainshocks.iloc[i].seq_id in drop_ids):
#        continue
#    for j in range(i+1, len(mainshocks)):
#        delta_days = (mainshocks.iloc[j].day.to_numpy() - mainshocks.iloc[i].day.to_numpy()).astype('timedelta64[D]')
#        if(delta_days > aftershocks_time_window):
#            break
#        arg1 = np.radians(np.array([mainshocks.iloc[i].lat, mainshocks.iloc[i].lon]))[None,:]
#        arg2 = np.radians(np.array([mainshocks.iloc[j].lat, mainshocks.iloc[j].lon]))[None,:]
#        d_ij = R0*haversine_distances(arg1, arg2)[0,0]
#        if(d_ij < R_search):
#            print(mainshocks.iloc[j].seq_id,delta_days, d_ij, mainshocks.iloc[j].day.to_numpy())
#            drop_ids.add(mainshocks.iloc[j].seq_id)
#print('TO DROP SZ:', len(drop_ids))

### GPS stations and Mainshocks (TO BE RUN ONLY THE FIRST TIME, OR IF SOMETHING CHANGED)

#### Load the correct catalog (if the previous cell has run, it is already saved locally)

This is useful if we previously ran and saved the catalog part of the notebook.

#### Download and plot stations (spatial distribution)
We first download the list of all stations, then plot the distribution of the minimal station-to-station distance 

In [None]:
# Station table obtained from the project helper (columns used in this notebook:
# name, lat, lon, begin, end).
ngl_list = SeismicUtils.get_ngl_stations(post_process=True)
plot_station_distribution = True
if plot_station_distribution:
    # Restrict to the 'japan' bounding box: entries 0-1 bound the latitude,
    # entries 2-3 bound the longitude.
    box = regions['japan']
    inside_lat = (ngl_list.lat >= box[0]) & (ngl_list.lat <= box[1])
    inside_lon = (ngl_list.lon >= box[2]) & (ngl_list.lon <= box[3])
    jlist = ngl_list[inside_lat & inside_lon]
    # Pairwise great-circle distances between the selected stations (unit sphere).
    jlist_distances = haversine_distances(np.radians(jlist[['lat','lon']].values))
    # Sorting each column puts the smallest entry (normally the zero self-distance)
    # in row 0, so row 1 is the distance to the closest other station.
    # R0 scales the unit-sphere distance to km.
    nearest_km = R0*np.sort(jlist_distances, axis=0)[1:2,:].mean(axis=0)
    plt.hist(nearest_km, bins=np.linspace(0, 1e2))
    plt.xlabel('Minimal distance $d$ (km)')
    plt.ylabel('$P(d)$')
    plt.show()

### Conclusion:

Most stations have a neighboring station within a distance of less than 20 km. 

So, $\sigma_{interpolation}$ can be of the order of 20 km. 

For extrapolation (i.e. the choice of `alpha_max_dist`), one should take something of the order of 20 km or a bit more (a bit more than $\sigma_{interpolation}$ for sure), but not much more than 20 km.

#### For each Mainshock, we find all stations around in a large radius (much larger than our future box). 

We discard MS that have too few stations (but not in a very restrictive way, for now)

"*Actual selection of mainshocks vs stations collection*"

In [None]:
## we ask for min_station_number to be there, but in a very large radius !
## so we may further filter later
min_station_number = 3
stations_to_download = set()  # names of all stations needed by at least one mainshock
mainshock_stations = {}       # seq_id -> list of station names around that mainshock
mainshock_day = {}            # seq_id -> day of the mainshock (datetime64[D])
# Station coordinates in radians do not depend on the sequence: compute them once.
stations_rad = np.radians(ngl_list[["lat","lon"]].values)
for seq_id, seq in fit_catalog.groupby('seq_id'):
    if len(seq) <= 1:
        continue
    mainshock = seq[seq['type'] == 1] ## finding one MS
    if mainshock.empty:
        # No event flagged as mainshock (type == 1) in this sequence: report it
        # as a failure instead of raising IndexError on .values[0] below.
        print('Failed: ', seq_id, 0, )
        continue
    ms_day = mainshock.day.values[0]
    # Magnitude-dependent radius (km), never smaller than R_search.
    # NOTE(review): 10**(mag/2 - 0.79) is presumably an empirical scaling law -- confirm its source.
    expected_maximal_radius = max(10**(mainshock.mag.values[0]/2 - 0.79), R_search) ## still a radius
    ms_rad = np.radians(mainshock[["lat","lon"]].values)
    # Column 0 = distance to the first mainshock row (unit sphere; multiplied by R0 below).
    stations_to_ms_dist = haversine_distances(stations_rad, ms_rad)[:,0]
    within_radius = R0*stations_to_ms_dist <= expected_maximal_radius
    ## discard stations that are not yet born or that have been terminated
    operating = (ngl_list.begin.values <= ms_day) & (ngl_list.end.values >= ms_day)
    valid_stations = ngl_list.name.values[within_radius & operating]

    if len(valid_stations) >= min_station_number:
        print('Success: ', seq_id, len(valid_stations), )
        stations_to_download.update(valid_stations)
        mainshock_stations[seq_id] = list(valid_stations)
        mainshock_day[seq_id] = ms_day.astype('datetime64[D]')
    else:
        print('Failed: ', seq_id, len(valid_stations), )

#### Load (or down-load) the csv (GPS data) files from NGL
We download/load according to the mainshocks previously collected

In [None]:
# Make sure the cache folder exists (creates missing parents, no-op if present).
os.makedirs(rootpath, exist_ok=True)
mainshock_data = {}  # seq_id -> list of per-station DataFrames covering the GPS window
labels_to_rename = {"_latitude(deg)" : "lat" ,"_longitude(deg)" : "lon", "__height(m)": "height"}
max_n_trials = 5
for s in stations_to_download:
    trial_fname = rootpath + s + ".csv"
    if os.path.exists(trial_fname):
        print(s, "(load existing)")
        data = pd.read_csv(trial_fname, sep=" ", parse_dates=['date'])
    else:
        print(s, "(download)")
        last_error = None
        for _ in range(max_n_trials):
            try:
                data = SeismicUtils.get_ngl_gps_data(s,"IGS14", "tenv3")
                break
            except Exception as err:
                # Catch Exception rather than everything, so that Ctrl-C
                # (KeyboardInterrupt) still stops the notebook during retries.
                last_error = err
        else:
            # Every attempt failed: report why and move on to the next station.
            print("Failed. Tried: ", max_n_trials)
            print("Last error: ", repr(last_error))
            continue
        data['date'] = [seismicutils.SeismicUtils.str_to_datetime(stamp, 23) for stamp in data['YYMMMDD']]
        data['date'] = data['date'].values.astype('datetime64[D]')
        data.rename(labels_to_rename,axis=1, inplace=True)
        data = data[['date','site','lat','lon','height']]
        # Cache the reduced table so the next run loads it from disk.
        data.to_csv(trial_fname, sep=" ", index=False)
    # NOTE(review): longitudes are wrapped modulo 180 (not 360); this leaves values
    # in [0, 180) unchanged but folds everything else -- confirm this is intended
    # outside the region of interest.
    data['lon'] = data['lon'] % 180
    for ms_id, ms_stations in mainshock_stations.items():
        if s not in ms_stations:
            continue
        first_day = mainshock_day[ms_id] - n_days_backward
        last_day = mainshock_day[ms_id] + n_days_forward
        subdata = data[(data.date >= first_day) & (data.date <= last_day)]
        # Keep the station only if every day of the window is present and finite.
        if len(subdata) == n_total_days and np.isfinite(subdata[['lat','lon','height']].values).all():
            mainshock_data.setdefault(ms_id, []).append(subdata)

We now build the dictionary `fit_dataset`, which contains:
(velocities, stations_positions, 
                           mainshock_day[id], mainshock.mag.values[0], \
                           mainshock_location, aftershocks_locations, aftershocks_mags)

In [None]:
conv_factor = R0*1e3*np.pi/180  # meters per degree of arc on a sphere of radius R0 (km)
fit_dataset = {}
for seq_id, station_frames in mainshock_data.items():   ## sweep all retained MS (not much filtered)
    velocities = []
    stations_positions = []
    for md in station_frames:  ## for each station close enough to that MS
        site = md['site'].values[0]
        station_position = ngl_list[ngl_list.name == site][['lat','lon']].values[0,:]  # station position (initial one)
        pos = md[['lat','lon','height']].values
        vel = np.diff(pos, axis=0)  ## 1-day velocity : we take the diff of position between 2 days as signal
        # NOTE(review): latitude and longitude differences are scaled by the same
        # factor; there is no cos(latitude) correction on the longitude -- confirm intended.
        vel[:, [0,1]] = conv_factor*vel[:,[0,1]]
        velocities.append(vel[:,np.newaxis, :])
        stations_positions.append(station_position[np.newaxis, :])
    velocities = np.concatenate(velocities, axis=1) ## concatenate 1-days velocities over all stations
    ## velocities: (Tdays-1, Nstations(of that MS), 3)
    stations_positions = np.concatenate(stations_positions)

    ## extraction of the AS
    seq = fit_catalog[fit_catalog.seq_id==seq_id]
    mainshock   = seq[seq['type']==1]
    aftershocks = seq[seq['type']==2]
    ## keep only aftershocks occurring after the GPS window has ended
    aftershocks = aftershocks[aftershocks.day > mainshock.day.values[0] + n_days_forward]
    ## note: at catalog extraction (before here) we already restricted to AS happening at t<t_MS+45 days.
    mainshock_location    = mainshock  [['lat','lon']].values[0,:]
    aftershocks_locations = aftershocks[['lat','lon']].values
    aftershocks_mags = aftershocks['mag'].values
    if len(aftershocks_locations) > 1:  #  and velocities.shape[0 (should be 1 and not 0)] >= min_station_number):
        fit_dataset[seq_id] = (velocities, stations_positions, \
                               mainshock_day[seq_id], mainshock.mag.values[0], \
                               mainshock_location, aftershocks_locations, aftershocks_mags)
## fit_dataset now contains all raw information relative to that sequence:
## (lat,lon) coordinates, GPS data (at the stations, not interpolated)
        

### Save

In [None]:
# Output folder for the dumped dataset.
if not os.path.exists('velset'):
    os.mkdir('velset')
#temp = { k:fit_dataset[k] for k in fit_dataset.keys() if k not in drop_ids}
#np.save('velset/' + SeismicUtils.format_velset_filename(catalog_type + 'filtered',aftershocks_time_window.astype('int'), min_mainshock_mag, n_total_days-1), temp)
# File name built by the project helper from the catalog tag, the aftershock
# window (days), the minimal mainshock magnitude and n_total_days-1.
out_name = SeismicUtils.format_velset_filename(
    catalog_type,
    aftershocks_time_window.astype('int'),
    min_mainshock_mag,
    n_total_days-1,
)
np.save('velset/' + out_name, fit_dataset)