This notebook builds the final event catalog by cross-referencing the tomoDD `.reloc` relocation file with the ISC and GCMT catalogs.

In [14]:
from collections import defaultdict
from itertools import product
from pathlib import Path

import numpy as np
import obspy
import pandas as pd
from tqdm import tqdm

## load the .reloc file

In [4]:
# Location of the tomoDD relocation output; read below as whitespace-delimited text with no header row.
file_path="/Users/ziyixi/Library/CloudStorage/OneDrive-MichiganStateUniversity/Paper/PhaseNetTF_myturn/PhaseNet-TF-Figures/phasenettf/data/catalog/tomoDD.all_months_threshold10_08.reloc"
# The separator is a regex, so it must be a raw string: "\s" is not a valid
# Python string escape and triggers a SyntaxWarning on recent interpreters.
df=pd.read_csv(
    file_path,
    header=None,
    sep=r"\s+",
    # Only the event id, hypocentre and origin-time fields are kept.
    usecols=[0,1,2,3,10,11,12,13,14,15],
    names=["id","latitude","longitude","depth","year","month","day","hour","minute","second"],
)
# construct timestamp column from the individual date/time component columns
df["timestamp"]=pd.to_datetime(df[["year","month","day","hour","minute","second"]])

In [5]:
# Preview the first rows of the relocated catalog, including the derived timestamp column.
df.head()

Unnamed: 0,id,latitude,longitude,depth,year,month,day,hour,minute,second,timestamp
0,63,-17.106491,-176.983963,371.639,2010,9,30,17,51,35.64,2010-09-30 17:51:35.640
1,80,-21.711088,-174.561371,19.893,2010,6,4,15,15,22.08,2010-06-04 15:15:22.080
2,85,-21.853592,-174.170853,1.706,2010,2,14,22,23,22.6,2010-02-14 22:23:22.600
3,86,-19.213507,-175.351212,241.381,2010,6,14,21,58,44.48,2010-06-14 21:58:44.480
4,89,-17.339659,-174.425491,633.462,2010,10,20,4,10,25.97,2010-10-20 04:10:25.970


## load GCMT catalog

In [6]:
# Parse the GCMT catalog with ObsPy, then flatten the first origin of every event into one row.
gcmt_catalog=obspy.read_events("/Users/ziyixi/Library/CloudStorage/OneDrive-MichiganStateUniversity/Paper/PhaseNetTF_myturn/PhaseNet-TF-Figures/phasenettf/data/catalog/gcmt.txt")

gcmt_rows=[]
for event in gcmt_catalog:
    origin=event.origins[0]
    gcmt_rows.append(
        {
            # The GCMT identifier is the third "/"-separated piece of the resource id.
            "gcmt_id":event.resource_id.id.split("/")[2],
            "latitude":origin.latitude,
            "longitude":origin.longitude,
            # Scale depth by 1/1000 (presumably metres -> km, to match the other catalogs).
            "depth":origin.depth/1000,
            "timestamp":origin.time,
        }
    )
df_gcmt=pd.DataFrame(gcmt_rows)

# Drop the last character of the string form of each origin time
# (presumably the trailing "Z") and parse the rest as a pandas timestamp.
df_gcmt["timestamp"]=pd.to_datetime(df_gcmt["timestamp"].astype(str).str[:-1])

In [7]:
# Preview the first rows of the flattened GCMT catalog.
df_gcmt.head()

Unnamed: 0,gcmt_id,latitude,longitude,depth,timestamp
0,200911012128A,-15.34,-173.39,120.42,2009-11-01 21:29:04.170
1,200911021706A,-15.44,-172.97,12.0,2009-11-02 17:06:54.930
2,200911050600A,-17.49,-176.66,25.68,2009-11-05 06:00:44.290
3,200911050604A,-17.42,-176.64,23.94,2009-11-05 06:04:56.460
4,200911050611A,-17.53,-176.55,16.52,2009-11-05 06:11:54.510


## load ISC catalog

In [8]:
# Read the ISC catalog CSV, keeping only the id, origin date/time and hypocentre columns.
isc_file_path="/Users/ziyixi/Library/CloudStorage/OneDrive-MichiganStateUniversity/Paper/PhaseNetTF_myturn/PhaseNet-TF-Figures/phasenettf/data/catalog/isc.csv"
# File column position -> name used in this notebook (the file's first row is skipped).
isc_columns={0:"isc_id",3:"date",4:"time",5:"latitude",6:"longitude",7:"depth"}
df_isc=pd.read_csv(
    isc_file_path,
    skiprows=1,
    usecols=list(isc_columns.keys()),
    names=list(isc_columns.values()),
)
# Join the date and time strings with a space and parse them into one timestamp column.
isc_datetime_text=df_isc["date"]+" "+df_isc["time"]
df_isc["timestamp"]=pd.to_datetime(isc_datetime_text)

In [9]:
# Preview the first rows of the ISC catalog, including the combined timestamp column.
df_isc.head()

Unnamed: 0,isc_id,date,time,latitude,longitude,depth,timestamp
0,17143180,2009-11-01,02:05:26.33,-17.2298,179.8438,0.0,2009-11-01 02:05:26.330
1,17143181,2009-11-01,02:07:16,-16.2396,-179.2624,0.0,2009-11-01 02:07:16.000
2,17143183,2009-11-01,03:26:25.81,-14.4712,-173.5185,0.0,2009-11-01 03:26:25.810
3,17143186,2009-11-01,04:38:01.44,-14.3876,-173.1307,0.0,2009-11-01 04:38:01.440
4,17143195,2009-11-01,13:42:15.61,-17.6692,-175.3274,0.0,2009-11-01 13:42:15.610


## Match ISC events to the relocated catalog

In [23]:
# Define thresholds
DISTANCE_THRESHOLD=0.5
DEPTH_THRESHOLD=100
TIME_THRESHOLD=10

# Convert DataFrame to list of records for faster iteration
isc_records = df_isc.to_dict('records')
df_records = df.to_dict('records')

# Initialize matches dictionary
isc_matches=defaultdict(lambda: {"closest_event": None, "min_time_difference": np.inf})

# Iterate over cartesian product of records
for isc_event, tomodd_event in tqdm(product(isc_records, df_records), total=len(isc_records)*len(df_records)):
    # Calculate differences
    lat_diff = abs(isc_event["latitude"] - tomodd_event["latitude"])
    lon_diff = abs(isc_event["longitude"] - tomodd_event["longitude"])
    depth_diff = abs(isc_event["depth"] - tomodd_event["depth"])
    time_diff = abs((isc_event["timestamp"] - tomodd_event["timestamp"]).total_seconds())

    # If within spatial thresholds
    if lat_diff < DISTANCE_THRESHOLD and lon_diff < DISTANCE_THRESHOLD and depth_diff < DEPTH_THRESHOLD and time_diff < TIME_THRESHOLD:
        # If the time difference is smaller than the current smallest time difference for this isc_id
        if time_diff < isc_matches[isc_event["isc_id"]]["min_time_difference"]:
            # Update the closest event and the smallest time difference
            isc_matches[isc_event["isc_id"]]["closest_event"] = tomodd_event
            isc_matches[isc_event["isc_id"]]["min_time_difference"] = time_diff

# If you want to remove the time differences from the final dictionary and just keep the closest events
isc_matches_final={}
for isc_id in isc_matches:
    isc_matches_final[isc_id] = isc_matches[isc_id]["closest_event"]

100%|██████████| 30200880/30200880 [00:45<00:00, 666674.82it/s]


In [26]:
# Spot-check: show the relocated event record matched to ISC event 15810850.
isc_matches_final[15810850]

{'id': 2272,
 'latitude': -17.768507,
 'longitude': -178.188599,
 'depth': 583.527,
 'year': 2010,
 'month': 12,
 'day': 18,
 'hour': 4,
 'minute': 4,
 'second': 30.41,
 'timestamp': Timestamp('2010-12-18 04:04:30.410000')}

In [27]:
# Attach the matched ISC id to every relocated event; -1 marks "no ISC counterpart".
df_with_isc=df.copy()
# Key on the relocated event's own "id" instead of comparing timestamps: this
# identifies the matched row directly, cannot tag a second event that happens
# to share the same origin time, and needs one pass instead of a full-frame
# scan per match. If several ISC events map to the same relocated event, the
# last one iterated wins.
event_id_to_isc_id={event["id"]:isc_id for isc_id,event in isc_matches_final.items()}
df_with_isc["isc_id"]=[event_id_to_isc_id.get(event_id,-1) for event_id in df_with_isc["id"]]

## Match GCMT events to the relocated catalog

In [31]:
# Define thresholds
DISTANCE_THRESHOLD=0.5
DEPTH_THRESHOLD=100
TIME_THRESHOLD=10

# Convert DataFrame to list of records for faster iteration
gcmt_records = df_gcmt.to_dict('records')
df_records = df.to_dict('records')

# Initialize matches dictionary
gcmt_matches=defaultdict(lambda: {"closest_event": None, "min_time_difference": np.inf})

# Iterate over cartesian product of records
for gcmt_event, tomodd_event in tqdm(product(gcmt_records, df_records), total=len(gcmt_records)*len(df_records)):
    # Calculate differences
    lat_diff = abs(gcmt_event["latitude"] - tomodd_event["latitude"])
    lon_diff = abs(gcmt_event["longitude"] - tomodd_event["longitude"])
    depth_diff = abs(gcmt_event["depth"] - tomodd_event["depth"])
    time_diff = abs((gcmt_event["timestamp"] - tomodd_event["timestamp"]).total_seconds())

    # If within spatial thresholds
    if lat_diff < DISTANCE_THRESHOLD and lon_diff < DISTANCE_THRESHOLD and depth_diff < DEPTH_THRESHOLD and time_diff < TIME_THRESHOLD:
        # If the time difference is smaller than the current smallest time difference for this gcmt_id
        if time_diff < gcmt_matches[gcmt_event["gcmt_id"]]["min_time_difference"]:
            # Update the closest event and the smallest time difference
            gcmt_matches[gcmt_event["gcmt_id"]]["closest_event"] = tomodd_event
            gcmt_matches[gcmt_event["gcmt_id"]]["min_time_difference"] = time_diff

# If you want to remove the time differences from the final dictionary and just keep the closest events
gcmt_matches_final={}
for gcmt_id in gcmt_matches:
    gcmt_matches_final[gcmt_id] = gcmt_matches[gcmt_id]["closest_event"]

100%|██████████| 941280/941280 [00:01<00:00, 642595.75it/s]


In [32]:
# Compare the number of GCMT events with the number that found a relocated counterpart.
len(df_gcmt),len(gcmt_matches_final)

(106, 54)

In [33]:
# Attach the matched GCMT id to every relocated event; -1 marks "no GCMT counterpart".
df_final=df_with_isc.copy()

# GCMT ids are strings. Build the whole column in one go instead of creating an
# integer column of -1 and then writing strings into it, which pandas deprecates
# as an incompatible-dtype assignment. Matching is keyed on the relocated event's
# own "id" rather than on timestamp equality, so exactly the matched row is tagged.
# If several GCMT events map to the same relocated event, the last one iterated wins.
event_id_to_gcmt_id={event["id"]:gcmt_id for gcmt_id,event in gcmt_matches_final.items()}
df_final["gcmt_id"]=[event_id_to_gcmt_id.get(event_id,-1) for event_id in df_final["id"]]

In [36]:
# write df_final to csv
output_path=Path("./res/phasenetv3_08_gammaglobal_relocation10.csv")
# Create the output directory first so a fresh checkout / fresh-kernel run does
# not fail just because ./res does not exist yet.
output_path.parent.mkdir(parents=True,exist_ok=True)
df_final.to_csv(output_path,index=False)

## Summary statistics for `df_final`

In [37]:
# Number of relocated events that carry an ISC id (isc_id is not the -1 sentinel).
len(df_final[df_final["isc_id"]!=-1])

592

In [39]:
# Number of relocated events that carry a GCMT id (gcmt_id is not the -1 sentinel).
len(df_final[df_final["gcmt_id"]!=-1])

54

In [41]:
# Number of relocated events matched in BOTH catalogs:
# both isc_id and gcmt_id are not -1
len(df_final[(df_final["isc_id"]!=-1) & (df_final["gcmt_id"]!=-1)])

44

In [42]:
# Total number of events in the ISC catalog, for comparison with the matched count.
len(df_isc)

3401

In [43]:
# Total number of events in the GCMT catalog, for comparison with the matched count.
len(df_gcmt)

106

In [44]:
# Total number of rows in the final catalog.
len(df_final)

8880