In [3]:
import json
import h5py
import numpy as np
import pandas as pd
import h5py as h5
import json
import sys
import os
import csv
import re
import random
import pprint

from sklearn.model_selection import train_test_split

base_dir = 'F:'
os.chdir(base_dir)

In [5]:
root = os.path.join('F:\\', 'Thesis_ssd', 'norsar_data_nov')
csv_folder = os.path.join('F:\\', 'Thesis_ssd','MasterThesis3.0','csv_folder')
explo_path = os.path.join(root, 'explosions')
earth_path = os.path.join(root, 'earthquakes')
noise_path = os.path.join(root, 'noise')
induced_path = os.path.join(root, 'induced')
data_csv = 'event_paths_no_nan_no_induced.csv'
balanced_csv = 'balanced_csv_3_class.csv'
test_set_csv = 'DO_NOT_TOUCH_test_set.csv'

In [6]:
base_dir = os.path.join('F:\\', 'Thesis_ssd', 'norsar_data_nov')
os.listdir(csv_folder)

['balanced_csv_2_class.csv',
 'balanced_csv_3_class.csv',
 'event_paths_no_nan_no_induced.csv',
 'event_paths_no_nan_no_induced_no_explosions.csv',
 '.ipynb_checkpoints',
 '2_classes',
 '3_classes',
 '4_classes']

### Bypassing issues
 - Due to either dataset issues or ignorance, I was forced to label the dataset by their file names, rather than event info.
 - The dataset contains a lot of NaN values, and therefore need to have a method for omitting those datapoints

In [15]:
def generate_event_csv(root, csv_name, output_folder = csv_folder):
    with open(os.path.join(output_folder, csv_name), 'w+', newline='') as file:
        writer = csv.writer(file, delimiter = ' ', quotechar = '|', quoting = csv.QUOTE_MINIMAL)
        
        for idx, event_type in enumerate(os.listdir(root)):
            if event_type == 'induced':
                print(f"Skipped {event_type}")
                continue
            event_list = os.listdir(os.path.join(root,event_type))
            event_type_total_nr = len(event_list)
            for idx_by_event, event in enumerate(event_list):
                path = os.path.join(root, event_type, event)
                trace_array, label, info = path_to_trace(path)
                progress_bar(idx_by_event + 1, event_type_total_nr, label)
                if np.any(pd.isnull(trace_array)):
                    print(f"NaN values in (label, file name) {label, event}")
                    continue
                if label == "induced or triggered event":
                    print("If you're reading this, someone has a lot of work to do.")
                    continue
                else:
                    writer.writerow([f'{path},{label}'])
                
            print(f'Completed {event_type}                                            ')
    print("Completed " + csv_name)
    
def csv_to_numpy(data_csv, csv_folder = csv_folder):
    with open(csv_folder + '/' + data_csv) as file:
        file_list = np.array(list(file))
        dataset = np.empty((len(file_list), 2), dtype=object)
        for idx, event in enumerate(file_list):
            path, label = event.split(',')
            dataset[idx][0] = path.rstrip()
            dataset[idx][1] = label.rstrip()
        file.close()
    return dataset
    
def path_to_trace(path):
    with h5py.File(path, 'r') as dp:
        trace_array = np.array(dp.get('traces'))
        info = np.array(dp.get('event_info'))
        info = json.loads(str(info))
    # No event type for noise, so handling that below
    if path.split('\\')[2] == 'noise':
        label = 'noise'
    else:
        label = info['event_type']
    # Since we consider induced earthquakes as earthquakes we need to handle that as well:
    if label == "induced or triggered event":
        label = "earthquake"
    return trace_array, label, info

def assert_true_labels(ds):
    for path, label in ds:
        _, label_from_trace, _ = path_to_trace(path)
        if label_from_trace != 'noise':
            label_from_trace = label_from_trace + 's'
            label = label + 's'
        assert path.split('/')[1] == label == label_from_trace, f'Mismatch between {path.split("/")[1]} and {label} and {label_from_trace}, at path: {path}'


def get_class_distribution_from_csv(data_csv, csv_folder = csv_folder):
    with open(csv_folder + '/' + data_csv) as file:
        nr_earthquakes = 0
        nr_explosions = 0
        nr_noise = 0
        nr_total = 0
        for row in file:
            event_type = row.split(',')[1].rstrip()
            if event_type == "earthquake":
                nr_earthquakes += 1
            elif event_type == "explosion":
                nr_explosions += 1
            elif event_type == "noise":            
                nr_noise += 1
            nr_total += 1
        
        return nr_earthquakes, nr_explosions, nr_noise, nr_total


    

def even_classes_csv(data_csv, balanced_csv_name, output_folder = csv_folder):
    nr_earthquakes, nr_explosions, nr_noise, nr_total = get_class_distribution_from_csv(data_csv)
    min_class_nr = min([nr_earthquakes, nr_explosions, nr_noise])
    print(min_class_nr)
    csv_numpy = csv_to_numpy(data_csv)
    class_count = [0,0,0,0]
    earthquake_list = []
    explosion_list = []
    noise_list = []
    final_class_count = []
    for path, label in csv_numpy:
        if label == "explosion":
            explosion_list.append([path,label])
            class_count[0] += 1
            class_count[3] += 1
        if label == "earthquake":
            earthquake_list.append([path,label])
            class_count[1] += 1
            class_count[3] += 1
        if label == "noise":
            noise_list.append([path,label])
            class_count[2] += 1
            class_count[3] += 1
    pure_classes = [earthquake_list, explosion_list, noise_list]
    print(len(pure_classes[0]))
    with open(output_folder + '/' + balanced_csv_name, 'w+', newline='') as file:
        writer = csv.writer(file, delimiter = ' ', quotechar = '|', quoting = csv.QUOTE_MINIMAL)
        for single_class in pure_classes:
            print(f'Sample_size = {min_class_nr} single_class length: {len(single_class)}' )
            random_samples = random.sample(single_class, min_class_nr)
            final_class_count.append(len(random_samples))
            for path, label in random_samples:
                writer.writerow([f'{path},{label}'])
        print(final_class_count)

def create_true_test_set(full_ds):
    ds = full_ds
    np.random.shuffle(ds)
    train_val, test = train_test_split(ds, test_size = 0.075)
    return test
        
def generate_subset_csv(input_ds, output_csv_name, output_folder):
    nr_rows = len(input_ds)
    with open(output_folder + '/' + output_csv_name, 'w+', newline='') as file:
        writer = csv.writer(file, delimiter = ',', quotechar = ' ', quoting = csv.QUOTE_MINIMAL)
        for row in input_ds:
            writer.writerow(row)
    print(f'Completed writing {nr_rows} rows to {output_folder}/{output_csv_name}.')
        

def downsample_label(target_label, ds, n_samples):
    target_array = np.array([x for x in ds if x[1] == target_label], dtype = object)
    down_ds = np.array([y for y in ds if y[1] != target_label], dtype = object)
    down_ds = np.concatenate((down_ds, target_array[np.random.choice(target_array.shape[0], n_samples, replace = True)]))
    return np.array(down_ds)

def upsample_label(target_label, ds, n_samples):
    target_array = np.array([x for x in ds if x[1] == target_label])
    up_ds = [y for y in ds if y[1] != target_label]
    up_ds = np.concatenate((up_ds, target_array[np.random.choice(target_array.shape[0], n_samples, replace = True)]))
    return np.array(up_ds)

def frac_diff_n_samples(frac_diff, min_counts, max_counts):
    diff = max_counts - min_counts
    n_samples = int(min_counts + diff*frac_diff)
    return n_samples

def balance_ds(ds, downsample, upsample, frac_diff = 0):
    unique_labels, counts = np.unique(ds[:,1], return_counts = True)
    nr_classes = len(unique_labels)
    if downsample:
        # Downsamples by first reducing the largest class, then the second class.
        for i in range(nr_classes-1):
            print(i)
            unique_labels, counts = np.unique(ds[:,1], return_counts = True)
            most_occuring_label = unique_labels[np.where(counts == max(counts))]
            n_samples_frac_diff = frac_diff_n_samples(frac_diff, min(counts), max(counts))
            ds = downsample_label(most_occuring_label, ds, n_samples_frac_diff)
    if upsample:
        #
        unique_labels, counts = np.unique(ds[:,1], return_counts = True)
        least_occuring_label = unique_labels[np.where(counts == min(counts))]
        n_samples_for_balance = max(counts)
        ds = upsample_label(least_occuring_label, ds, n_samples_for_balance)
    np.random.shuffle(ds)
    return ds
        
        
def progress_bar(current, total, event_type, barLength = 20):
        percent = float(current) * 100 / total
        arrow   = '-' * int(percent/100 * barLength - 1) + '>'
        spaces  = ' ' * (barLength - len(arrow))
        print('Writing %ss [%s%s] %d %%' % (event_type, arrow, spaces, percent), end='\r')
                      
    

In [130]:
#generate_event_csv(root, data_csv)

In [120]:
nr_eq, nr_ex, nr_noise, nr_total = get_class_distribution_from_csv(data_csv)
print(f'Earthquakes: {nr_eq}, Explosions: {nr_ex}, Noise: {nr_noise}, Total: {nr_total}')

Earthquakes: 9464, Explosions: 111173, Noise: 114578, Total: 235215


In [8]:
full_ds = csv_to_numpy(data_csv)

In [16]:
test_set = create_true_test_set(full_ds)

In [17]:
print(np.unique(test_set[:,1], return_counts = True))

(array(['earthquake', 'explosion', 'noise'], dtype=object), array([ 698, 8365, 8579], dtype=int64))


In [39]:
generate_subset_csv(full_ds_no_test, "full_no_test.csv", csv_folder)

Completed writing 217573 rows to F:\Thesis_ssd\MasterThesis3.0\csv_folder/full_no_test.csv.


##### Balanced data

In [46]:
# Generate balanced data csv using data_csv:
even_classes_csv(data_csv, balanced_csv)

6852
6852
Sample_size = 6852 single_class length: 6852
Sample_size = 6852 single_class length: 107786
Sample_size = 6852 single_class length: 91612
[6852, 6852, 6852]


In [47]:
ds = csv_to_numpy(balanced_csv)

train_ds, test_val_ds = train_test_split(ds, test_size = 0.2)
val_ds, test_ds = train_test_split(test_val_ds, test_size = 0.3)

In [48]:
print(len(train_ds), len(val_ds), len(test_ds))
print(len(train_ds) + len(val_ds) + len(test_ds))

16444 2878 1234
20556


In [49]:
assert_true_labels(ds)

##### Creating balanced splitted csvs

In [52]:

root_sub = 'balanced'
output_folder = f"{csv_folder}/csv_folder_3_class/{root_sub}"

# Test set:
output_csv_name = "test_set.csv"
generate_subset_csv(test_ds, output_csv_name, output_folder)

# Validation set:
output_csv_name = "validation_set.csv"
generate_subset_csv(val_ds, output_csv_name, output_folder)

# Train set:
output_csv_name = "train_set.csv"
generate_subset_csv(train_ds, output_csv_name, output_folder)


Completed writing 1234 rows to C:\Documents\Thesis_ssd\MasterThesis-2.0\csv_folder/csv_folder_3_class/balanced/test_set.csv.
Completed writing 2878 rows to C:\Documents\Thesis_ssd\MasterThesis-2.0\csv_folder/csv_folder_3_class/balanced/validation_set.csv.
Completed writing 16444 rows to C:\Documents\Thesis_ssd\MasterThesis-2.0\csv_folder/csv_folder_3_class/balanced/train_set.csv.


In [11]:
assert_true_labels(csv_to_numpy('csv_folder_3_class/balanced/test_set.csv'))
assert_true_labels(csv_to_numpy('csv_folder_3_class/balanced/validation_set.csv'))
assert_true_labels(csv_to_numpy('csv_folder_3_class/balanced/train_set.csv'))

In [17]:
import time

time_start = time.time()
eq_trace, eq_label, eq_info = path_to_trace(f"{os.path.join(noise_path, '2015-06-02T12.51.52.000000Z.h5')}")
time_end = time.time()
print(time_end - time_start)

KeyError: 'event_type'

In [15]:
pprint.pprint(eq_info)

{'analyst_pick_time': None,
 'az_to_arces': 54.610377749207366,
 'baz_to_arces': 253.12745291713466,
 'comments': [{'creation_info': {'author': 'general'},
               'resource_id': 'smi:local/563d926d-299d-4672-92c0-fc7232966ee4',
               'text': 'PROBABLY EARTHQUAKE'}],
 'dist_to_arces': 941.1384693251722,
 'est_arrivaltime_arces': '1991-01-13 02:24:29.890000',
 'event_type': 'earthquake',
 'event_type_certainty': 'suspected',
 'magnitude_dist_ratio': 0.0036126458654250036,
 'magnitude_sqrtdist_ratio': 0.11082867833934054,
 'magnitudes': [{'creation_info': {'agency_id': 'HEL'},
                 'mag': 3.4,
                 'magnitude_type': 'ML',
                 'origin_id': 'smi:local/0dc10fdd-f584-4a02-81f8-42ab1875b841',
                 'resource_id': 'smi:local/18b449b8-b1e1-4bdf-baed-431b5a7d3508',
                 'station_count': 38}],
 'origins': [{'creation_info': {'agency_id': 'HEL'},
              'depth': 15.0,
              'latitude': 65.759,
              

In [37]:
path = f"{os.path.join(noise_path, '2015-06-02T12.51.52.000000Z.h5')}"

def open_file(path):
    with h5py.File(path, 'r') as dp:
        trace_array = np.array(dp.get('traces'))
        info = np.array(dp.get('event_info'))
        info = json.loads(str(info))
        print(path.split("\\")[2])
    return trace_array, info

In [38]:
trace, info = open_file(path)

noise


In [27]:
info

{'est_arrivaltime_arces': '2015-06-02 12:52:52',
 'baz_to_arces': 0.9887103208968245,
 'analyst_pick_time': None,
 'trace_stats': {'starttime': '2015-06-02T12:51:52.000000Z',
  'sampling_rate': 40.0,
  'station': 'ARCES beam',
  'channels': ['P-beam, vertical', 'S-beam, transverse', 'S-beam, radial']}}

In [19]:
print(len(full_ds), len(test_set))

235215 17642


In [27]:
mock_full = np.concatenate((full_ds[0:100], test_set))
mock_test = test_set

In [28]:
reduced_mock = np.array([x for x in mock_full if x not in mock_test])

In [29]:
len(reduced_mock)

0

In [35]:
def remove_test_set_from_full_ds(full_ds, test_set):
    return [x for x in full_ds if x[0] not in test_set[:,0]]


In [36]:
full_ds_no_test = remove_test_set_from_full_ds(full_ds, test_set)

In [37]:
len(full_ds_no_test)

217573

In [34]:
full_ds[0]

array(['F:/Thesis_ssd\\norsar_data_nov\\explosions\\2007-02-25T00.20.28.832000Z.h5',
       'explosion'], dtype=object)

In [38]:
print(len(full_ds)- len(test_set))

217573
