In [1]:
import json
import h5py
import numpy as np
import pandas as pd
import h5py as h5
import json
import sys
import os
import csv
import re
import random
import pprint

from sklearn.model_selection import train_test_split

In [2]:
# Dataset root and CSV output folder. Raw strings keep the Windows backslashes
# literal instead of relying on unrecognised escape sequences such as "\D" or "\T".
root = r'C:\Documents\Thesis_ssd\data_tord_may2020'
csv_folder = r'C:\Documents\Thesis_ssd\MasterThesis-2.0\csv_folder'

# One sub-folder per event class below the dataset root.
explo_path = f'{root}/explosions/'
earth_path = f'{root}/earthquakes/'
noise_path = f'{root}/noise/'
induced_path = f'{root}/induced/'

# File names (inside csv_folder) of the full and the class-balanced event listings.
data_csv = 'event_paths_no_nan_no_induced_no_explosions.csv'
balanced_csv = 'balanced_csv_2_class.csv'

In [8]:
def generate_event_csv(root, csv_name, output_folder = csv_folder):
    """Write a ``path,label`` CSV listing the usable event files below ``root``.

    Every sub-folder of ``root`` except ``induced`` and ``explosions`` is
    scanned. Each file is opened with ``path_to_trace``; files whose traces
    contain NaN/null values and files labelled as explosions are left out.

    Args:
        root: Folder holding one sub-folder per event type.
        csv_name: File name of the CSV to create inside ``output_folder``.
        output_folder: Folder the CSV is written to (defaults to ``csv_folder``).
    """
    skipped_folders = ('induced', 'explosions')
    # path_to_trace already rewrites "induced or triggered event" to
    # "earthquake", so only "explosion" can actually match here; the first
    # entry is kept as a guard in case that relabelling is ever removed.
    skipped_labels = ('induced or triggered event', 'explosion')

    with open(output_folder + '/' + csv_name, 'w', newline='') as file:
        # Two real comma-separated fields. The previous writer joined them by
        # hand and used ' ' as delimiter with '|' as quote character, which
        # wrapped any path containing a space in '|' and broke the readers.
        writer = csv.writer(file, delimiter = ',', quoting = csv.QUOTE_MINIMAL)

        for event_type in os.listdir(root):
            if event_type in skipped_folders:
                continue
            # Stray files next to the class folders cannot be listed; skip them.
            if not os.path.isdir(f'{root}/{event_type}'):
                continue
            for event in os.listdir(root + '/' + event_type):
                path = f'{root}/{event_type}/{event}'
                trace_array, label, info = path_to_trace(path)
                if np.any(pd.isnull(trace_array)):
                    continue
                if label in skipped_labels:
                    continue
                writer.writerow([path, label])
            print(f'Completed {event_type}')
    print("Completed " + csv_name)
    
def csv_to_numpy(data_csv, csv_folder = csv_folder):
    """Load a ``path,label`` CSV into a shuffled ``(n_rows, 2)`` object array.

    Args:
        data_csv: File name of the CSV inside ``csv_folder``.
        csv_folder: Folder containing the CSV.

    Returns:
        NumPy object array whose first column is the first CSV field and whose
        second column is the second CSV field, with the rows shuffled in place
        by ``np.random.shuffle``.

    Raises:
        IndexError: If a non-empty row has fewer than two fields.
    """
    # Single pass over the file; empty rows (e.g. a trailing blank line) are
    # skipped instead of raising IndexError.
    with open(csv_folder + '/' + data_csv, 'r', newline='') as file:
        rows = [row for row in csv.reader(file, delimiter = ',') if row]

    dataset = np.empty((len(rows), 2), dtype=object)
    for idx, row in enumerate(rows):
        dataset[idx, 0] = row[0]
        dataset[idx, 1] = row[1]
    np.random.shuffle(dataset)
    return dataset
    
def path_to_trace(path):
    """Read the traces, label and metadata of one event file.

    Args:
        path: Path to an HDF5 file with a ``traces`` and an ``event_info`` entry.

    Returns:
        Tuple ``(trace_array, label, info)``: a ``(3, 6001)`` NumPy array filled
        from ``traces``, the class label, and ``event_info`` parsed as JSON.
        Files located directly in a folder named ``noise`` get the label
        ``'noise'``; otherwise the label is ``info['event_type']``, with
        ``"induced or triggered event"`` mapped to ``"earthquake"``.
    """
    trace_array = np.empty((3,6001))
    with h5py.File(path, 'r') as dp:
        trace_array[:3] = dp.get('traces')
        # NOTE(review): relies on str() of the loaded value being plain JSON
        # text - confirm this still holds if the h5py version changes.
        info = np.array(dp.get('event_info'))
        info = json.loads(str(info))
    # No event type for noise, so handling that below. The class is taken from
    # the folder that directly contains the file; indexing the second
    # '/'-separated component only worked while the root had no forward slash.
    path_parts = path.replace('\\', '/').split('/')
    parent_folder = path_parts[-2] if len(path_parts) >= 2 else ''
    if parent_folder == 'noise':
        label = 'noise'
    else:
        label = info['event_type']
    # Since we consider induced earthquakes as earthquakes we need to handle that as well:
    if label == "induced or triggered event":
        label = "earthquake"
    return trace_array, label, info

def assert_true_labels(ds):
    """Check that folder name, CSV label and file label agree for every row.

    Args:
        ds: Iterable of ``(path, label)`` pairs.

    Raises:
        AssertionError: On the first row where the folder containing the file,
            the CSV label and the label read by ``path_to_trace`` differ.
    """
    for path, label in ds:
        _, label_from_trace, _ = path_to_trace(path)
        # Folder names are plural ("earthquakes") while labels are singular,
        # except for "noise", so pluralise before comparing.
        if label_from_trace != 'noise':
            label_from_trace = label_from_trace + 's'
            label = label + 's'
        # Use the folder that directly contains the file, whichever separator
        # the path uses, instead of the second '/'-separated component.
        path_parts = path.replace('\\', '/').split('/')
        folder = path_parts[-2] if len(path_parts) >= 2 else ''
        # Raised explicitly so the check is not stripped when Python runs with -O.
        if not (folder == label == label_from_trace):
            raise AssertionError(f'Mismatch between {folder} and {label} and {label_from_trace}, at path: {path}')


def get_class_distribution_from_csv(data_csv, csv_folder = csv_folder):
    """Count the rows per class in a ``path,label`` CSV.

    Args:
        data_csv: File name of the CSV inside ``csv_folder``.
        csv_folder: Folder containing the CSV.

    Returns:
        Tuple ``(nr_earthquakes, nr_explosions, nr_noise, nr_total)``.
        ``nr_total`` counts every non-empty row, including rows whose label is
        none of the three known classes.

    Raises:
        IndexError: If a non-empty row has fewer than two fields.
    """
    counts = {"earthquake": 0, "explosion": 0, "noise": 0}
    nr_total = 0
    with open(csv_folder + '/' + data_csv, newline='') as file:
        # Parse with csv.reader, like csv_to_numpy does, so quoted fields are
        # split consistently; blank lines are skipped instead of crashing.
        for row in csv.reader(file, delimiter = ','):
            if not row:
                continue
            event_type = row[1].rstrip()
            if event_type in counts:
                counts[event_type] += 1
            nr_total += 1

    return counts["earthquake"], counts["explosion"], counts["noise"], nr_total
    

def even_classes_csv(data_csv, balanced_csv_name, output_folder = csv_folder):
    """Write a CSV with equally many earthquake and noise rows.

    The rows of ``data_csv`` are loaded with ``csv_to_numpy``, grouped by
    label, and each class is randomly down-sampled (``random.sample``) to the
    size of the smaller one. The earthquake rows are written first, then the
    noise rows.

    Args:
        data_csv: File name of the source CSV (read via ``csv_to_numpy``).
        balanced_csv_name: File name of the CSV to create in ``output_folder``.
        output_folder: Folder the balanced CSV is written to.
    """
    csv_numpy = csv_to_numpy(data_csv)
    earthquake_list = [[path, label] for path, label in csv_numpy if label == "earthquake"]
    noise_list = [[path, label] for path, label in csv_numpy if label == "noise"]

    # Take the sample size from the lists that are actually sampled; this
    # avoids a second pass over the file and a possible count mismatch.
    min_class_nr = min(len(earthquake_list), len(noise_list))
    print(min_class_nr)

    pure_classes = [earthquake_list, noise_list]
    print(len(pure_classes[0]))

    final_class_count = []
    with open(output_folder + '/' + balanced_csv_name, 'w', newline='') as file:
        # Two real comma-separated fields; the previous ' ' delimiter with '|'
        # quoting wrapped any path containing a space in '|'.
        writer = csv.writer(file, delimiter = ',', quoting = csv.QUOTE_MINIMAL)
        for single_class in pure_classes:
            print(f'Sample_size = {min_class_nr} single_class length: {len(single_class)}' )
            random_samples = random.sample(single_class, min_class_nr)
            final_class_count.append(len(random_samples))
            for path, label in random_samples:
                writer.writerow([path, label])
        print(final_class_count)
            
            
        
        
def generate_subset_csv(input_ds, output_csv_name, output_folder):
    """Write the rows of ``input_ds`` to a comma-separated CSV file.

    Args:
        input_ds: Sized iterable of rows, each row an iterable of fields.
        output_csv_name: File name of the CSV to create.
        output_folder: Folder to write into; created if it does not exist.
    """
    nr_rows = len(input_ds)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    with open(output_folder + '/' + output_csv_name, 'w', newline='') as file:
        # Default '"' quoting: with ' ' as quote character every field that
        # contained a space was wrapped in spaces and had its spaces doubled.
        writer = csv.writer(file, delimiter = ',', quoting = csv.QUOTE_MINIMAL)
        for row in input_ds:
            writer.writerow(row)
    print(f'Completed writing {nr_rows} rows to {output_folder}/{output_csv_name}.')
        
        

In [5]:
# Scan the dataset root and write the path/label listing to data_csv.
generate_event_csv(root=root, csv_name=data_csv)

Completed earthquakes
Completed noise
Completed event_paths_no_nan_no_induced_no_explosions.csv


In [6]:
# Count how many rows of data_csv fall into each class and report them.
class_distribution = get_class_distribution_from_csv(data_csv=data_csv)
nr_eq, nr_ex, nr_noise, nr_total = class_distribution
print(f'Earthquakes: {nr_eq}, Explosions: {nr_ex}, Noise: {nr_noise}, Total: {nr_total}')

Earthquakes: 6852, Explosions: 0, Noise: 91612, Total: 98464


In [9]:
# Build the class-balanced listing (balanced_csv) from the full listing (data_csv):
even_classes_csv(data_csv=data_csv, balanced_csv_name=balanced_csv)

6852
6852
Sample_size = 6852 single_class length: 6852
Sample_size = 6852 single_class length: 91612
[6852, 6852]


In [10]:
# Fixed seed so that the shuffle inside csv_to_numpy (np.random.shuffle) and
# both splits below give the same result on every run.
SEED = 42
np.random.seed(SEED)
ds = csv_to_numpy(balanced_csv)

# 80 % train; the remaining 20 % is divided 70/30 into validation and test.
train_ds, test_val_ds = train_test_split(ds, test_size = 0.2, random_state = SEED)
val_ds, test_ds = train_test_split(test_val_ds, test_size = 0.3, random_state = SEED)

In [11]:
# Sizes of the train, validation and test splits, then their sum as a sanity check.
split_sizes = [len(subset) for subset in (train_ds, val_ds, test_ds)]
print(*split_sizes)
print(sum(split_sizes))

10963 1918 823
13704


In [12]:
# Verify that every row's CSV label matches its folder and the label stored in the file.
assert_true_labels(ds=ds)

In [13]:

root_sub = 'balanced'
output_folder = f"{csv_folder}/2_class/{root_sub}"

# Write the test, validation and train subsets, in that order, one CSV each.
subsets_by_file = (
    ("test_set.csv", test_ds),
    ("validation_set.csv", val_ds),
    ("train_set.csv", train_ds),
)
for output_csv_name, subset_ds in subsets_by_file:
    generate_subset_csv(subset_ds, output_csv_name, output_folder)


Completed writing 823 rows to C:\Documents\Thesis_ssd\MasterThesis-2.0\csv_folder/2_class/balanced/test_set.csv.
Completed writing 1918 rows to C:\Documents\Thesis_ssd\MasterThesis-2.0\csv_folder/2_class/balanced/validation_set.csv.
Completed writing 10963 rows to C:\Documents\Thesis_ssd\MasterThesis-2.0\csv_folder/2_class/balanced/train_set.csv.
