In [2]:
import json
import h5py
import numpy as np
import pandas as pd
import h5py as h5
import json
import sys
import os
import csv
import re
import random
import pprint

from sklearn.model_selection import train_test_split

In [9]:
# Dataset and output locations.
# Raw strings keep the Windows backslashes literal: sequences such as "\D", "\T"
# or "\M" are invalid escapes in a normal string literal and only work by accident.
# NOTE: these are machine-specific absolute paths; adjust them for your environment.
root = r'C:\Documents\Thesis_ssd\Master Thesis\data_tord_may2020'
output_folder = r'C:\Documents\Thesis_ssd\Master Thesis\MasterThesis-2.0\csv_folder'

# One sub-folder per event class, joined to root with forward slashes.
explo_path = f'{root}/explosions/'
earth_path = f'{root}/earthquakes/'
noise_path = f'{root}/noise/'
induced_path = f'{root}/induced/'

# File names of the full event listing and of the class-balanced listing.
data_csv = 'event_paths_no_nan_no_induced.csv'
balanced_csv = 'balanced_csv_3_class.csv'

### Bypassing issues
 - Due to either dataset issues or my own ignorance, I was forced to label the data points by their file names rather than by their event info.
 - The dataset contains a lot of NaN values, so we need a method for omitting those data points.

In [13]:
def generate_event_csv(root, csv_name, output_folder):
    """Write a ``path,label`` CSV listing every usable event file under ``root``.

    Every sub-directory of ``root`` except ``induced`` is scanned. Each file in
    it is loaded with ``path_to_trace``; files whose trace array contains any
    null/NaN value are skipped, all others produce one CSV row.

    Parameters
    ----------
    root : str
        Directory containing one sub-directory per event class.
    csv_name : str
        File name of the CSV to create.
    output_folder : str
        Existing directory in which ``csv_name`` is created (overwritten if present).

    Returns
    -------
    None
        Progress is reported with ``print``.
    """
    with open(output_folder + '/' + csv_name, 'w+', newline='') as file:
        # Rows must be passed to writerow as a sequence of fields. Passing a
        # plain string makes csv treat every character as its own field, which
        # produced files that no reader in this notebook could parse.
        writer = csv.writer(file, delimiter=',')

        for event_type in os.listdir(root):
            if event_type == 'induced':
                continue
            event_dir = root + '/' + event_type
            # Stray files next to the class folders are not event classes.
            if not os.path.isdir(event_dir):
                continue
            for event in os.listdir(event_dir):
                path = f'{root}/{event_type}/{event}'
                trace_array, label, _ = path_to_trace(path)
                if np.any(pd.isnull(trace_array)):
                    continue
                # NOTE: an extra check for the label "induced or triggered event"
                # used to live here, but path_to_trace already maps that label to
                # "earthquake", so it could never fire and has been dropped.
                writer.writerow([path, label])
            print(f'Completed {event_type}')
    print("Completed " + csv_name)
    
def csv_to_numpy(data_csv):
    """Load the first two columns of a comma-separated file into a shuffled array.

    Parameters
    ----------
    data_csv : str
        Path of the CSV file to read.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(n_rows, 2)`` holding columns 0 and 1 of every
        non-blank row. Rows are shuffled in place with ``np.random.shuffle``,
        so the order differs between calls unless NumPy's global RNG is seeded.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than two columns.
    """
    with open(data_csv, 'r', newline='') as file:
        # A blank line parses to an empty list; skip it instead of crashing on row[0].
        rows = [row for row in csv.reader(file, delimiter=',') if row]

    # Pre-sizing keeps the (0, 2) shape for an empty file.
    dataset = np.empty((len(rows), 2), dtype=object)
    for idx, row in enumerate(rows):
        dataset[idx, 0] = row[0]
        dataset[idx, 1] = row[1]
    np.random.shuffle(dataset)
    return dataset
    
def path_to_trace(path):
    """Load one event file and derive its class label.

    Parameters
    ----------
    path : str
        Path of an HDF5 event file located directly inside its class folder
        (e.g. ``.../noise/<file>``).

    Returns
    -------
    tuple
        ``(trace_array, label, info)`` where ``trace_array`` is a ``(3, 6001)``
        float array filled from the ``traces`` dataset, ``label`` is the class
        label and ``info`` is the parsed ``event_info`` JSON.
    """
    trace_array = np.empty((3,6001))
    with h5py.File(path, 'r') as dp:
        trace_array[:3] = dp.get('traces')
        # NOTE(review): assumes 'event_info' stringifies to a JSON document via
        # str(np.array(...)) -- confirm this holds for the installed h5py version.
        info = np.array(dp.get('event_info'))
        info = json.loads(str(info))

    # The class folder is the file's parent directory. Taking the last-but-one
    # path component (instead of a fixed index 1) works no matter how many '/'
    # or '\\' separators the dataset root itself contains.
    parts = path.replace('\\', '/').split('/')
    folder = parts[-2] if len(parts) >= 2 else ''

    # No event type for noise, so handling that below
    if folder == 'noise':
        label = 'noise'
    else:
        label = info['event_type']
    # Since we consider induced earthquakes as earthquakes we need to handle that as well:
    if label == "induced or triggered event":
        label = "earthquake"
    return trace_array, label, info

def assert_true_labels(ds):
    """Check every ``(path, label)`` row against its file and its class folder.

    For each row the label is re-derived with ``path_to_trace``. Non-noise
    labels are pluralised with a trailing ``'s'`` so they can be compared with
    the name of the file's parent folder (``earthquake`` -> ``earthquakes``).

    Parameters
    ----------
    ds : iterable of (str, str)
        Pairs of event file path and the label stored for it.

    Raises
    ------
    AssertionError
        If folder name, stored label and re-derived label do not all agree.
    """
    for path, label in ds:
        _, label_from_trace, _ = path_to_trace(path)
        if label_from_trace != 'noise':
            label_from_trace = label_from_trace + 's'
            label = label + 's'
        # The class folder is the parent directory of the file. A fixed index 1
        # is only correct when the dataset root contains no '/' at all.
        parts = path.replace('\\', '/').split('/')
        folder = parts[-2] if len(parts) >= 2 else ''
        assert folder == label == label_from_trace, f'Mismatch between {folder} and {label} and {label_from_trace}, at path: {path}'


def get_class_distribution_from_csv(data_csv):
    """Count how many rows of a ``path,label`` file belong to each class.

    The label is the second comma-separated field of a line with trailing
    whitespace removed. Rows with any other label still count towards the total.

    Parameters
    ----------
    data_csv : str
        Path of the file to scan.

    Returns
    -------
    tuple of int
        ``(nr_earthquakes, nr_explosions, nr_noise, nr_total)``.
    """
    tally = {"earthquake": 0, "explosion": 0, "noise": 0}
    seen = 0
    with open(data_csv) as handle:
        for line in handle:
            kind = line.split(',')[1].rstrip()
            if kind in tally:
                tally[kind] += 1
            seen += 1
    return tally["earthquake"], tally["explosion"], tally["noise"], seen
    
def even_classes_csv(data_csv, balanced_csv_name):
    """Write a class-balanced subset of ``data_csv`` to ``balanced_csv_name``.

    Rows are grouped by label (``earthquake``, ``explosion``, ``noise``); rows
    with any other label, blank rows and rows with fewer than two columns are
    ignored. The same number of rows -- the size of the smallest class -- is
    drawn at random from each class and written as ``path,label`` records.

    Earlier revisions of this function truncated the output file without ever
    writing to it and compared the singular CSV labels with the plural folder
    names, so no row was ever selected.

    Parameters
    ----------
    data_csv : str
        Path of the comma-separated ``path,label`` file to read.
    balanced_csv_name : str
        Path of the CSV file to create (overwritten if present).

    Returns
    -------
    None
        The per-class sample size and the written counts are printed.
    """
    class_names = ("earthquake", "explosion", "noise")
    rows_by_class = {name: [] for name in class_names}

    # Read the input completely before opening the output, so the input is never
    # truncated if both names happen to point at the same file.
    with open(data_csv, 'r', newline='') as file:
        for row in csv.reader(file, delimiter=','):
            if len(row) < 2:
                continue
            label = row[1].rstrip()
            if label in rows_by_class:
                rows_by_class[label].append([row[0], label])

    min_class_nr = min(len(rows) for rows in rows_by_class.values())
    print(min_class_nr)

    class_count = []
    with open(balanced_csv_name, 'w+', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        for name in class_names:
            random_samples = random.sample(rows_by_class[name], min_class_nr)
            class_count.append(len(random_samples))
            writer.writerows(random_samples)
    print(class_count)

def even_classes_csv2(data_csv, balanced_csv_name, output_folder='.'):
    """Write a class-balanced ``path,label`` CSV sampled from ``data_csv``.

    Rows loaded with ``csv_to_numpy`` are grouped into earthquake, explosion
    and noise lists. The size of the smallest list is drawn at random from each
    list and written, class by class, as comma-separated ``path,label`` records.

    Parameters
    ----------
    data_csv : str
        Path of the ``path,label`` file to read.
    balanced_csv_name : str
        File name of the CSV to create.
    output_folder : str, optional
        Directory in which ``balanced_csv_name`` is created. Defaults to the
        working directory so the two-argument call form keeps working.

    Returns
    -------
    None
        Sample size and per-class counts are printed.
    """
    csv_numpy = csv_to_numpy(data_csv)

    earthquake_list = []
    explosion_list = []
    noise_list = []
    for path, label in csv_numpy:
        if label == "explosion":
            explosion_list.append([path, label])
        elif label == "earthquake":
            earthquake_list.append([path, label])
        elif label == "noise":
            noise_list.append([path, label])
    pure_classes = [earthquake_list, explosion_list, noise_list]

    # Size the sample from the lists that are actually sampled; a separately
    # computed count could exceed a list's length and make random.sample raise.
    min_class_nr = min(len(single_class) for single_class in pure_classes)
    print(min_class_nr)
    print(len(pure_classes[0]))

    final_class_count = []
    with open(output_folder + '/' + balanced_csv_name, 'w+', newline='') as file:
        # Path and label are written as two comma-separated fields. Writing one
        # pre-joined "path,label" field with a space delimiter wrapped every
        # path containing a space in quote characters, corrupting the file.
        writer = csv.writer(file, delimiter=',')
        for single_class in pure_classes:
            print(f'Sample_size = {min_class_nr} single_class length: {len(single_class)}' )
            random_samples = random.sample(single_class, min_class_nr)
            final_class_count.append(len(random_samples))
            writer.writerows(random_samples)
        print(final_class_count)
            
            
        
        
def generate_subset_csv(input_ds, output_csv_name, output_folder):
    """Write the rows of ``input_ds`` to ``output_folder/output_csv_name``.

    Parameters
    ----------
    input_ds : sequence of sequences
        Rows to write; every row becomes one comma-separated record.
    output_csv_name : str
        File name of the CSV to create (overwritten if present).
    output_folder : str
        Directory for the file; it is created if it does not exist yet.

    Returns
    -------
    None
        A one-line summary is printed.
    """
    nr_rows = len(input_ds)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    with open(output_folder + '/' + output_csv_name, 'w+', newline='') as file:
        # Keep the default quote character. With quotechar=' ' every field that
        # contains a space (e.g. a path under "Master Thesis") was wrapped in
        # spaces and had its inner spaces doubled, so it could not be read back.
        writer = csv.writer(file, delimiter = ',', quoting = csv.QUOTE_MINIMAL)
        for row in input_ds:
            writer.writerow(row)
    print(f'Completed writing {nr_rows} rows to {output_folder}/{output_csv_name}.')
        
        

In [14]:
# Build the event listing: scans the class folders under root and writes the
# result to output_folder/data_csv.
# NOTE(review): the cells below open data_csv by its bare name (relative to the
# working directory) rather than inside output_folder -- confirm that both
# refer to the same file.
generate_event_csv(root, data_csv, output_folder)

Completed earthquakes


KeyboardInterrupt: 

In [7]:
# Count the rows per class label in data_csv (opened relative to the working directory).
# A class count of 0 means no row's second comma-separated field equals that label.
nr_eq, nr_ex, nr_noise, nr_total = get_class_distribution_from_csv(data_csv)
print(f'Earthquakes: {nr_eq}, Explosions: {nr_ex}, Noise: {nr_noise}, Total: {nr_total}')

Earthquakes: 0, Explosions: 0, Noise: 0, Total: 206250


### Splitting data

##### Balanced data

In [6]:
# Generate balanced data csv using data_csv.
# even_classes_csv2 takes the output folder as its third argument; '.' writes
# balanced_csv into the working directory, which is where the following cell
# reads it from.
even_classes_csv2(data_csv, balanced_csv, '.')

0
0
Sample_size = 0 single_class length: 0
Sample_size = 0 single_class length: 0
Sample_size = 0 single_class length: 0
[0, 0, 0]


In [7]:
# Load the balanced listing (csv_to_numpy returns the rows already shuffled).
ds = csv_to_numpy(balanced_csv)

# First split: 80% training, 20% held out.
# Second split: the held-out 20% is divided 70/30 into validation and test,
# i.e. roughly 14% validation and 6% test of the whole set.
# No random_state is passed, so the split differs between runs.
train_ds, test_val_ds = train_test_split(ds, test_size = 0.2)
val_ds, test_ds = train_test_split(test_val_ds, test_size = 0.3)

In [8]:
# Sanity check: sizes of the three subsets, then their sum.
print(len(train_ds), len(val_ds), len(test_ds))
print(len(train_ds) + len(val_ds) + len(test_ds))

16444 2878 1234
20556


In [9]:
# Re-derive every row's label from its event file and compare it with the
# stored label and the class folder; raises AssertionError on the first mismatch.
assert_true_labels(ds)

##### Creating the balanced split CSVs

In [10]:

root_sub = 'balanced'
output_folder = f"csv_folder_3_class/{root_sub}"

# Write one CSV per subset, in the order: test set, validation set, train set.
for output_csv_name, subset in (
    ("test_set.csv", test_ds),
    ("validation_set.csv", val_ds),
    ("train_set.csv", train_ds),
):
    generate_subset_csv(subset, output_csv_name, output_folder)


Completed writing 1234 rows to csv_folder_3_class/balanced/test_set.csv.
Completed writing 2878 rows to csv_folder_3_class/balanced/validation_set.csv.
Completed writing 16444 rows to csv_folder_3_class/balanced/train_set.csv.


In [11]:
# Reload each written subset and verify its labels against the event files.
for split_csv in (
    'csv_folder_3_class/balanced/test_set.csv',
    'csv_folder_3_class/balanced/validation_set.csv',
    'csv_folder_3_class/balanced/train_set.csv',
):
    assert_true_labels(csv_to_numpy(split_csv))