In [1]:
import numpy as np
import os
import sys
import pandas as pd
import seaborn as sns

import pylab as pl
import h5py

# These environment variables are set *before* TensorFlow is imported.
# TensorFlow's native runtime reads TF_CPP_MIN_LOG_LEVEL the first time it logs
# (which can already happen during import), so assigning it afterwards may have
# no effect. CUDA_VISIBLE_DEVICES restricts the process to the first GPU.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import tensorflow as tf
from tensorflow.keras import mixed_precision
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Let TensorFlow allocate GPU memory on demand instead of reserving all of it.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


from sklearn.metrics import confusion_matrix

# Root directory of the thesis project. The notebook changes into it before the
# project-local imports below — presumably so that `Classes` and `GlobalUtils`
# resolve from the working directory (TODO confirm).
# NOTE(review): this is a hard-coded absolute path to a local drive; the cell
# fails on any machine where that path does not exist.
base_dir = '/media/tord/T7/Thesis_ssd/MasterThesis3'
os.chdir(base_dir)
from Classes.DataProcessing.LoadData import LoadData
from Classes.DataProcessing.HelperFunctions import HelperFunctions
from Classes.DataProcessing.DataHandler import DataHandler
from Classes.Modeling.TrainSingleModelRam import TrainSingleModelRam
from Classes.DataProcessing.RamLoader import RamLoader

from GlobalUtils import GlobalUtils

utils = GlobalUtils()


# NOTE(review): `sys` is already imported at the top of this cell; this repeat is redundant.
import sys


# Shared helper instance; later cells use it to load models and run predictions.
helper = HelperFunctions()

# Turn on XLA JIT compilation and make mixed_float16 the global Keras dtype policy.
tf.config.optimizer.set_jit(True)
mixed_precision.set_global_policy('mixed_float16')

1 Physical GPUs, 1 Logical GPUs

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: GeForce RTX 3090, compute capability 8.6


In [2]:
# Keyword options forwarded verbatim to LoadData. With use_true_test_set=True
# the recorded output reports that the true test set was loaded together with
# a train set used for fitting the preprocessing.
load_args = dict(
    earth_explo_only=False,
    noise_earth_only=False,
    noise_not_noise=False,
    downsample=True,
    upsample=True,
    frac_diff=1,
    seed=1,
    subsample_size=0.25,
    balance_non_train_set=False,
    use_true_test_set=True,
    even_balance=False,
)
loadData = LoadData(**load_args)
train_ds, test_ds = loadData.train, loadData.test
handler = DataHandler(loadData)



If this is an error, please set use_true_test_set = False and reload the kernel
Mapping train redundancy: : [--------------------------------------->] 100 %

Mapping test redundancy: : [--------------------------------------->] 100 %



Loaded true test set, accompanied by a train set for preprocessing fitting.
Distribution (Label: (counts, proportion)) of
Train ds:
earthquake: (26527, 0.3371)  |  explosion: (25650, 0.3259)  |  noise: (26524, 0.337)  
Test ds:
earthquake: (698, 0.0396)  |  explosion: (8365, 0.4742)  |  noise: (8579, 0.4863)  


In [3]:
# Preprocessing / augmentation settings handed to RamLoader.
use_time_augmentor, use_noise_augmentor = True, True
scaler_name, filter_name = "normalize", None
band_min, band_max = 2.0, 4.0
highpass_freq = 0.075

shutdown = False

# Number of distinct label codes known to the loader.
num_classes = len(set(loadData.label_dict.values()))

ram_loader_options = dict(
    use_time_augmentor=use_time_augmentor,
    use_noise_augmentor=use_noise_augmentor,
    scaler_name=scaler_name,
    filter_name=filter_name,
    band_min=band_min,
    band_max=band_max,
    highpass_freq=highpass_freq,
    load_test_set=True,
    meier_load=False,
)
ramLoader = RamLoader(loadData, handler, **ram_loader_options)

# NOTE(review): the output saved with this notebook ends in
# "ValueError: not enough values to unpack (expected 5, got 3)" while the test
# set was being loaded, so in the recorded run the three names below were never
# assigned and every later cell is unexecuted.
x_test, y_test, noiseAug = ramLoader.load_to_ram()

Initializing loading of the test set
Step 1: Fit augmentors and scalers on training data
Fitting train time augmentor: [--------------------------------------->] 100 %

Fit process completed after 449.16010069847107 seconds. Total datapoints fitted: 78701.
Average time per datapoint: 0.005707171455235271


Stage one loading training set, timeAug: [--------------------------------------->] 100 %

Fit process of normalizer skipped as unecessary


Stage two loading training set, labels and normalize scaler: [--------------------------------------->] 100 %

Fitting noise augmentor: [--------------------------------------->] 100 %

Step 2: Load and transform the test set, using the previously fitted scaler and augmentors
Fitting test time augmentor: [--------------------------------------->] 100 %

Fit process completed after 158.3871157169342 seconds. Total datapoints fitted: 17642.
Average time per datapoint: 0.008977843539107482


Stage one loading test set, timeAug: [-------------------

ValueError: not enough values to unpack (expected 5, got 3)

In [None]:
# Best performing model on Noise vs Not-Noise (3N).
save_dir_3n, model_name_3n = '/media/tord/T7/Thesis_ssd/SavedModels/CNN', 'CNN_3N_final'
model_3n_path = f'{save_dir_3n}/{model_name_3n}'

# Best performing model on Explosion vs Earthquake (EE).
save_dir_ee, model_name_ee = '/media/tord/T7/Thesis_ssd/SavedModels/CNN', 'CNN_EE_final'
model_ee_path = f'{save_dir_ee}/{model_name_ee}'

# Both models were trained with a batch size of 256; predictions below are
# made with a batch size of 1.
print(model_3n_path)
model_3n = helper.load_model(model_3n_path)
model_ee = helper.load_model(model_ee_path)

def predict_final_model(loadData, model_3n, model_ee, test_traces, test_labels, noiseAug, num_channels_3n, num_channels_ee):
    """Predict final string labels with the two-stage (3N -> EE) pipeline.

    Stage one runs the noise / not-noise model on every test trace. Every trace
    predicted as not-noise is then passed to the earthquake / explosion model,
    and its stage-one label is replaced by the stage-two label. The result is
    returned in the order of the test set.

    This function assumes the data was preprocessed with the normalize scaler.
    Predictions are requested with a batch size of 1 so that no trailing,
    partially filled batch is dropped (this relies on the models not using
    batch normalization).

    Parameters
    ----------
    loadData : object providing noise_not_noise_dict() and earth_explo_dict().
    model_3n, model_ee : the noise/not-noise and earthquake/explosion models.
    test_traces, test_labels : test inputs and their labels, aligned on axis 0.
    noiseAug : noise augmentor forwarded to helper.predict_RamGenerator.
    num_channels_3n, num_channels_ee : channel counts forwarded to the helper.

    Returns
    -------
    Array with one label per test trace.
    """
    noise_not_noise_dict = loadData.noise_not_noise_dict()
    earth_explo_dict = loadData.earth_explo_dict()

    # Stage 1: noise vs. not-noise on the complete test set.
    predictions_3n = helper.predict_RamGenerator(model_3n, test_traces, test_labels, 1, True, noiseAug, None, num_channels_3n)
    # Flattened so that an (N, 1) model output becomes one value per trace.
    # NOTE(review): assumes a single sigmoid output per trace — confirm.
    rounded_3n_predictions = np.ravel(np.rint(predictions_3n))
    converted_3n_predictions = convert_predictions(rounded_3n_predictions, noise_not_noise_dict)

    # Positions (in test-set order) of the traces predicted as not-noise.
    not_noise_indexes = get_not_noise_indexes(rounded_3n_predictions, noise_not_noise_dict)
    if isinstance(not_noise_indexes, tuple):
        # np.where returns a tuple of index arrays; the first holds the rows.
        not_noise_indexes = not_noise_indexes[0]
    if len(not_noise_indexes) == 0:
        # Everything was predicted as noise: nothing to send to the EE model.
        return converted_3n_predictions

    # Stage 2: earthquake vs. explosion on the not-noise subset only. The
    # labels are sliced from test_labels (not from the traces) so that traces
    # and labels stay aligned.
    # NOTE(review): assumes predict_RamGenerator accepts these labels as-is — confirm.
    not_noise_traces = np.asarray(test_traces)[not_noise_indexes]
    not_noise_labels = np.asarray(test_labels)[not_noise_indexes]
    predictions_ee = helper.predict_RamGenerator(model_ee, not_noise_traces, not_noise_labels, 1, True, noiseAug, None, num_channels_ee)
    rounded_ee_predictions = np.ravel(np.rint(predictions_ee))

    # Overwrite the "not-noise" entries with their earthquake/explosion labels.
    return convert_ee_predictions(rounded_ee_predictions, earth_explo_dict, converted_3n_predictions, not_noise_indexes)
    
def get_not_noise_indexes(rounded_predictions, noise_not_noise_dict):
    """Return the positions of the predictions equal to the "not-noise" code.

    The predictions are flattened first. Without that, a column-shaped (N, 1)
    model output makes np.where return a (rows, cols) pair, which selects the
    wrong elements when used to index a multi-dimensional trace array.

    Parameters
    ----------
    rounded_predictions : array-like of rounded predictions, one per trace.
    noise_not_noise_dict : mapping that contains the key "not-noise".

    Returns
    -------
    One-element tuple holding a 1-D integer index array (the np.where
    convention), directly usable to index along the first axis.
    """
    flat_predictions = np.ravel(rounded_predictions)
    not_noise_indexes = np.where(flat_predictions == noise_not_noise_dict["not-noise"])
    return not_noise_indexes
    
def get_not_noise_traces(predictions, test_traces, noise_not_noise_dict, test_labels=None):
    """Select the traces (and labels) whose prediction rounds to "not-noise".

    Parameters
    ----------
    predictions : array-like of raw predictions, one per trace; (N,) or (N, 1).
    test_traces : array-like of traces, indexed along the first axis.
    noise_not_noise_dict : mapping that contains the key "not-noise".
    test_labels : optional array-like of labels aligned with test_traces.
        When omitted, the selected traces are returned in the label slot as
        well, which is what this function did before the parameter existed.

    Returns
    -------
    (not_noise_traces, not_noise_labels)
    """
    # Flatten so that an (N, 1) model output yields one boolean per trace.
    not_noise_mask = np.ravel(np.rint(predictions)) == noise_not_noise_dict["not-noise"]
    not_noise_traces = np.asarray(test_traces)[not_noise_mask]
    # Labels come from test_labels when supplied; the trace fallback is kept
    # only for backward compatibility with three-argument callers.
    label_source = test_traces if test_labels is None else test_labels
    not_noise_labels = np.asarray(label_source)[not_noise_mask]
    return not_noise_traces, not_noise_labels

def convert_ee_predictions(rounded_predictions, label_dict, converted_3n_predictions, not_noise_indexes):
    """Merge the earthquake/explosion predictions into the stage-one labels.

    Parameters
    ----------
    rounded_predictions : rounded EE-model outputs, one per not-noise trace.
    label_dict : earthquake/explosion label mapping, passed to convert_predictions.
    converted_3n_predictions : stage-one labels, one per test trace.
    not_noise_indexes : positions of the not-noise traces in the test set,
        either an index array or the tuple returned by np.where.

    Returns
    -------
    New 1-D object array: a copy of converted_3n_predictions whose entries at
    not_noise_indexes are replaced by the converted EE predictions. The input
    array is not modified.

    Raises
    ------
    ValueError : if the number of EE predictions differs from the number of
        not-noise positions (for example when the predictor dropped samples).
    """
    # np.where hands back a tuple of index arrays; the first one holds the
    # positions along the sample axis. Iterating the tuple itself (as the
    # previous loop did) visits it once and writes the first EE prediction
    # into every not-noise position.
    if isinstance(not_noise_indexes, tuple):
        not_noise_indexes = not_noise_indexes[0]
    positions = np.ravel(np.asarray(not_noise_indexes)).astype(int)

    # Object dtype so that string labels of any length can be stored.
    final_predictions = np.array(np.ravel(converted_3n_predictions), dtype=object)
    converted_ee_predictions = np.ravel(convert_predictions(rounded_predictions, label_dict))
    if len(converted_ee_predictions) != len(positions):
        raise ValueError(
            f"Got {len(converted_ee_predictions)} EE predictions for {len(positions)} not-noise traces"
        )
    final_predictions[positions] = converted_ee_predictions
    return final_predictions
    
def convert_predictions(rounded_predictions, label_dict):
    """Translate rounded numeric predictions into their labels.

    Parameters
    ----------
    rounded_predictions : array-like of rounded predictions; (N,) or (N, 1).
    label_dict : mapping between labels and integer codes. Both orientations
        are accepted: {label: code} (e.g. {"noise": 0, "not-noise": 1}) and
        {code: label}.

    Returns
    -------
    1-D object array with one label per prediction. A prediction whose code is
    not present in label_dict becomes None.
    """
    # Build a code -> label lookup. Looking a prediction up directly in a
    # {label: code} dict never matches, so such a dict is inverted here.
    code_to_label = {}
    for key, value in label_dict.items():
        if isinstance(key, str):
            code_to_label[int(value)] = key
        else:
            code_to_label[int(key)] = value

    # One plain Python int per prediction, regardless of input shape or dtype.
    codes = np.rint(np.ravel(np.asarray(rounded_predictions, dtype=float))).astype(int)

    # Object dtype: a float array cannot hold label strings.
    transformed_predictions = np.empty(codes.shape, dtype=object)
    for idx, code in enumerate(codes.tolist()):
        transformed_predictions[idx] = code_to_label.get(code)
    return transformed_predictions

In [None]:
def evaluate_full_pipeline(loadData, model_3n, model_ee, test_traces, test_labels, noiseAug, num_channels_3n, num_channels_ee):
    """Evaluate the two-stage pipeline against the true test labels.

    Parameters are forwarded unchanged to predict_final_model.
    NOTE(review): test_labels is looked up label-by-label in the
    noise/explosion/earthquake mapping below, so it is assumed to hold those
    three strings — confirm against what the loader returns.

    Returns
    -------
    conf : 3x3 confusion matrix from tf.math.confusion_matrix.
    report : text report from sklearn's classification_report.
    precision, recall, fscore : macro-averaged precision, recall and F-beta
        score with beta = 2.
    """
    test_pred = predict_final_model(loadData, model_3n, model_ee, test_traces, test_labels, noiseAug, num_channels_3n, num_channels_ee)
    final_dict = {"noise" : 0, "explosion" : 1, "earthquake" : 2}
    class_names = list(final_dict)
    class_ids = list(final_dict.values())

    # Encode predicted and true labels as integer class ids (0, 1, 2).
    final_preds = [final_dict[x] for x in test_pred]
    final_true = [final_dict[x] for x in test_labels]
    assert len(final_preds) == len(final_true), f"preds len {len(final_preds)}, true len {len(final_true)}"

    conf = tf.math.confusion_matrix(final_true, final_preds, num_classes = len(final_dict))
    # precision_recall_fscore_support returns four values; the last one
    # (support) is not needed here.
    precision, recall, fscore, _ = precision_recall_fscore_support(final_true, final_preds, beta = 2, average = "macro", zero_division = 0.0)
    # Explicit labels keep target_names aligned with the class ids even when a
    # class is absent from both the predictions and the true labels.
    report = classification_report(final_true, final_preds, labels = class_ids, target_names = class_names, zero_division = 0)
    return conf, report, precision, recall, fscore
    

In [None]:
# Evaluate the two-stage pipeline on the loaded test set; the trailing 3, 3 are
# num_channels_3n and num_channels_ee.
conf, report, precision, recall, fscore = evaluate_full_pipeline(loadData, model_3n, model_ee, x_test, y_test, noiseAug, 3, 3)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes, in
# the order noise (0), explosion (1), earthquake (2).
conf

In [None]:
# classification_report returns one multi-line string; print it so the table
# is rendered with real line breaks instead of as a repr with "\n" escapes.
print(report)

In [None]:
# Macro-averaged precision, recall and F-beta score (beta = 2).
precision, recall, fscore