## Subsetting to maximize free memory
Preprocessing and fitting all data at once crashes my system, so I am using the following procedure to reduce memory demands during model training.

a) divide data into subsets of ~2500  

b) create a train-test-split for each subset  
* maintain consistent pos:neg ratio throughout all subsets  

c) import a subset, use it to train the model, delete subset before importing the next one for further training  

This notebook reorganizes the spectrograms created in the prior notebook by completing steps a and b above, then pickles each subset so it can be loaded and easily unpacked for model training later.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import gc

from sklearn.model_selection import train_test_split

import random
random.seed(42)

In [2]:
#folder holding the pickled spectrogram files; show what is available
in_path = '../scratch_data/tts_arrays_sm/'
files = os.listdir(path=in_path)
files

['tts_3_neg.pkl',
 'tts_2_pos.pkl',
 'tts_2_neg.pkl',
 'tts_1_pos.pkl',
 'tts_1_neg.pkl',
 'tts_4_neg.pkl',
 'tts_5_neg.pkl',
 'tts_masked_neg.pkl',
 'tts_masked_pos.pkl',
 'tts_6_neg.pkl']

In [3]:
#load the un-augmented spectrograms and pool them by class
in_path = '../scratch_data/tts_arrays_sm/'
#sorted: os.listdir returns entries in arbitrary order, and the seeded shuffle
#further down is only reproducible if the lists are always built in the same order
files = sorted(os.listdir(in_path))
#keep pickles only (stray folder entries would crash pickle.load) and skip "masked" sets
target_files = [i for i in files if i.endswith('.pkl') and "masked" not in i] #no augmentation

pos_specs = []
neg_specs = []
for fname in target_files:
    #NOTE: pickle.load can execute arbitrary code; only read files you produced yourself
    with open(os.path.join(in_path, fname), mode='rb') as pickle_in:
        loaded = pickle.load(pickle_in)
    #class is taken from the filename: anything containing 'pos' is positive
    if 'pos' in fname:
        pos_specs += loaded
    else:
        neg_specs += loaded
    #drop the extra reference so the last file's raw arrays are not kept alive
    #once the pooled lists are replaced by processed copies
    del loaded

In [4]:
#class balance of the full dataset; this is the ratio every tts chunk should keep
n_pos = len(pos_specs)
n_neg = len(neg_specs)
n_all = n_pos + n_neg
print(f'# positives: {n_pos}\n# negatives: {n_neg}\n')
print(f'% pos (minority): {n_pos/n_all}')
print(f'% neg (majority): {n_neg/n_all}')

# positives: 5588
# negatives: 16766

% pos (minority): 0.24997763263845396
% neg (majority): 0.750022367361546


In [5]:
#standard scaler for each spectrogram
#opt to standardize per input, not use mean/std of whole dataset
#could still use re-scaling to get to smaller range, but sticking with this for now
def _standardize(spec):
    """Return spec shifted to zero mean and scaled to unit standard deviation.

    A constant spectrogram has std == 0, and dividing by it would fill the
    array with NaN. In that case the scale falls back to 1, so the result is
    an all-zero array instead.
    """
    std = np.std(spec)
    return (spec - np.mean(spec)) / (std if std > 0 else 1.0)

pos_specs = [_standardize(i) for i in pos_specs]
neg_specs = [_standardize(i) for i in neg_specs]

In [6]:
#specs arrive grouped by deployment site, so randomize the order within each
#class before cutting chunks (positives first, then negatives)
for spec_list in (pos_specs, neg_specs):
    random.shuffle(spec_list)

In [7]:
#pair every spectrogram with its class label: 1 = positive, 0 = negative
pos_specs = [(spec, 1) for spec in pos_specs]
neg_specs = [(spec, 0) for spec in neg_specs]

In [8]:
#subsets totalling 2500 with a 1:3 ratio (p:n), i.e. 25% positives
#= 625 positives : 1875 negatives per subset
CHUNK_SIZE = 2500
POS_PER_CHUNK = CHUNK_SIZE // 4              #625
NEG_PER_CHUNK = CHUNK_SIZE - POS_PER_CHUNK   #1875
poschunks = [pos_specs[i:i+POS_PER_CHUNK] for i in range(0, len(pos_specs), POS_PER_CHUNK)]
negchunks = [neg_specs[i:i+NEG_PER_CHUNK] for i in range(0, len(neg_specs), NEG_PER_CHUNK)]
print(len(poschunks), len(negchunks))
print(len(poschunks[-1]), len(negchunks[-1])) #ratio still good in leftovers

9 9
588 1766


In [12]:
#concatenate pos and negatives to make whole tts set
#chunks are paired by position; with unequal counts the pairing would either
#run off the end of negchunks or silently leave negative chunks unused,
#so stop here with a clear message instead
if len(poschunks) != len(negchunks):
    raise ValueError(f'chunk count mismatch: {len(poschunks)} pos vs {len(negchunks)} neg')
tts_list = [pos + neg for pos, neg in zip(poschunks, negchunks)]
len(tts_list)

9

In [13]:
#perform tts on each list item, export for use in model training sessions
out_path = '../scratch_data/tts_chunked_sm/'
os.makedirs(out_path, exist_ok=True) #open(..., 'wb') will not create a missing folder
spec_shape = (307, 460, 1) #per-spectrogram shape after reshape; trailing axis of size 1 added

for idx, chunk in enumerate(tts_list):
    temp_spec, temp_lab = zip(*chunk) #unzip
    temp_spec = np.array(temp_spec)
    temp_lab = np.array(temp_lab)
    #no test_size given, so scikit-learn's default split applies;
    #stratify on the labels so train and test both keep the pos:neg ratio
    X_train, X_test, y_train, y_test = train_test_split(temp_spec, temp_lab,
                                                        random_state=42,
                                                        stratify=temp_lab)
    X_train = X_train.reshape(X_train.shape[0], *spec_shape)
    X_test = X_test.reshape(X_test.shape[0], *spec_shape)
    #one pickle per split, each a list of (spectrogram, label) pairs
    for split_name, X_part, y_part in (('train', X_train, y_train),
                                       ('test', X_test, y_test)):
        with open(os.path.join(out_path, f'ch{idx}_{split_name}.pkl'), mode='wb') as pickle_out:
            pickle.dump(list(zip(X_part, y_part)), pickle_out)

See you in modeling