In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

## Load files, select according to frequency

In [2]:
# Root of the project data; every other location is derived from it.
# Note: it already ends with '/', so the derived paths contain '//'.
base_path = '/project2/ishanu/YI_EARTHQUAKE_current/'

# Directory that receives the selection list and the generated datasets.
save_path = f'{base_path}/transformer/datasets/'

# All entries directly under <base_path>/split, stored as Path objects in a
# NumPy array so they can be filtered with a boolean mask later on.
split_dir = Path(f'{base_path}/split')
fnames = np.array([entry for entry in split_dir.glob('*')])

In [None]:
# load split files and record, for each one, the fraction of entries > 0
freqs = []
for fname in fnames:
    with open(fname, 'r') as fh:
        # genfromtxt returns a 0-d array for a single-value file; atleast_1d
        # keeps the size check and the mean below valid in that case
        data = np.atleast_1d(np.genfromtxt(fh))
    # an empty file contains no positive entry, so its frequency is 0
    # (this also avoids a division by zero after a long run of parsing)
    freq = float(np.mean(data > 0)) if data.size else 0.0
    freqs.append(freq)
freqs = np.array(freqs)

# select the sequences whose frequency lies strictly between lower and upper
# (both bounds are exclusive)
# NOTE: only use lower and upper with one decimal place
# if using numbers with higher precision,
# make sure to modify the filename
lower, upper = .1, .2
fnames_selected = fnames[ (freqs > lower) & (freqs < upper) ]
# round() rather than int(): int() truncates, so a product that lands just
# below a whole number because of float rounding would get the wrong tag
stem_fname = f'{save_path}/split_selected_{round(lower * 10)}-{round(upper * 10)}.txt'
with open(stem_fname, 'w') as fh:
    # one selected path per line
    np.savetxt(fh, fnames_selected, fmt='%s')

## Form and save train and test datasets

In [None]:
def get_train_test(data, len_train, input_len=50, output_len=1):
    """
    Form train and test input-output pairs with a sliding window of stride 1.

    Each pair is (input, output) where `input` is `input_len` consecutive
    elements of `data` and `output` is the `output_len` elements that
    immediately follow them.

    Input
        data:       sliceable sequence supporting len() (list, 1-D array, ...)
        len_train:  number of leading elements of `data` that form the
                    training span; the remainder is the test span
        input_len:  number of elements in every input window
        output_len: number of elements in every output window

    Output
        (train_input, train_output, test_input, test_output), four lists of
        slices of `data`.
        Train pairs lie entirely inside data[:len_train].
        Test outputs lie entirely inside data[len_train:]; test inputs are the
        `input_len` elements preceding each output, so they may reach back
        into the training span.

    Only complete windows are emitted: a window that would start before the
    beginning of `data` or run past its end is skipped, so every input has
    exactly `input_len` elements and every output exactly `output_len`.
    """
    total_len = input_len + output_len

    # Form train input and output.
    # A train window has to fit inside the training span AND inside the data
    # itself; without the min() a len_train larger than len(data) would yield
    # truncated inputs and empty outputs.
    train_end = min(len_train, len(data))
    num_train_pairs = max(train_end - total_len + 1, 0)
    train_input = [data[i: i + input_len] for i in range(num_train_pairs)]
    train_output = [data[i + input_len: i + total_len] for i in range(num_train_pairs)]

    # Form test input and output.
    num_test_pairs = (len(data) - len_train) - output_len + 1
    # Skip the leading test positions that do not have input_len elements of
    # history yet: their slice start would be negative, and a negative start
    # silently wraps around to the end of `data`.
    first_test_pair = max(input_len - len_train, 0)
    test_input = []
    test_output = []
    for i in range(first_test_pair, num_test_pairs):
        target_start = len_train + i
        test_input.append(data[target_start - input_len: target_start])
        test_output.append(data[target_start: target_start + output_len])

    return train_input, train_output, test_input, test_output

In [None]:
def save_input_output(fname, input, output):
    """
    Write input-output pairs to a text file, one pair per line.

    Each line holds the elements of one input joined by single spaces, a tab,
    then the elements of the matching output joined by single spaces.

    fname:  path of the file to (over)write
    input:  sequence of iterables, one per sample
    output: sequence of iterables, same length as `input`

    Raises AssertionError when `input` and `output` differ in length.
    """
    assert len(input) == len(output), 'input and output have differnet length'

    # Render every line before touching the file, so a failure while
    # formatting does not leave a truncated file behind.
    rows = []
    for sample, target in zip(input, output):
        left = ' '.join(str(value) for value in sample)
        right = ' '.join(str(value) for value in target)
        rows.append(f'{left}\t{right}\n')

    with open(fname, 'w') as fh:
        fh.write(''.join(rows))

###  Get train and test length

In [None]:
# Daily calendars for the training window (2009-01-01 .. 2019-12-31) and the
# test window (2020-01-01 .. 2020-08-21).
train_days = pd.date_range(start='2009-01-01', end='2019-12-31', freq='1D')
test_days = pd.date_range(start='2020-01-01', end='2020-08-21', freq='1D')

# Floor-divide the day counts by 3 to obtain the lengths used downstream.
# NOTE(review): presumably one step of the split sequences spans 3 days -- confirm.
len_train, len_test = (len(days) // 3 for days in (train_days, test_days))
print(f'train length = {len_train}')
print(f'test length = {len_test}')

In [None]:
# Window sizes: every sample pairs `input_len` consecutive steps with the
# `output_len` steps that follow them.
input_len, output_len = 50, 1

# Build one train file and one test file for every sequence listed in the
# selection file (one path per line).
with open(f'{save_path}/split_selected_1-2.txt', 'r') as fh:
    for line in fh:
        fname = line.strip()

        # Binarize: every strictly positive value becomes 1, everything else
        # is left untouched, then the whole array is cast to int.
        data = np.genfromtxt(fname)
        data = np.where(data > 0, 1, data).astype(int)

        splits = get_train_test(data, len_train, input_len, output_len)
        train_input, train_output, test_input, test_output = splits

        # Output files are named after the last path component of the source.
        prefix = fname.rsplit('/', 1)[-1]
        train_fname = f'{save_path}/{prefix}_train.dat'
        test_fname = f'{save_path}/{prefix}_test.dat'

        save_input_output(train_fname, train_input, train_output)
        save_input_output(test_fname, test_input, test_output)