In [1]:
# The data consists of a one-dimensional time series x with 600 million data points.
# At test time, we will see a time series of length 150,000 and must predict the time until the next earthquake.
# The idea of this kernel is to randomly sample chunks of length 150,000 from x, derive some
# features from each chunk, and use them to update the weights of a recurrent neural net with
# 150,000 / 1000 = 150 time steps.

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from numpy.random import seed
# Seed NumPy's global random generator so that the random numbers drawn later
# (e.g. by np.random.randint) are the same on every fresh run.
seed(639)

In [2]:
# Number of rows to read from the training CSV (passed to pd.read_csv as "nrows").
nrows = 10 ** 7

# Number of sub-series drawn per batch by the data generators.
batch_size = 32

In [3]:
# Row index used as the upper sampling bound ("max_index") of the validation generator.
second_earthquake = 50085877

# Directory that contains train.csv. Set the LANL_DATA_DIR environment variable to run the
# notebook on another machine; the original local path is kept as the fallback.
src_dir = os.environ.get('LANL_DATA_DIR', '/run/media/hoosiki/WareHouse3/mtb/datasets/LANL')

In [4]:
# Load the first "nrows" rows of the training file, storing both columns as 32-bit floats.
train_csv = src_dir + '/train.csv'
column_dtypes = {'acoustic_data': np.float32, 'time_to_failure': np.float32}
df_train = pd.read_csv(train_csv, nrows = nrows, dtype = column_dtypes)

# Preview the first ten rows.
df_train.head(10)

Unnamed: 0,acoustic_data,time_to_failure
0,12.0,1.4691
1,6.0,1.4691
2,8.0,1.4691
3,5.0,1.4691
4,8.0,1.4691
5,8.0,1.4691
6,9.0,1.4691
7,7.0,1.4691
8,-5.0,1.4691
9,3.0,1.4691


In [5]:
# Work on the underlying NumPy array from here on: column 0 holds "acoustic_data",
# column 1 holds "time_to_failure" (same column order as the DataFrame).
data_train = df_train.to_numpy()

In [6]:
# Helper function for the data generator. Summarises every row (time step) of a
# two dimensional array by its mean, minimum, maximum and standard deviation.
# Can easily be extended with further statistics.
def extract_features(z):
    """Return per-row summary statistics of the two dimensional array ``z``.

    The result has one row per row of ``z`` and four columns, in this order:
    mean, min, max, std (each computed along axis 1).
    """
    row_stats = (z.mean(axis = 1),
                 z.min(axis = 1),
                 z.max(axis = 1),
                 z.std(axis = 1))
    # Each statistic is a 1-D vector; place them side by side as columns.
    return np.column_stack(row_stats)

In [7]:
# For a given ending position "last_index", we split the last 150,000 values
# of "x" into 150 pieces of length 1000 each. So n_steps * step_length should equal 150,000.
# From each piece, a set of features is extracted. This results in a feature matrix
# of dimension (150 time steps x features).
def create_X(x, last_index = None, n_steps = 150, step_length = 1000):
    """Build the (n_steps x features) matrix for the window of ``x`` ending at ``last_index``.

    Parameters
    ----------
    x : array
        Signal values, sliced along the first axis. The window
        ``x[last_index - n_steps * step_length:last_index]`` is used.
    last_index : int, optional
        Exclusive end position of the window. Defaults to ``len(x)``.
    n_steps : int
        Number of time steps (rows of the result).
    step_length : int
        Number of values per time step.

    Returns
    -------
    Array with ``n_steps`` rows. Its columns are the outputs of ``extract_features``
    for the full step, for its last ``step_length / 10`` values and for its last
    ``step_length / 100`` values, side by side.

    Raises
    ------
    AssertionError
        If the window would start before the beginning of ``x`` or end after its end.
    """
    if last_index is None:
        last_index = len(x)

    window_length = n_steps * step_length
    assert last_index - window_length >= 0, \
        'window of %i values ending at %i starts before the beginning of x' % (window_length, last_index)
    # Without this check the slice below is silently truncated and the reshape can still
    # succeed with shorter steps, yielding features of the wrong windows.
    assert last_index <= len(x), \
        'last_index %i lies beyond the end of x (length %i)' % (last_index, len(x))

    # Reshaping and approximate standardization with mean 5 and std 3.
    temp = (x[(last_index - window_length):last_index].reshape(n_steps, -1) - 5) / 3

    # Extracts features of sequences of full length 1000, of the last 100 values and finally also
    # of the last 10 observations.
    # Note: "-step_length // 10" is evaluated as "(-step_length) // 10" (floor division of the
    # negated length), which gives the intended negative start offset of the tail slice.
    return np.c_[extract_features(temp),
                 extract_features(temp[:, -step_length // 10:]),
                 extract_features(temp[:, -step_length // 100:])]

In [8]:
# Query "create_X" to figure out the number of features. Only the signal column (column 0)
# is passed, as in the generator; slicing both columns would interleave the
# "time_to_failure" values into the windows that are reshaped into time steps.
n_features = create_X(data_train[0:150000, 0]).shape[1]
print('Our RNN is based on %i features' % n_features)

Our RNN is based on 12 features


In [9]:
# The generator endlessly selects "batch_size" ending positions of sub-time series. For each ending position,
# the "time_to_failure" serves as target, while the features are created by the function "create_X".
def generator(data, min_index = 0, max_index = None, batch_size = 16, n_steps = 150, step_length = 1000):
    """Yield random ``(samples, targets)`` batches forever.

    Parameters
    ----------
    data : two dimensional array
        Column 0 is passed to ``create_X`` as the signal, column 1 supplies the targets.
    min_index : int
        Lowest row that a sampled window may start at.
    max_index : int, optional
        Exclusive upper bound for the sampled ending positions.
        Defaults to ``len(data) - 1``.
    batch_size : int
        Number of windows per batch.
    n_steps, step_length : int
        Forwarded to ``create_X``; a window covers ``n_steps * step_length`` rows.

    Yields
    ------
    samples : array of shape (batch_size, n_steps, n_features)
    targets : array of shape (batch_size,)
        Value of column 1 at the last row of each window.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if the sampling range is empty or reaches
        beyond the end of ``data``.

    Relies on the module-level names ``n_features`` and ``create_X``. The generator
    never terminates, so consume it with ``next()`` or a bounded loop.
    """
    if max_index is None:
        max_index = len(data) - 1

    # Ending positions are drawn from the half-open range [first_end, max_index).
    first_end = min_index + n_steps * step_length

    # The largest position that can be drawn is max_index - 1; it must not exceed len(data),
    # otherwise the window is truncated and the target lookup indexes past the last row.
    if max_index > len(data) + 1:
        raise ValueError('max_index (%i) lies beyond the available data (%i rows)'
                         % (max_index, len(data)))
    if first_end >= max_index:
        raise ValueError('no ending position fits between min_index + n_steps * step_length (%i) '
                         'and max_index (%i)' % (first_end, max_index))

    # Column view of the signal; taken once because it does not change between batches.
    signal = data[:, 0]

    while True:
        # Pick indices of ending positions
        rows = np.random.randint(first_end, max_index, size = batch_size)

        # Initialize feature matrices and targets
        samples = np.zeros((batch_size, n_steps, n_features))
        targets = np.zeros(batch_size, )

        for j, row in enumerate(rows):
            samples[j] = create_X(signal, last_index = row, n_steps = n_steps, step_length = step_length)
            # "row" is an exclusive end position, so the last row inside the window is row - 1.
            targets[j] = data[row - 1, 1]
        yield samples, targets

In [10]:
# Initialize generators. Only "nrows" rows of the file are loaded, so the validation bound
# "second_earthquake" is capped at the last loaded row; otherwise ending positions beyond
# the end of "data_train" would be sampled and the target lookup would fail.
# NOTE(review): both generators start sampling at row 0, so training and validation
# windows can overlap.
gen_train = generator(data_train, batch_size = batch_size)
gen_valid = generator(data_train, batch_size = batch_size,
                      max_index = min(second_earthquake, len(data_train) - 1))

In [None]:
# The generators loop forever, so list(gen) would never return. Draw a fixed number of
# batches instead and stack them into feature and target arrays.
n_train_batches = 100   # number of training batches to materialize; adjust as needed
n_valid_batches = 20    # number of validation batches to materialize; adjust as needed

train_batches = [next(gen_train) for _ in range(n_train_batches)]
x_train = np.concatenate([samples for samples, _ in train_batches])
y_train = np.concatenate([targets for _, targets in train_batches])

valid_batches = [next(gen_valid) for _ in range(n_valid_batches)]
x_valid = np.concatenate([samples for samples, _ in valid_batches])
y_valid = np.concatenate([targets for _, targets in valid_batches])

In [None]:
# Show the dimensions of the training array built in the previous cell.
x_train.shape