# BASIC IDEA OF THE KERNEL

The data consists of a one-dimensional time series x with about 600 million data points.

At test time, we are given a time series of length 150'000 and have to predict the time remaining until the next earthquake.

The idea of this kernel is to randomly sample chunks of length 150'000 from x, 

derive some features from each chunk, and use them to update the weights of a recurrent neural net with 150'000 / 1000 = 150 time steps.

In [1]:
import numpy as np 
import pandas as pd
import os
from tqdm import tqdm

# Fix seeds
# NumPy's global RNG is seeded first (the data generator draws window positions with
# np.random.randint), then TensorFlow's RNG.
# NOTE(review): `tensorflow.set_random_seed` looks like the TF 1.x API; newer TensorFlow
# versions expose this as `tf.random.set_seed` - confirm against the installed version.
from numpy.random import seed
seed(639)
from tensorflow import set_random_seed
set_random_seed(5944)

In [2]:
# Import
# The training data is read from an HDF5 file. The CSV read below is kept for reference;
# it names the two expected columns ("acoustic_data", "time_to_failure").
#float_data = pd.read_csv("../input/train.csv", dtype={"acoustic_data": np.float32, "time_to_failure": np.float32}).values
df_train = pd.read_hdf("../input/train.hdf", key='0')
# Plain NumPy array: later cells index column 0 as the signal and column 1 as the target.
float_data = df_train.values

In [3]:
def extract_features(z):
    """Summarise every row of a two-dimensional array with four statistics.

    Helper for the data generator; further statistics can be appended to the
    tuple below to extend the feature set.

    Parameters
    ----------
    z : numpy.ndarray
        Two-dimensional array. Each row is reduced along axis 1.

    Returns
    -------
    numpy.ndarray
        One row per row of ``z`` and four columns, in the order
        mean, minimum, maximum, standard deviation.
    """
    row_mean = z.mean(axis=1)
    row_min = z.min(axis=1)
    row_max = z.max(axis=1)
    row_std = z.std(axis=1)
    # Each statistic is a 1-D vector; stack them side by side as columns.
    return np.column_stack((row_mean, row_min, row_max, row_std))

In [4]:
def create_X(x, last_index=None, n_steps=150, step_length=1000):
    """Build the (time steps x features) matrix for the window of ``x`` ending at ``last_index``.

    The ``n_steps * step_length`` values of ``x`` directly before ``last_index`` are split
    into ``n_steps`` consecutive pieces (150 pieces of length 1000 with the defaults, i.e.
    a window of 150'000 values). From each piece a set of features is extracted via
    ``extract_features``: once on the full piece, once on its tail of roughly a tenth of
    the piece and once on its tail of roughly a hundredth of the piece (the last 100 and
    the last 10 observations with the default ``step_length``).

    Parameters
    ----------
    x : numpy.ndarray
        The signal. It is expected to be one-dimensional: ``reshape(n_steps, -1)`` does not
        reject a 2-D slice, it silently interleaves its columns within each piece.
    last_index : int, optional
        Exclusive end position of the window. Defaults to ``len(x)``.
    n_steps : int
        Number of pieces, i.e. time steps of the resulting matrix.
    step_length : int
        Number of values per piece.

    Returns
    -------
    numpy.ndarray
        Array with ``n_steps`` rows and three feature groups side by side.

    Raises
    ------
    AssertionError
        If fewer than ``n_steps * step_length`` values precede ``last_index``.
    """
    # Identity comparison: the idiomatic (and unambiguous) way to test for None.
    if last_index is None:
        last_index = len(x)

    window = n_steps * step_length
    assert last_index - window >= 0, (
        "need at least n_steps * step_length values before last_index"
    )

    # Reshaping and approximate standardization with mean 5 and std 3.
    temp = (x[(last_index - window):last_index].reshape(n_steps, -1) - 5) / 3

    # Note: "-step_length // 10" parses as "(-step_length) // 10", a negative start index
    # that selects the tail of each piece.
    return np.c_[extract_features(temp),
                 extract_features(temp[:, -step_length // 10:]),
                 extract_features(temp[:, -step_length // 100:])]

In [5]:
# Query "create_X" to figure out the number of features.
# Only the signal (column 0) is passed: "float_data" is two-dimensional and also holds the
# target in column 1, and handing over both columns would mix target values into each piece.
X = create_X(float_data[0:150000, 0])
n_features = X.shape[1]
print("Our RNN is based on %i features"% n_features)

Our RNN is based on 12 features


In [6]:
# Shape check of the feature matrix built by "create_X" (rows are time steps, columns are features).
X.shape

(150, 12)

In [7]:
def generator(data, min_index=0, max_index=None, batch_size=16, n_steps=150, step_length=1000):
    """Endlessly yield random mini-batches ``(samples, targets)`` for training or validation.

    For every batch, ``batch_size`` ending positions are drawn uniformly from
    ``[min_index + n_steps * step_length, max_index)``. For each ending position the
    feature matrix is built by ``create_X`` from column 0 of ``data``, and column 1 at
    the last row of the window serves as the target.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array; column 0 is the signal, column 1 the target.
    min_index : int
        First row a window may start at.
    max_index : int, optional
        Exclusive upper bound for the ending positions. Defaults to ``len(data) - 1``.
    batch_size : int
        Number of windows per batch.
    n_steps, step_length : int
        Passed through to ``create_X``; their product is the window length.

    Yields
    ------
    samples : numpy.ndarray
        float64 array of shape ``(batch_size, n_steps, number of features)``. The feature
        dimension is taken from what ``create_X`` returns rather than from a notebook global.
    targets : numpy.ndarray
        float64 array of shape ``(batch_size,)``.

    Raises
    ------
    ValueError
        On the first ``next()`` call if the index range leaves no room for a full window.
    """
    if max_index is None:
        max_index = len(data) - 1

    window = n_steps * step_length
    lowest_end = min_index + window
    if lowest_end >= max_index:
        raise ValueError(
            "no room for a window of %i values between min_index=%i and max_index=%i"
            % (window, min_index, max_index)
        )

    # The signal column is the same for every batch, so slice it once.
    signal = data[:, 0]

    while True:
        # Pick indices of ending positions
        rows = np.random.randint(lowest_end, max_index, size=batch_size)

        # One feature matrix per ending position, stacked along a new batch axis.
        samples = np.array(
            [create_X(signal, last_index=row, n_steps=n_steps, step_length=step_length)
             for row in rows],
            dtype=np.float64,
        )
        # The target belongs to the last observation inside each window (row - 1).
        targets = data[rows - 1, 1].astype(np.float64)
        yield samples, targets

In [8]:
# Position of second (of 16) earthquake. Used to have a clean split
# between train and validation
second_earthquake = 50085877
# Sanity check: show the target (column 1) at this row; it should be close to zero.
float_data[second_earthquake, 1]

0.0006954822

In [9]:
# Sanity check: draw one batch from the training range (everything after the second
# earthquake) by asking the generator for it, instead of repeating the generator body here.
data = float_data

batch_size = 16
n_steps = 150
step_length = 1000
min_index = second_earthquake + 1
max_index = len(data) - 1

samples, targets = next(generator(data,
                                  min_index=min_index,
                                  max_index=max_index,
                                  batch_size=batch_size,
                                  n_steps=n_steps,
                                  step_length=step_length))

In [10]:
# Initialize generators
batch_size = 32

# Training windows start after the second earthquake, validation windows end before it,
# so the two generators never draw from the same part of the series.
# For a better score (without the clean split), train on the whole series instead:
#     train_gen = generator(float_data, batch_size=batch_size)
train_gen = generator(
    float_data,
    min_index=second_earthquake + 1,
    batch_size=batch_size,
)
valid_gen = generator(
    float_data,
    max_index=second_earthquake,
    batch_size=batch_size,
)

In [13]:
# Define model
from keras.models import Sequential
from keras.layers import Dense, CuDNNGRU
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint, TensorBoard

# Callbacks handed to the fit call: write "model.hdf5" whenever the monitored value has
# improved, checking only every third epoch (period=3).
# Optional TensorBoard logging: TensorBoard(log_dir="tflog/", histogram_freq=1)
cb = [ModelCheckpoint("model.hdf5", period=3, save_best_only=True)]

# A single GRU layer reads the sequence of feature vectors (any number of time steps),
# followed by a small dense head that outputs one number per sample.
model = Sequential([
    CuDNNGRU(48, input_shape=(None, n_features)),
    Dense(10, activation='relu'),
    Dense(1),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnngru_2 (CuDNNGRU)       (None, 48)                8928      
_________________________________________________________________
dense_3 (Dense)              (None, 10)                490       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 9,429
Trainable params: 9,429
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile and fit model
# Mean absolute error ("mae") is the loss that is minimised.
model.compile(optimizer=adam(lr=0.0005), loss="mae")

# Both generators are endless, so the amount of data per epoch is set explicitly:
# "steps_per_epoch" training batches and "validation_steps" validation batches.
# The returned history object is what the plotting cell further down reads.
history = model.fit_generator(train_gen,
                              steps_per_epoch=1000,
                              epochs=30,
                              verbose=2,
                              callbacks=cb,
                              validation_data=valid_gen,
                              validation_steps=200)

Epoch 1/30
 - 30s - loss: 2.0844 - val_loss: 1.7549
Epoch 2/30
 - 29s - loss: 2.0863 - val_loss: 1.6739
Epoch 3/30
 - 30s - loss: 2.0929 - val_loss: 1.7704
Epoch 4/30
 - 34s - loss: 2.0835 - val_loss: 1.8700
Epoch 5/30
 - 34s - loss: 2.0768 - val_loss: 1.6996
Epoch 6/30
 - 32s - loss: 2.0630 - val_loss: 1.8198
Epoch 7/30
 - 32s - loss: 2.0776 - val_loss: 1.8230
Epoch 8/30
 - 32s - loss: 2.0831 - val_loss: 1.7619
Epoch 9/30
 - 32s - loss: 2.0827 - val_loss: 1.8405
Epoch 10/30
 - 32s - loss: 2.0630 - val_loss: 1.6870
Epoch 11/30
 - 32s - loss: 2.0531 - val_loss: 1.6264
Epoch 12/30
 - 32s - loss: 2.0445 - val_loss: 1.5964
Epoch 13/30
 - 32s - loss: 2.0432 - val_loss: 1.6392
Epoch 14/30
 - 32s - loss: 2.0435 - val_loss: 1.6984
Epoch 15/30
 - 32s - loss: 2.0333 - val_loss: 1.6971
Epoch 16/30
 - 32s - loss: 2.0392 - val_loss: 1.5970
Epoch 17/30
 - 32s - loss: 2.0390 - val_loss: 1.7006
Epoch 18/30
 - 33s - loss: 2.0451 - val_loss: 1.5917
Epoch 19/30
 - 33s - loss: 2.0241 - val_loss: 1.7006
Ep

In [None]:
# Visualize training and validation loss per epoch
import matplotlib.pyplot as plt

def perf_plot(history, what = 'loss'):
    """Plot a training metric and its validation counterpart against the epoch number.

    Parameters
    ----------
    history
        Object with a ``history`` mapping (metric name -> per-epoch values) and an
        ``epoch`` sequence, as returned by the fit call.
    what : str
        Name of the metric; the validation series is looked up as ``'val_' + what``.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    train_values = history.history[what]
    valid_values = history.history['val_' + what]
    # Epochs are stored zero-based; shift them so the axis starts at 1.
    epoch_numbers = np.asarray(history.epoch) + 1

    # (values, matplotlib format string, legend label): dots for training, line for validation.
    curves = (
        (train_values, 'bo', "Training " + what),
        (valid_values, 'b', "Validation " + what),
    )
    for values, fmt, legend_label in curves:
        plt.plot(epoch_numbers, values, fmt, label = legend_label)

    plt.title("Training and validation " + what)
    plt.xlabel("Epochs")
    plt.legend()
    plt.show()

# Plot the loss curves (the default metric) recorded in "history".
perf_plot(history)

In [None]:
# Load submission file
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id', dtype={"time_to_failure": np.float32})

# Load each test segment, create the feature matrix, get numeric prediction.
# NOTE(review): predictions use the weights held in memory after the last epoch; the
# "model.hdf5" checkpoint written during training is not loaded anywhere in this notebook.
for seg_id in tqdm(submission.index):
    seg = pd.read_csv('../input/test/' + seg_id + '.csv')
    x = seg['acoustic_data'].values
    # The model sees a batch of one window and returns an array of shape (1, 1).
    prediction = model.predict(np.expand_dims(create_X(x), 0))
    # Label-based scalar assignment: writes into the frame itself, which the chained
    # "submission.time_to_failure[i] = ..." form does not guarantee.
    submission.at[seg_id, 'time_to_failure'] = prediction[0, 0]

# Save
submission.to_csv('submission.csv')

# Last expression of the cell, so the notebook actually displays the preview.
submission.head()