In [1]:
import numpy as np
import pandas as pd
import keras
import timeit
import copy 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.layers import CuDNNLSTM,CuDNNGRU
from keras.layers import Dense, Dropout, Activation,LSTM,GRU
from keras.preprocessing.sequence import TimeseriesGenerator
import gc
from random import shuffle
from keras.layers import TimeDistributed
from sklearn.externals import joblib 
import scipy as sp

Using TensorFlow backend.


In [2]:
# Definitions
batch_size = 300
total_samples = 629145480
lookback=150
nb_classes=29
lines_to_read=10000000

data_dir = "/home/slow-storage/Kaggle/Earthquake/"
#train_file = data_dir + "train_augment_scaled.csv"
train_file = data_dir + "train.csv"

#modeltype = "lstm"
modeltype = "gru"

In [3]:
train_placeholders = np.arange(0,lines_to_read)

In [4]:
train_size=train_placeholders.shape[0]
train_steps = int(train_size / batch_size)

In [5]:
#scaler = joblib.load('/home/slow-storage/Kaggle/Earthquake/scaler_model.pkl')

In [6]:
traindf = pd.read_csv(train_file,dtype=np.float64,nrows=lines_to_read)

In [7]:
def window_stop(data,LOOKBACK_WINDOW,PREDICT_WINDOW):
    examples = LOOKBACK_WINDOW
    y_examples = PREDICT_WINDOW
    nb_samples = len(data) - examples - y_examples #makes sure it can be split into lookback windows properly
    return nb_samples

def input_features(nb_samples,LOOKBACK_WINDOW,data):
    input_list = [np.expand_dims(data[i:LOOKBACK_WINDOW+i,:], axis=0) for i in range(nb_samples)]
    input_mat = np.concatenate(input_list, axis=0)
    return input_mat


In [1]:
class EarthquakeDataReader(keras.utils.Sequence):

    def __init__(self, placeholders, df, batch_size, steps_per_epoch, lookback, gentype):
        
        self.placeholders = placeholders
        self.batch_size = batch_size
        self.df = df
        self.lookback = lookback
        self.steps_per_epoch = steps_per_epoch
        self.gentype = gentype
        
    def __len__(self):
        return self.steps_per_epoch
    
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.placeholders))

    def __getitem__(self,index):
        
         
        indexes = self.placeholders[index*self.batch_size:(index+1)*self.batch_size]
        indexlist = indexes.tolist()
        xdf = self.df.iloc[indexlist]
        
        xtmp= pd.DataFrame()
        
        xtmp['acoustic_data'] = xdf['acoustic_data'].copy()
        xtmp['fft']        = np.abs(np.fft.fft(xdf['acoustic_data']))
        xtmp['ema']        = xdf['acoustic_data'].ewm(span=1000,adjust=False).mean()
        xtmp['sma']        = xdf['acoustic_data'].rolling(1000,min_periods=1).mean()
        xtmp['volitility'] = xdf['acoustic_data'].rolling(1000,min_periods=1).std(ddof=0)
        xtmp['momentum']   = xdf['acoustic_data'] - xdf['acoustic_data'].shift(1).fillna(0)
        xtmp['min']        = xdf['acoustic_data'].rolling(1000,min_periods=1).min()
        xtmp['max']        = xdf['acoustic_data'].rolling(1000,min_periods=1).max()
        xtmp['sum']        = xdf['acoustic_data'].rolling(1000,min_periods=1).sum()
        xtmp['mean']       = xdf['acoustic_data'].rolling(1000,min_periods=1).mean()
        xtmp['median']     = xdf['acoustic_data'].rolling(1000,min_periods=1).median()
        xtmp['kurtosis']   = xdf['acoustic_data'].rolling(1000,min_periods=1).kurt() 
        xtmp['q95']        = xdf['acoustic_data'].rolling(1000,min_periods=1).quantile(0.95)
        xtmp['q99']        = xdf['acoustic_data'].rolling(1000,min_periods=1).quantile(0.99)
        xtmp['q05']        = xdf['acoustic_data'].rolling(1000,min_periods=1).quantile(0.05)
        xtmp['q01']        = xdf['acoustic_data'].rolling(1000,min_periods=1).quantile(0.01)
        
        xtmp['per_10']     = np.percentile(xdf['acoustic_data'], 10)
        xtmp['per_20']     = np.percentile(xdf['acoustic_data'], 20)
        xtmp['per_30']     = np.percentile(xdf['acoustic_data'], 30)
        xtmp['per_40']     = np.percentile(xdf['acoustic_data'], 40)
        xtmp['per_50']     = np.percentile(xdf['acoustic_data'], 50)
        xtmp['per_60']     = np.percentile(xdf['acoustic_data'], 60)
        xtmp['per_70']     = np.percentile(xdf['acoustic_data'], 70)
        xtmp['per_80']     = np.percentile(xdf['acoustic_data'], 80)
        xtmp['per_90']     = np.percentile(xdf['acoustic_data'], 90)
        
        xtmp['kstat_1']    = sp.stats.kstat(xdf['acoustic_data'], 1)
        xtmp['kstat_2']    = sp.stats.kstat(xdf['acoustic_data'], 2)
        xtmp['kstat_3']    = sp.stats.kstat(xdf['acoustic_data'], 3)
        xtmp['kstat_4']    = sp.stats.kstat(xdf['acoustic_data'], 4)
        
        xtmp = xtmp.fillna(0)
        
        if self.gentype == "train":
            y = xdf['time_to_failure'].to_numpy()
            remove=lookback+1
            y = y[:-remove]
        
        
        
        x = xtmp.to_numpy()
        
        
        del xdf
        del xtmp
        
        LOOKBACK_WINDOW = lookback
        PREDICT_WINDOW = 1

        nb_samples = window_stop(x,LOOKBACK_WINDOW,PREDICT_WINDOW)

        x = input_features(nb_samples,LOOKBACK_WINDOW,x)
        
        del nb_samples 
        gc.collect()
        
        
        if self.gentype == "train":        
            return x,y      
        elif self.gentype == "test":        
            return x

NameError: name 'keras' is not defined

In [9]:
train_gen = EarthquakeDataReader(train_placeholders,traindf,batch_size,train_steps,lookback,"train")


In [10]:
print(type(train_gen))

<class '__main__.EarthquakeDataReader'>


In [11]:
def build_LSTM():
    model = Sequential()

    model.add(LSTM(36,
                kernel_initializer = 'normal',
                input_shape=(lookback,nb_classes),
                #return_sequences=True,
                return_sequences=True,
                activation="relu"
              ))
    model.add(Dropout(.4))
    model.add(LSTM(18,
                kernel_initializer = 'normal',
                return_sequences=False,
                activation='relu'
                #activation='relu'
              ))
    model.add(Dropout(.4))
    model.add(Dense(128,
                kernel_initializer = 'normal',
                activation="relu"
               ))
    model.add(Dropout(.4))
    model.add(Dense(16,
                kernel_initializer = 'normal',
                activation="relu"
               ))
    model.add(Dropout(.4))
    model.add(Dense(1,
                #activation='linear'
                activation=None
               ))
    return model
    

In [12]:
def build_GRU():
    model = Sequential()

    model.add(GRU(36,
                kernel_initializer = 'normal',
                input_shape=(lookback,nb_classes),
                #return_sequences=True,
                return_sequences=True,
                activation="relu"
              ))
    model.add(Dropout(.4))
    model.add(GRU(18,
                kernel_initializer = 'normal',
                return_sequences=False,
                activation='relu'
                #activation='relu'
              ))
    model.add(Dropout(.4))
    model.add(Dense(128,
                kernel_initializer = 'normal',
                activation="relu"
               ))
    model.add(Dropout(.4))
    model.add(Dense(16,
                kernel_initializer = 'normal',
                activation="relu"
               ))
    model.add(Dropout(.4))
    model.add(Dense(1,
                #activation='linear'
                activation=None
               ))
    return model
    

In [13]:
if modeltype == "lstm":
    model = build_LSTM()
    
if modeltype == "gru":
    model = build_GRU()

#optim = Adam(lr = 0.0001)
optim = Adam(lr = 0.0001)

model.compile(loss='mae', optimizer=optim)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 150, 36)           7128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 36)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 18)                2970      
_________________________________________________________________
dropout_2 (Dropout)          (None, 18)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               2432      
_________________________________________________________________
dropout_3 (Dropout)  

In [14]:
best_model = data_dir + modeltype + '_weights_best.hdf5'
current_model = data_dir + modeltype + '_weights_current.hdf5'



checkpoint = ModelCheckpoint(best_model, 
                             monitor="loss",
                             verbose=1, 
                             save_best_only=True, 
                             mode='min')


callbacks_list = [checkpoint]


In [15]:
model.load_weights(best_model)
print("Train Size:", train_steps)
#print("Valid Size:", valid_steps)

history = model.fit_generator(train_gen,
                    shuffle=False,
                    steps_per_epoch=train_steps, 
                    epochs=130,
                    verbose=1,
                    max_queue_size=1000,
                    #initial_epoch=1,
                    #pickle_safe=True,
                    use_multiprocessing=False,          
                    #use_multiprocessing=True,
                    #validation_data=valid_gen,
                    #validation_steps=valid_steps,
                    workers=5,         
                    callbacks=callbacks_list
                   )

Train Size: 33333
Instructions for updating:
Use tf.cast instead.
Epoch 1/130
 1662/33333 [>.............................] - ETA: 2:16:36 - loss: 0.4475

KeyboardInterrupt: 

In [17]:

model.save_weights(current_model)

In [18]:

outdir="/home/slow-storage/Kaggle/Earthquake/test"

In [19]:
import os
files = os.listdir(outdir)
files.sort() 


test_placeholders = np.arange(0,150000)
test_size=test_placeholders.shape[0]
test_steps = int(test_size / batch_size)

model.load_weights(best_model)

final_pred = {}

for file in files:
    
    testdf = pd.read_csv(outdir+"/"+file)
    testdf = testdf.fillna(0)
    
    test_gen = EarthquakeDataReader(test_placeholders,testdf,batch_size,test_steps,lookback,"test")
    
    predictions = model.predict_generator(
                                         test_gen,
                                         workers=3,
                                         steps=test_steps,
                                         use_multiprocessing=True
                                         )
    last = np.take(predictions, [0])
    final_pred[file] = last
    print(file,":",last)

seg_00030f.csv : [5.1416483]
seg_0012b5.csv : [5.1416483]
seg_00184e.csv : [5.1416483]
seg_003339.csv : [5.1416483]
seg_0042cc.csv : [5.1416483]


Process ForkPoolWorker-32:
Traceback (most recent call last):
Process ForkPoolWorker-31:
Process ForkPoolWorker-33:
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/site-packages/keras/utils/data_utils.py", line 401, in get_index
    return _SHARED_SEQUENCES[uid][i]
  File "<ipython-input-8-ffc558d19cfc>", line 81, in __getitem__
    x = input_features(nb_samples,LOOKBACK_WINDOW,x)


KeyboardInterrupt: 

  File "<ipython-input-7-0777cde4f660>", line 8, in input_features
    input_list = [np.expand_dims(data[i:LOOKBACK_WINDOW+i,:], axis=0) for i in range(nb_samples)]
  File "<ipython-input-7-0777cde4f660>", line 8, in <listcomp>
    input_list = [np.expand_dims(data[i:LOOKBACK_WINDOW+i,:], axis=0) for i in range(nb_samples)]
Process ForkPoolWorker-35:
KeyboardInterrupt
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/slow-storage/local/Anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
 