In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

In [9]:
def gen_features(X):
    strain = []
    strain.append(X.mean())
    strain.append(X.std())
    strain.append(X.min())
    strain.append(X.max())
    strain.append(X.kurtosis())
    strain.append(X.skew())
    strain.append(np.quantile(X,0.95))
    strain.append(np.quantile(X,0.90))
    strain.append((X.loc[abs(X - X.mean()) < 20]).kurtosis()) # truncated kurtosis 
    strain.append(np.quantile(X,0.75) - np.quantile(X,0.25)) #iqr
    strain.append(X.rolling(100).std()) # rolling stdev
    return pd.Series(strain)

In [11]:
train = pd.read_csv('LANL-Earthquake-Prediction/train.csv', iterator=True, chunksize=150_000, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

X_train = pd.DataFrame()
y_train = pd.Series(dtype=np.float64)
for df in train:
    ch = gen_features(df['acoustic_data'])
    X_train = X_train.append(ch, ignore_index=True)
    y_train = y_train.append(pd.Series(df['time_to_failure'].values[-1]))

In [8]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.884113,5.101106,-98.0,104.0,33.662481,-0.024061,11.0,10.0,2.402105,4.0
1,4.725767,6.588824,-154.0,181.0,98.758517,0.390561,12.0,10.0,2.527633,5.0
2,4.906393,6.967397,-106.0,140.0,33.555211,0.217391,13.0,10.0,2.554792,5.0
3,4.90224,6.922305,-199.0,197.0,116.548172,0.757278,12.0,10.0,2.568523,5.0
4,4.90872,7.30111,-126.0,145.0,52.977905,0.064531,12.0,10.0,2.709675,5.0


In [7]:
train_pool = Pool(X_train, y_train)
m = CatBoostRegressor(iterations=10000, loss_function='MAE')
m.fit(X_train, y_train, silent=True)
m.best_score_

{'learn': {'MAE': 1.0687883324981735}}