In [72]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import train_test_split
import glob
import os
from tqdm import tqdm


In [63]:
def gen_features(X):
    strain = []
    strain.append(X.mean())
    strain.append(X.std())
    strain.append(X.min())
    strain.append(X.max())
    strain.append(X.kurtosis())
    strain.append(X.skew())
    strain.append(np.quantile(X,0.95))
    strain.append(np.quantile(X,0.90))
    strain.append((X.loc[abs(X - X.mean()) < 20]).kurtosis()) # truncated kurtosis 
    strain.append(np.quantile(X,0.75) - np.quantile(X,0.25)) #iqr
    ### std_nopeak: https://stackoverflow.com/questions/51006163/pandas-how-to-detect-the-peak-points-outliers-in-a-dataframe
    # df = X.copy(deep = True) #temp df
    # import stats
    # df_Z = df[(np.abs(stats.zscore(df)) < 2).all(axis=1)] # Use z-score of 2 to remove peaks
    # ix_keep = df_Z.index
    # df_keep = df.loc[ix_keep] # Subset the raw dataframe with the indexes you'd like to keep
    # strain.append(df_keep.std())
    ### mfcc - https://www.kaggle.com/ilu000/1-private-lb-kernel-lanl-lgbm/
    # import librosa
    # mfcc = librosa.feature.mfcc(X.values)
    # strain.append(mfcc.mean(axis=1))
    ### power spectrum
    # from scipy.signal import find_peaks
    # strain.append(X.find_peaks(height=100)) # peak count
    # strain.append(X.rolling(50).std()) # rolling stdev
    return pd.Series(strain)

In [64]:
train = pd.read_csv('LANL-Earthquake-Prediction/train.csv', iterator=True, chunksize=150_000, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

X_train = pd.DataFrame()
y_train = pd.Series(dtype=np.float64)
for df in train:
    ch = gen_features(df['acoustic_data'])
    X_train = X_train.append(ch, ignore_index=True)
    y_train = y_train.append(pd.Series(df['time_to_failure'].values[-1]))

In [65]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.884113,5.101106,-98.0,104.0,33.662481,-0.024061,11.0,10.0,2.402105,4.0
1,4.725767,6.588824,-154.0,181.0,98.758517,0.390561,12.0,10.0,2.527633,5.0
2,4.906393,6.967397,-106.0,140.0,33.555211,0.217391,13.0,10.0,2.554792,5.0
3,4.90224,6.922305,-199.0,197.0,116.548172,0.757278,12.0,10.0,2.568523,5.0
4,4.90872,7.30111,-126.0,145.0,52.977905,0.064531,12.0,10.0,2.709675,5.0


In [57]:
# colNames = list(X_train.columns)
# colNames[0]

0

In [59]:
# X_train.drop(colNames[10], axis=1, inplace=True)
# X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.884113,5.101106,-98.0,104.0,33.662481,-0.024061,11.0,10.0,2.402105,4.0
1,4.725767,6.588824,-154.0,181.0,98.758517,0.390561,12.0,10.0,2.527633,5.0
2,4.906393,6.967397,-106.0,140.0,33.555211,0.217391,13.0,10.0,2.554792,5.0
3,4.90224,6.922305,-199.0,197.0,116.548172,0.757278,12.0,10.0,2.568523,5.0
4,4.90872,7.30111,-126.0,145.0,52.977905,0.064531,12.0,10.0,2.709675,5.0


In [66]:
feature_names = list(X_train.columns)

In [67]:
print(feature_names)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [68]:
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.25)

In [69]:
train_pool = Pool(train_X, train_y, feature_names=feature_names)
test_pool = Pool(test_X, test_y, feature_names=feature_names)

In [70]:
model = CatBoostRegressor(iterations=500, random_seed=0)
summary = model.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=list(range(train_pool.num_col())),
    num_features_to_select=5,
    steps=3,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Silent',
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [71]:
print('Selected features:', summary['selected_features_names'])

Selected features: ['1', '6', '7', '8', '9']


In [15]:
train_pool = Pool(X_train, y_train)
m = CatBoostRegressor(iterations=10000, loss_function='MAE', boosting_type='Ordered')
m.fit(X_train, y_train, silent=True)
m.best_score_

CatBoostError: Input data must have at least one feature

In [75]:
submission = pd.read_csv('LANL-Earthquake-Prediction/sample_submission.csv')
submission.shape

X_test = pd.DataFrame(index=range(submission.shape[0]), dtype=np.float32)

# Load files located in the test directory
test_files = glob.glob(os.path.join('LANL-Earthquake-Prediction/test', '*_*.*'))

# Feature engineering for the test set
# Iterate over all files in the test directory and
# create the test dataframe with new features
all_test_files = len(test_files)
#all_test_files =600
for seg_id in tqdm(range(all_test_files)):
    seg = pd.read_csv(test_files[seg_id])
    gen_features(X_test)

  0%|          | 0/2624 [00:00<?, ?it/s]


IndexError: index -1 is out of bounds for axis 0 with size 0

In [None]:
model = CatBoostRegressor(iterations=2,
                          learning_rate=1,
                          depth=2)
train_labels = list(range(len(X_test)))
# Fit model
model.fit(X_test, train_labels)
# Get predictions
preds = model.predict(X_test)
preds