In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import train_test_split
import glob
import os
from tqdm import tqdm


In [31]:
def gen_features(X):
    strain = []
    strain.append(X.mean())
    strain.append(X.std())
    strain.append(X.min())
    strain.append(X.max())
    strain.append(X.kurtosis())
    strain.append(X.skew())
    # strain.append(np.quantile(X,0.95))
    # strain.append(np.quantile(X,0.90))
    # strain.append((X.loc[abs(X - X.mean()) < 20]).kurtosis()) # truncated kurtosis 
    # strain.append(np.quantile(X,0.75) - np.quantile(X,0.25)) #iqr
    ### std_nopeak: https://stackoverflow.com/questions/51006163/pandas-how-to-detect-the-peak-points-outliers-in-a-dataframe
    # df = X.copy(deep = True) #temp df
    # import stats
    # df_Z = df[(np.abs(stats.zscore(df)) < 2).all(axis=1)] # Use z-score of 2 to remove peaks
    # ix_keep = df_Z.index
    # df_keep = df.loc[ix_keep] # Subset the raw dataframe with the indexes you'd like to keep
    # strain.append(df_keep.std())
    ### mfcc - https://www.kaggle.com/ilu000/1-private-lb-kernel-lanl-lgbm/
    # import librosa
    # mfcc = librosa.feature.mfcc(X.values)
    # strain.append(mfcc.mean(axis=1))
    ### power spectrum
    # from scipy.signal import find_peaks
    # strain.append(X.find_peaks(height=100)) # peak count
    # strain.append(np.quantile(X.rolling(50).std().dropna(), 0.2)) # rolling stdev
    return pd.Series(strain)

In [170]:
train = pd.read_csv('LANL-Earthquake-Prediction/train.csv', iterator=True, chunksize=150_000, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

X_train = pd.DataFrame()
y_train = pd.Series(dtype=np.float64)
for df in train:
    ch = gen_features(df['acoustic_data'])
    X_train = X_train.append(ch, ignore_index=True)
    y_train = y_train.append(pd.Series(df['time_to_failure'].values[-1]))

In [162]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.884113,5.101106,-98.0,104.0,33.662481,-0.024061,11.0,10.0,2.402105,4.0
1,4.725767,6.588824,-154.0,181.0,98.758517,0.390561,12.0,10.0,2.527633,5.0
2,4.906393,6.967397,-106.0,140.0,33.555211,0.217391,13.0,10.0,2.554792,5.0
3,4.90224,6.922305,-199.0,197.0,116.548172,0.757278,12.0,10.0,2.568523,5.0
4,4.90872,7.30111,-126.0,145.0,52.977905,0.064531,12.0,10.0,2.709675,5.0


In [57]:
# colNames = list(X_train.columns)
# colNames[0]

0

In [59]:
# X_train.drop(colNames[10], axis=1, inplace=True)
# X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.884113,5.101106,-98.0,104.0,33.662481,-0.024061,11.0,10.0,2.402105,4.0
1,4.725767,6.588824,-154.0,181.0,98.758517,0.390561,12.0,10.0,2.527633,5.0
2,4.906393,6.967397,-106.0,140.0,33.555211,0.217391,13.0,10.0,2.554792,5.0
3,4.90224,6.922305,-199.0,197.0,116.548172,0.757278,12.0,10.0,2.568523,5.0
4,4.90872,7.30111,-126.0,145.0,52.977905,0.064531,12.0,10.0,2.709675,5.0


In [176]:
feature_names = list(X_train.columns)

In [67]:
print(feature_names)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [174]:
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.25)

In [177]:
train_pool = Pool(train_X, train_y, feature_names=feature_names)
test_pool = Pool(test_X, test_y, feature_names=feature_names)

In [178]:
model = CatBoostRegressor(iterations=500, random_seed=0)
summary = model.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=list(range(train_pool.num_col())),
    num_features_to_select=5,
    steps=3,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Silent',
    plot=True
)

The number of features selection steps (3) is greater than the number of features to eliminate (1). The number of steps was reduced to 1.


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [179]:
print('Selected features:', summary['selected_features_names'])

Selected features: ['0', '1', '2', '3', '4']


In [180]:
train_pool = Pool(X_train, y_train)
m = CatBoostRegressor(iterations=10000, loss_function='MAE', boosting_type='Ordered')
m.fit(X_train, y_train, silent=True)
m.best_score_

{'learn': {'MAE': 1.432196220426527}}

In [67]:
### OLD WAY FOR LOADING ALL TEST FILES: (from https://www.kaggle.com/afsanehm/lanl-earthquake-prediction)
submission = pd.read_csv('LANL-Earthquake-Prediction/sample_submission.csv')
submission.shape

X_test = pd.DataFrame(index=range(submission.shape[0]), dtype=np.float32)

# Load files located in the test directory
test_files = glob.glob(os.path.join('LANL-Earthquake-Prediction/test', '*_*.*'))

# Feature engineering for the test set
# Iterate over all files in the test directory and
# create the test dataframe with new features
all_test_files = len(test_files)
#all_test_files =600
for seg_id in tqdm(range(all_test_files)):
    seg = pd.read_csv(test_files[seg_id])
    ch = gen_features(X_test)
    # X_test = X_test.append(ch, ignore_index=True)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

  0%|          | 1/2624 [00:00<01:34, 27.85it/s]


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [111]:
### NEW WAY FOR LOADING ONE TEST FILE (based on how loading training data above, since not shown in the github repo)
test = pd.read_csv('LANL-Earthquake-Prediction/test/seg_0a0fbb.csv')

X_test = pd.DataFrame()
y_test = pd.Series(dtype=np.float64)
ch = gen_features(test['acoustic_data'])
X_test = X_test.append(ch, ignore_index=True)
# no y_test because no time to failure column

In [171]:
train = pd.read_csv('LANL-Earthquake-Prediction/train.csv', iterator=True, chunksize=150_000, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
for chunk in train:
    print(chunk)
    break

# train_pool.num_row()
# X_test
# x_test = pd.DataFrame(X_test.values)
# x_test

        acoustic_data  time_to_failure
0                  12         1.469100
1                   6         1.469100
2                   8         1.469100
3                   5         1.469100
4                   8         1.469100
...               ...              ...
149995              1         1.430797
149996              6         1.430797
149997              6         1.430797
149998              2         1.430797
149999              0         1.430797

[150000 rows x 2 columns]


In [186]:
# Making prediction for 
# model = CatBoostRegressor(iterations=2,
#                           learning_rate=1,
#                           depth=2)
# train_labels = list(range(len(X_train)))
# model.fit(X_train, train_labels)
# x_test = pd.DataFrame(X_test.values)
preds = m.predict(X_test)
print(preds)

[9.01240502]
