In [1]:
import os
import sys
import glob
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
# Load the full samples CSV from S3 into a DataFrame.
samples_csv_uri = 's3://stormpetrels/samples/labels/samples_all.csv'
df_all = pd.read_csv(samples_csv_uri)

In [3]:
# Column names used as model inputs. The final list is kept in plain string
# sort order (uppercase before lowercase, and 'yaafe_MFCC.10' before
# 'yaafe_MFCC.2'), which is exactly what sorted() produces.
_scalar_features = [
    'onset', 'offset',
    'freq_mean', 'freq_median', 'freq_mode', 'freq_peak',
    'freq_Q25', 'freq_Q75',
    'pitch_min', 'pitch_max', 'pitch_mean', 'pitch_median',
    'pitch_Q25', 'pitch_Q75', 'pitch_IQR',
    'yaafe_LPC',
    'yaafe_SpectralFlatness', 'yaafe_SpectralFlux',
    'yaafe_SpectralRolloff', 'yaafe_SpectralVariation',
]
# Multi-coefficient yaafe families: (family name, number of coefficients).
_indexed_features = [
    'yaafe_%s.%d' % (family, index)
    for family, count in (('LSF', 10), ('MFCC', 13), ('OBSI', 9))
    for index in range(count)
]
nice_features = sorted(_scalar_features + _indexed_features)

In [4]:
# Keep every row, but only the columns listed in nice_features.
df = df_all.loc[:, nice_features]

In [5]:
# Target vector: the 'petrel' column of the samples table.
y = df_all.loc[:, 'petrel']

In [6]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition the same from run to run.
split_parts = train_test_split(df, y, random_state=42, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Wrap each split in an XGBoost DMatrix, passing the labels as a plain array.
dtrain, dtest = (
    xgb.DMatrix(features, label=labels.values)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [11]:
# Booster configuration for the binary classifier.
# 'eval_metric' is pinned to 'error' so the per-round log always reports the
# classification error rate; when left unset, the metric XGBoost picks for
# 'binary:logistic' depends on the installed release.
# NOTE(review): 'silent' has been superseded by 'verbosity' in newer XGBoost
# releases — confirm against the installed version before switching.
param = {
    'max_depth': 3,
    'eta': 0.1,
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'subsample': 0.8,
    'gamma': 0.1,
}
# Datasets scored after every boosting round, with the names shown in the log.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [12]:
# Train for a fixed number of boosting rounds, logging the metric for each
# watchlist entry after every round.
n_boost_rounds = 60
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=n_boost_rounds,
    evals=watchlist,
)

[0]	eval-error:0.027074	train-error:0.017978
[1]	eval-error:0.019214	train-error:0.017228
[2]	eval-error:0.024454	train-error:0.017228
[3]	eval-error:0.024454	train-error:0.017978
[4]	eval-error:0.022707	train-error:0.017603
[5]	eval-error:0.018341	train-error:0.016479
[6]	eval-error:0.018341	train-error:0.017228
[7]	eval-error:0.018341	train-error:0.016854
[8]	eval-error:0.018341	train-error:0.017603
[9]	eval-error:0.018341	train-error:0.017603
[10]	eval-error:0.018341	train-error:0.017603
[11]	eval-error:0.017467	train-error:0.017228
[12]	eval-error:0.017467	train-error:0.017228
[13]	eval-error:0.017467	train-error:0.017603
[14]	eval-error:0.017467	train-error:0.017603
[15]	eval-error:0.017467	train-error:0.017603
[16]	eval-error:0.017467	train-error:0.017978
[17]	eval-error:0.017467	train-error:0.017603
[18]	eval-error:0.017467	train-error:0.017228
[19]	eval-error:0.017467	train-error:0.017603
[20]	eval-error:0.017467	train-error:0.017603
[21]	eval-error:0.017467	train-error:0.01760

In [32]:
# Read every HDF5 feature table under ../data/h5 and tag its rows with the
# file's base name (extension stripped) in a 'filename' column.
# The paths are sorted because glob returns matches in arbitrary order; a fixed
# order keeps the rows of the combined table, and any predictions made from it,
# reproducible between runs.
h5_files = sorted(glob.glob('../data/h5/*.h5'))
dfs = []
for path in h5_files:
    # Use a dedicated name here: reusing `df` would overwrite the training
    # feature table that the train/test split cell reads.
    file_df = pd.read_hdf(path)
    name = os.path.splitext(os.path.basename(path))[0]
    file_df['filename'] = name
    dfs.append(file_df)

In [33]:
# Stack the collected frames row-wise into a single table. `dfs` is rebound
# from a list to a DataFrame here, so this cell is not meant to be run twice
# without re-running the loading cell first.
dfs = pd.concat(objs=dfs, axis=0)

In [34]:
# Rename 'freq_peak.1' to 'freq_peak' so the column name matches the entry in
# nice_features, then select the model input columns.
column_fixes = {'freq_peak.1': 'freq_peak'}
dfs = dfs.rename(columns=column_fixes)
df_pred = dfs.loc[:, nice_features]

In [35]:
# Unlabelled DMatrix holding the rows to score.
p = xgb.DMatrix(data=df_pred)

In [36]:
# Score every row with the trained booster.
pred = bst.predict(data=p)

In [37]:
# Store the raw model output and a boolean flag for outputs above 0.5.
is_petrel = pred > 0.5
dfs['petrel_prob'] = pred
dfs['petrel'] = is_petrel

In [29]:
# Display the dimensions of the prediction array.
pred.shape

(483956,)

In [38]:
# Display the column labels of the combined table as an array.
dfs.columns.values

array(['onset', 'offset', 'freq_mean', 'freq_median', 'freq_mode',
       'freq_Q25', 'freq_Q75', 'freq_IQR', 'freq_peak', 'freq_peak.2',
       'freq_peak.3', 'pitch_median', 'pitch_mean', 'pitch_Q25',
       'pitch_Q75', 'pitch_IQR', 'pitch_min', 'pitch_max',
       'yaafe_Chroma.0', 'yaafe_Chroma.1', 'yaafe_Chroma.2',
       'yaafe_Chroma.3', 'yaafe_Chroma.4', 'yaafe_Chroma.5',
       'yaafe_Chroma.6', 'yaafe_Chroma.7', 'yaafe_Chroma.8',
       'yaafe_Chroma.9', 'yaafe_Chroma.10', 'yaafe_Chroma.11',
       'yaafe_LPC', 'yaafe_LSF.0', 'yaafe_LSF.1', 'yaafe_LSF.2',
       'yaafe_LSF.3', 'yaafe_LSF.4', 'yaafe_LSF.5', 'yaafe_LSF.6',
       'yaafe_LSF.7', 'yaafe_LSF.8', 'yaafe_LSF.9', 'yaafe_MFCC.0',
       'yaafe_MFCC.1', 'yaafe_MFCC.2', 'yaafe_MFCC.3', 'yaafe_MFCC.4',
       'yaafe_MFCC.5', 'yaafe_MFCC.6', 'yaafe_MFCC.7', 'yaafe_MFCC.8',
       'yaafe_MFCC.9', 'yaafe_MFCC.10', 'yaafe_MFCC.11', 'yaafe_MFCC.12',
       'yaafe_OBSI.0', 'yaafe_OBSI.1', 'yaafe_OBSI.2', 'yaafe_OBSI.3',
     