# Initial Setup

In [None]:
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import style
style.use('ggplot')

import seaborn as sns
sns.set()

from IPython.display import HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

from os import listdir
print(listdir("../input"))

import timeit
from tqdm import tqdm

# Training Set

In [None]:
# `wc -l` counts every line in the file, so this figure includes the CSV header line
# (the first line holds the column names, see the `head -n1` cell below).
train_nrows = !wc -l ../input/train.csv
# The shell output has the form "<count> <path>"; keep only the leading count.
train_nrows_val = int(train_nrows[0].split()[0])
print('{:,} rows'.format(train_nrows_val))

In [None]:
!head ../input/train.csv

In [None]:
column_names = !head -n1 ../input/train.csv
print(column_names[0].split(','))

In [None]:
df_train = pd.read_csv('../input/train.csv', skiprows = 0, nrows=train_nrows_val//100,
                       dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}) #use chunksize to iterate

In [None]:
def display_df_with_preset_precision(df, precision):
    """Display ``df`` with a temporary pandas ``display.precision`` setting.

    Parameters
    ----------
    df : object
        The object handed to ``display`` (typically a DataFrame).
    precision : int
        Number of decimal places to show while ``df`` is rendered.

    The previous ``display.precision`` value is restored when the function
    returns, including when ``display`` raises.
    """
    # option_context restores the old option value on exit, even on error,
    # so a failing display call can no longer leak the temporary precision.
    with pd.option_context("display.precision", precision):
        display(df)
display_df_with_preset_precision(df_train.head(), 16)

In [None]:
#keep minimal mem footprint 
try:
    del(df_train)    
except NameError:
    pass

In [None]:
start_time = timeit.default_timer()

# Stream the file in chunks of ~1% of its lines so the whole training set never has to sit in memory.
df_train_iter = pd.read_csv('../input/train.csv', chunksize=train_nrows_val//100,
                       dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},iterator=True) #use chunksize to iterate
jump_frames = []             # rows where time_to_failure increased, one frame per chunk
prev_time_to_failure = None  # last time_to_failure value of the previous chunk
for df in df_train_iter:
    if df.empty:
        continue
    ttf_diff = df['time_to_failure'].diff()
    if prev_time_to_failure is not None:
        # diff() leaves NaN on the first row of every chunk; compare that row against the
        # tail of the previous chunk so a jump that lands on a chunk boundary is not lost.
        ttf_diff.iloc[0] = df['time_to_failure'].iloc[0] - prev_time_to_failure
    prev_time_to_failure = df['time_to_failure'].iloc[-1]
    df['diff_in_time_to_failure'] = ttf_diff
    df_jumps = df.loc[df['diff_in_time_to_failure'] > 0]
    if not df_jumps.empty:
        jump_frames.append(df_jumps)
# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration.
df_after_jump_agg = pd.concat(jump_frames) if jump_frames else pd.DataFrame()
print('elapsed time: {:.2f} sec'.format(timeit.default_timer()-start_time))

In [None]:
print(df_after_jump_agg.shape)
df_after_jump_agg

In [None]:
start_time = timeit.default_timer()

df_train_iter = pd.read_csv('../input/train.csv', chunksize=train_nrows_val//100,
                       dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},iterator=True) #use chunksize to iterate
# Label of the row immediately preceding each jump; the same for every chunk, so compute it once.
before_jump_labels = df_after_jump_agg.index - 1
before_jump_frames = []
for df in df_train_iter:
    labels_in_chunk = df.index.intersection(before_jump_labels)
    if len(labels_in_chunk) > 0:
        # The labels come from an intersection with df.index, so .loc cannot raise KeyError here.
        before_jump_frames.append(df.loc[labels_in_chunk, :])
# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration.
df_before_jump_agg = pd.concat(before_jump_frames) if before_jump_frames else pd.DataFrame()
print('elapsed time: {:.2f} sec'.format(timeit.default_timer()-start_time))

In [None]:
print(df_before_jump_agg.shape)
df_before_jump_agg

In [None]:
# Pair each segment's first row label with its last row label.
# Starts: 0 followed by every jump row except the final one; ends: the rows just before each jump.
segment_starts = [0] + list(df_after_jump_agg.index)[:-1]
segment_ends = list(df_before_jump_agg.index)
index_ranges = list(zip(segment_starts, segment_ends))
index_ranges

In [None]:
# Distance between consecutive "last row before a jump" labels (the first one is measured from 0).
before_jump_labels = list(df_before_jump_agg.index)
previous_labels = [0] + before_jump_labels[:-1]
train_set_lengths = np.array([end - start for start, end in zip(previous_labels, before_jump_labels)])
train_set_lengths

In [None]:
train_set_lengths/train_set_lengths.mean()

In [None]:
range_index = 3 #

In [None]:
start_time = timeit.default_timer()
# Drop any previously loaded segment first so two segments are never held in memory at once.
try:
    del(df_sample)
except NameError:
    pass
seg_start, seg_end = index_ranges[range_index]
# Skip the header line plus the seg_start data rows that precede the segment, and name the
# columns explicitly. Passing skiprows=seg_start on its own skips the header as well and
# promotes a data row to column names, so the dtype mapping would not match any column.
# The rows loaded are the same as before: seg_start .. seg_end-1.
df_sample = pd.read_csv('../input/train.csv', skiprows=int(seg_start) + 1, nrows=int(seg_end - seg_start),
                       header=None, names=['acoustic_data', 'time_to_failure'],
                       dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
df_sample['acoustic_data'].plot(title='acoustic_data, training segment {}'.format(range_index));
plt.show()
df_sample['time_to_failure'].plot(title='time_to_failure, training segment {}'.format(range_index));
plt.show()
print('elapsed time: {:.2f} sec'.format(timeit.default_timer()-start_time))

We have 16 training sequences. They can be split into training and validation groups in several different ways, which lets us make efficient use of the limited training data we have.

In [None]:
# time_to_failure at the first row of the file, followed by its value at every jump row,
# i.e. the starting value of each segment.
first_row = pd.read_csv('../input/train.csv', skiprows = 0, nrows= 1, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
# Series.append was removed in pandas 2.0; pd.concat produces the same series.
max_time_to_failure_points = pd.concat([first_row['time_to_failure'], df_after_jump_agg['time_to_failure']])
max_time_to_failure_points

In [None]:
max_time_to_failure_points.values[:-1]

In [None]:
# Slope of each segment: its starting time_to_failure divided by the number of rows it spans.
# The span is the difference between consecutive start labels; dividing by the absolute
# label of the next jump is only right for the first segment, which starts at row 0.
segment_start_values = max_time_to_failure_points.values[:-1]
segment_lengths = np.diff(max_time_to_failure_points.index.values)
decline_angle_tangents = segment_start_values / segment_lengths
print(decline_angle_tangents.mean())
print(decline_angle_tangents.std())

In [None]:
#keep minimal mem footprint 
try:
    del(df_sample)    
except NameError:
    pass

# Test Set

In [None]:
test_seg_files = listdir("../input/test")
test_seg_files[:5]

In [None]:
#!ls -l ../input/test | wc -l
len(test_seg_files)

In [None]:
os.path.join("../input/test",test_seg_files[0])

In [None]:
!wc -l {os.path.join("../input/test",test_seg_files[0])}

In [None]:
!head {os.path.join("../input/test",test_seg_files[0])}

In [None]:
from ipywidgets import interact
import ipywidgets as widgets

In [None]:
def plot_test_seg_by_index(idx):
    """Plot the ``acoustic_data`` column of the ``idx``-th file in ``test_seg_files``.

    The file is read from ``../input/test`` with ``acoustic_data`` parsed as int16.
    """
    seg_path = os.path.join("../input/test", test_seg_files[idx])
    acoustic = pd.read_csv(seg_path, dtype={'acoustic_data': np.int16})['acoustic_data']
    acoustic.plot();

In [None]:
interact(plot_test_seg_by_index, idx=widgets.IntSlider(min=0,max=len(test_seg_files)-1,step=1,value=0));

In [None]:
#all seg files have the same length: 150,000 samples
seg_files_lengths = !for filename in ../input/test/*; do wc -l $filename; done
{ent.split(' ')[0] for ent in seg_files_lengths}

# Models

Try the following:<br>
1. simple dot product
2. fourier 
3. wavelets
4. voice spectrogram
5. xgb on 

In [None]:
range_index=0 #first training sequence is the shortest one
seg_start, seg_end = index_ranges[range_index]
# Skip the header line plus the seg_start data rows that precede the segment, and name the
# columns explicitly. Passing skiprows=seg_start on its own skips the header as well whenever
# seg_start > 0 and promotes a data row to column names, so the dtype mapping would not apply.
# The rows loaded are the same as before: seg_start .. seg_end-1.
df_sample = pd.read_csv('../input/train.csv', skiprows=int(seg_start) + 1, nrows=int(seg_end - seg_start),
                       header=None, names=['acoustic_data', 'time_to_failure'],
                       dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
df_sample['acoustic_data'].plot();
plt.show()
# .iloc[0] takes the first row by position rather than relying on the index label being 0.
print ('time_to_failure decline rate = {:.16f}'.format(df_sample['time_to_failure'].iloc[0]/df_sample['time_to_failure'].shape[0]))
df_sample['time_to_failure'].plot();
plt.show()

In [None]:
df_sample['time_to_failure'][0]/df_sample['time_to_failure'].shape[0]

In [None]:
df_sample['time_to_failure'].shape[0]

We need to find the points of maximum cross-correlation between the test segment and the training sequence.<br>
To save time, the correlation must be computed efficiently, which means not implementing it by hand in Python. We need to find the best available implementation of the correlation function.

In [None]:
df_sample['acoustic_data'].head()

In [None]:
df_sample['acoustic_data'].values

In [None]:
df_test_seg = pd.read_csv(os.path.join("../input/test",test_seg_files[0]), dtype={'acoustic_data': np.int16})
df_test_seg.head()

In [None]:
df_test_seg['acoustic_data'].values

In [None]:
from scipy import signal

In [None]:
signal_corr = signal.correlate(df_sample['acoustic_data'].values, df_test_seg['acoustic_data'].values,mode='same', method='fft')

In [None]:
pd.DataFrame(signal_corr).head(100).plot();

In [None]:
#start_time = timeit.default_timer()
#np_corr = np.correlate(df_sample['acoustic_data'].values, df_test_seg['acoustic_data'].values)
#print('elapsed time: {:.2f} sec'.format(timeit.default_timer()-start_time))    
#this takes too long

In [None]:
from scipy.signal import spectrogram

In [None]:
f, t, Sxx = spectrogram(df_test_seg['acoustic_data'].values)
#plt.pcolormesh(t, f, Sxx)
plt.plot(Sxx)
#plt.ylabel('Frequency [Hz]')
#plt.xlabel('Time [sec]')
plt.show()

In [None]:
df_test_seg['acoustic_data'].values.shape[0]

In [None]:
df_test_seg['acoustic_data'].values

In [None]:
# Widen to int64 before multiplying: the dot product of two int16 arrays is accumulated
# in int16 and silently wraps around for sums of this size.
seg_values = df_test_seg['acoustic_data'].values.astype(np.int64)
df_sample['acoustic_data'].values[0:0+seg_values.shape[0]].astype(np.int64).dot(seg_values)

In [None]:
start_time = timeit.default_timer()
# Widen to int64 once up front: the dot product of two int16 arrays is accumulated in int16
# and silently wraps around. Hoisting also avoids re-fetching .values on every offset.
sample_values = df_sample['acoustic_data'].values.astype(np.int64)
seg_values = df_test_seg['acoustic_data'].values.astype(np.int64)
seg_len = seg_values.shape[0]
# Sliding dot product of the test segment against every window of the training sequence.
corr_list = [sample_values[offset:offset+seg_len].dot(seg_values)
             for offset in range(0, sample_values.shape[0]-seg_len)]
print('elapsed time: {} sec'.format(timeit.default_timer()-start_time))

In [None]:
pd.DataFrame(np.abs(corr_list)[-200:]).plot();

In [None]:
df_sample['acoustic_data'].values.mean()

In [None]:
df_test_seg['acoustic_data'].values.mean()

In [None]:
pd.DataFrame(df_test_seg['acoustic_data'].values).plot()

In [None]:
corr_offset_of_max = -1
max_corr_value = 0
# Widen to int64 once up front: the dot product of two int16 arrays is accumulated in int16
# and silently wraps around, which would make the maximum search meaningless.
sample_values = df_sample['acoustic_data'].values.astype(np.int64)
seg_values = df_test_seg['acoustic_data'].values.astype(np.int64)
seg_len = seg_values.shape[0]
for offset in range(0, sample_values.shape[0]-seg_len):
    # Magnitude of the correlation at this offset.
    corr_value = np.abs(sample_values[offset:offset+seg_len].dot(seg_values))
    if corr_value > max_corr_value:
        max_corr_value = corr_value
        corr_offset_of_max = offset
        print('new max_corr_value = {}'.format(max_corr_value))

In [None]:
#pd.DataFrame(np.stack([df_sample['acoustic_data'].values[:df_test_seg['acoustic_data'].values.shape[0]], df_test_seg['acoustic_data'].values],axis=1)).corr().iloc[0,1]

https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.fft.html <br>
https://docs.scipy.org/doc/scipy/reference/tutorial/fftpack.html

In [None]:
#df_test_seg['acoustic_data'].values
from scipy.fftpack import fft
# Number of sample points
#N = 600
# sample spacing
#T = 1.0 / 800.0
#x = np.linspace(0.0, N*T, N)
#y = np.sin(50.0 * 2.0*np.pi*x) + 0.5*np.sin(80.0 * 2.0*np.pi*x)

#yf = fft(y)
yf = fft(df_test_seg['acoustic_data'].values)
#df_test_seg['acoustic_data'].values

#xf = np.linspace(0.0, 1.0/(2.0*T), N//2)

#plt.plot(xf, 2.0/N * np.abs(yf[0:N//2]))
plt.plot(np.abs(yf))
plt.grid()
plt.show()

In [None]:
yf

In [None]:
from scipy import fftpack

In [None]:
from scipy.signal import cwt

# Evaluation