# Initial Setup

In [None]:
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import style
style.use('ggplot')

import seaborn as sns
sns.set()

from IPython.display import HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

from os import listdir
print(listdir("../input"))

import timeit
from tqdm import tqdm

# Training Set

In [None]:
# Count the lines of the training CSV with the shell. `wc -l` counts every
# line of the file, so the figure includes the first line that a later cell
# reads as the column names.
train_nrows = !wc -l ../input/train.csv
# The captured output is a list of lines; the first whitespace-separated field
# of the first line is parsed as the count.
train_nrows_val = int(train_nrows[0].split()[0])
print('{:,} rows'.format(train_nrows_val))

In [None]:
!head ../input/train.csv

In [None]:
column_names = !head -n1 ../input/train.csv
print(column_names[0].split(','))

In [None]:
# Load only the leading 1/100th of the counted lines so the preview stays small;
# explicit dtypes keep the frame compact.
df_train = pd.read_csv(
    '../input/train.csv',
    nrows=train_nrows_val // 100,
    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},
)

In [None]:
def display_df_with_preset_precision(df, precision):
    """Render ``df`` with a temporary pandas ``display.precision`` setting.

    Parameters
    ----------
    df : object
        Object passed to IPython's ``display``.
    precision : int
        Value used for the pandas ``display.precision`` option while rendering.

    Notes
    -----
    The previous option value is restored when the function exits, including
    when ``display`` raises, so a failed render cannot leak the temporary
    precision into later cells.
    """
    with pd.option_context("display.precision", precision):
        display(df)
display_df_with_preset_precision(df_train.head(), 16)

In [None]:
#keep minimal mem footprint 
try:
    del(df_train)    
except NameError:
    pass

In [None]:
start_time = timeit.default_timer()
try:
    del(df_train_iter)
except NameError:
    pass

# Stream the training file in chunks and keep every row whose time_to_failure
# is larger than that of the row directly before it (an upward jump).
df_train_iter = pd.read_csv('../input/train.csv', chunksize=train_nrows_val//100,
                       dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},iterator=True)
jump_frames = []            # per-chunk jump rows, concatenated once after the loop
prev_chunk_last_ttf = None  # last time_to_failure of the previous chunk
for df in df_train_iter:
    ttf_diff = df['time_to_failure'].diff()
    if prev_chunk_last_ttf is not None:
        # A per-chunk diff leaves NaN on the chunk's first row, which would hide a
        # jump that lands exactly on a chunk boundary; compare against the last
        # value of the previous chunk instead.
        ttf_diff.iloc[0] = df['time_to_failure'].iloc[0] - prev_chunk_last_ttf
    prev_chunk_last_ttf = df['time_to_failure'].iloc[-1]
    df['diff_in_time_to_failure'] = ttf_diff
    df_jumps = df.loc[df['diff_in_time_to_failure'] > 0]
    if len(df_jumps) > 0:
        jump_frames.append(df_jumps)
# One concat instead of DataFrame.append inside the loop: append copies the
# accumulated frame on every call and no longer exists in pandas >= 2.0.
df_after_jump_agg = pd.concat(jump_frames) if jump_frames else pd.DataFrame()
print('elapsed time: {:.2f} sec'.format(timeit.default_timer()-start_time))

In [None]:
print(df_after_jump_agg.shape)
df_after_jump_agg

In [None]:
start_time = timeit.default_timer()
try:
    del(df_train_iter)
except NameError:
    pass

# Second pass over the training file: collect the row directly preceding each
# jump row recorded in df_after_jump_agg.
df_train_iter = pd.read_csv('../input/train.csv', chunksize=train_nrows_val//100,
                       dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},iterator=True)
# Row labels one position before every jump; computed once rather than per chunk.
before_jump_labels = df_after_jump_agg.index - 1
before_jump_frames = []
for df in df_train_iter:
    # Only labels present in this chunk are selected, so the .loc lookup below
    # cannot raise KeyError.
    labels_in_chunk = df.index.intersection(before_jump_labels)
    if len(labels_in_chunk) > 0:
        before_jump_frames.append(df.loc[labels_in_chunk, :])
# One concat instead of DataFrame.append inside the loop: append copies the
# accumulated frame on every call and no longer exists in pandas >= 2.0.
df_before_jump_agg = pd.concat(before_jump_frames) if before_jump_frames else pd.DataFrame()
print('elapsed time: {:.2f} sec'.format(timeit.default_timer()-start_time))

In [None]:
print(df_before_jump_agg.shape)
df_before_jump_agg

In [None]:
# Pair each segment's start label (0 for the first segment, otherwise the
# previous jump row) with the label of the row just before the next jump.
index_ranges = list(zip(
    [0] + df_after_jump_agg.index.tolist()[:-1],
    df_before_jump_agg.index.tolist(),
))
index_ranges

In [None]:
# Distance between consecutive pre-jump row labels; the first distance is
# measured from row 0.
train_set_lengths = np.array([
    this_end - previous_end
    for previous_end, this_end in zip(
        [0] + df_before_jump_agg.index.tolist()[:-1],
        df_before_jump_agg.index.tolist(),
    )
])
train_set_lengths

In [None]:
train_set_lengths/train_set_lengths.mean()

In [None]:
range_index = 3

In [None]:
start_time = timeit.default_timer()
try:
    del(df_sample)
except NameError:
    pass
# Load one segment of the training file, chosen by range_index, and plot both columns.
segment_first_row, segment_last_row = index_ranges[range_index]
# header=None + names: the real header line is skipped together with the rows
# before the segment (hence the +1), and the column names are supplied
# explicitly. Relying on the default header would turn a data row into column
# names for every segment that does not start at row 0, so the dtype mapping
# below would match no column and be silently ignored.
# nrows is inclusive of segment_last_row (the last row before the next jump).
df_sample = pd.read_csv('../input/train.csv',
                        header=None, names=['acoustic_data', 'time_to_failure'],
                        skiprows=int(segment_first_row) + 1,
                        nrows=int(segment_last_row) - int(segment_first_row) + 1,
                        dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
df_sample['acoustic_data'].plot();
plt.show()
df_sample['time_to_failure'].plot();
plt.show()
print('elapsed time: {:.2f} sec'.format(timeit.default_timer()-start_time))

In [None]:
#keep minimal mem footprint 
try:
    del(df_sample)    
except NameError:
    pass

# Test Set

In [None]:
test_seg_files = listdir("../input/test")
test_seg_files[:5]

In [None]:
#!ls -l ../input/test | wc -l
len(test_seg_files)

In [None]:
os.path.join("../input/test",test_seg_files[0])

In [None]:
!wc -l {os.path.join("../input/test",test_seg_files[0])}

In [None]:
!head {os.path.join("../input/test",test_seg_files[0])}

In [None]:
from ipywidgets import interact
import ipywidgets as widgets

In [None]:
def plot_test_seg_by_index(idx):
    """Plot the ``acoustic_data`` column of the test segment file at position ``idx``.

    ``idx`` indexes into ``test_seg_files``; the file is read from
    ``../input/test`` with ``acoustic_data`` parsed as int16.
    """
    seg_path = os.path.join("../input/test", test_seg_files[idx])
    seg_frame = pd.read_csv(seg_path, dtype={'acoustic_data': np.int16})
    seg_frame['acoustic_data'].plot()

In [None]:
interact(plot_test_seg_by_index, idx=widgets.IntSlider(min=0,max=len(test_seg_files)-1,step=1,value=0));