In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error
from scipy.stats import kurtosis


# Load data/train_small.csv and plot the acoustic signal (y-axis) against the
# time_to_failure column (x-axis) as a quick visual check of the raw data.
data = pd.read_csv("data/train_small.csv")
plt.plot(data['time_to_failure'],data['acoustic_data'])
plt.show()

# Last expression of the cell: (number of rows, number of columns) of the frame.
data.shape


In [None]:
def load_data(file_name):
    """Read a CSV file into a DataFrame with fixed dtypes for the two signal columns.

    Parameters
    ----------
    file_name : str or path-like
        Location handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with ``acoustic_data`` read as ``int16`` and
        ``time_to_failure`` read as ``float64``.
    """
    column_types = {
        'acoustic_data': np.int16,
        'time_to_failure': np.float64,
    }
    return pd.read_csv(file_name, dtype=column_types)

In [None]:
def preproc_data(data, rows=65_536, ratio=0.8):
    """Aggregate the raw signal into per-segment features and split train/test.

    The frame is cut into consecutive, non-overlapping segments of ``rows``
    samples; a trailing partial segment is dropped. Each segment yields one
    feature row computed from ``acoustic_data`` and one target, the
    ``time_to_failure`` value of the segment's last sample.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``acoustic_data`` and ``time_to_failure``.
    rows : int, default 65_536
        Number of samples per segment; must be at least 1.
    ratio : float, default 0.8
        Fraction of the segments, taken in order and without shuffling, that
        goes to the training split. The remaining segments form the test split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, Y_train, X_test, Y_test)``, all ``float64``. The feature
        frames have the columns ``ave, std, kurt, max, min, abs_max,
        quantile_095, quantile_099, quantile_09, quantile_01, quantile_005,
        quantile_001``; the target frames have the single column
        ``time_to_failure``. The test frames keep the segment numbers as index
        labels, so their index starts at the cut position rather than at 0.

    Raises
    ------
    ValueError
        If ``rows`` is smaller than 1.
    """
    if rows < 1:
        raise ValueError(f"rows must be a positive integer, got {rows!r}")

    feature_names = ['ave', 'std', 'kurt', 'max', 'min', 'abs_max',
                     'quantile_095', 'quantile_099', 'quantile_09',
                     'quantile_01', 'quantile_005', 'quantile_001']
    # Same order as the six quantile_* columns above.
    quantile_levels = [0.95, 0.99, 0.9, 0.1, 0.05, 0.01]

    # Number of complete segments; leftover samples at the end are ignored.
    segments = data.shape[0] // rows

    # Extract the columns once instead of slicing the frame on every iteration.
    acoustic = data['acoustic_data'].values
    time_to_failure = data['time_to_failure'].values

    feature_rows = []
    targets = []
    # TODO: segments are currently disjoint; the original note suggests they
    # should overlap.
    for segment in tqdm(range(segments)):
        start = segment * rows
        stop = start + rows

        # Work in float64: on an int16 signal abs(-32768) wraps around and
        # stays negative, which would corrupt the abs_max feature.
        x = acoustic[start:stop].astype(np.float64)
        q095, q099, q09, q01, q005, q001 = np.quantile(x, quantile_levels)

        feature_rows.append([
            x.mean(), x.std(), kurtosis(x), x.max(), x.min(), np.abs(x).max(),
            q095, q099, q09, q01, q005, q001,
        ])
        # Target is the time to failure at the end of the segment.
        targets.append(time_to_failure[stop - 1])

    # Build both frames in one go rather than assigning cell by cell.
    X = pd.DataFrame(feature_rows, columns=feature_names, dtype=np.float64)
    Y = pd.DataFrame(targets, columns=['time_to_failure'], dtype=np.float64)

    # Chronological split: the first `ratio` share of segments is the train set.
    train_test_cut_id = int(len(X) * ratio)
    return (X[:train_test_cut_id], Y[:train_test_cut_id],
            X[train_test_cut_id:], Y[train_test_cut_id:])

In [None]:
# Manual toggle: the literal False keeps this branch disabled, so the load below
# never runs. Change the condition to True to replace `data` with the result of
# load_data('data/train.csv').
if False:
    data = load_data('data/train.csv')

In [None]:
# Build the per-segment features with preproc_data's default segment size and
# split ratio, then unpack the train/test feature and target frames.
X_train, Y_train, X_test, Y_test = preproc_data(data)
# Last expression of the cell: (training segments, feature columns).
X_train.shape

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so the test split is scaled with the statistics
# learned from the training data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def mean(array, step):
    """Smooth a sequence with a trailing moving average of width ``step``.

    Entries at positions ``step`` and later are replaced by the average of the
    entry itself and the ``step - 1`` entries before it. The first ``step``
    entries are copied through unchanged.

    Parameters
    ----------
    array : sequence
        Indexable, sliceable sequence of numbers (list, NumPy array, ...).
    step : int
        Width of the averaging window.

    Returns
    -------
    list
        A new list with the same length as ``array``; the input is not modified.
    """
    smoothed = list(array[:])
    offsets = range(step)
    for pos in range(step, len(array)):
        # Walk backwards from the current position, newest sample first.
        smoothed[pos] = sum(array[pos - back] for back in offsets) / step
    return smoothed

In [None]:
# Fit a NuSVR with its default settings on the scaled training features (the
# target frame is flattened to a 1-D array) and predict the test split.
svm = NuSVR().fit(X_train_scaled, Y_train.values.flatten())
y_pred = svm.predict(X_test_scaled)

In [None]:
# Overlay predictions and ground truth for the held-out segments.
# Y_test keeps the index labels it had before the train/test split, and
# matplotlib uses a pandas index as the x-axis, so plotting the frame directly
# would shift the actual curve to the right of the predictions (which start at
# x=0). Plot the raw values so both curves share the same positional x-axis.
plt.plot(y_pred, label='predicted')
plt.plot(Y_test.values.flatten(), label='actual')
plt.xlabel('test segment')
plt.ylabel('time_to_failure')
plt.legend()
plt.show()

In [None]:
# Predicted vs. actual time to failure for the test split.
# y_pred was computed from X_test_scaled, so it has to be paired with Y_test;
# Y_train has a different number of rows and cannot be scattered against it.
plt.figure(figsize=(6, 6))
plt.scatter(Y_test.values.flatten(), y_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
# Reference diagonal from (0, 0) to (20, 20): points on it are perfect predictions.
plt.plot([0, 20], [0, 20])
plt.show()

In [None]:
# Sweep the smoothing width from 1 to 19: smooth the test predictions with
# mean(), then print the mean absolute error against the test targets.
# After the loop, y_pred_mean and score hold the values for the last width (19).
for i in range(1,20):
    y_pred_mean = mean(y_pred,i)
    score = mean_absolute_error(Y_test.values.flatten(), y_pred_mean)
    print(f'mean: {i}, Score: {score:0.3f}')

In [None]:
# Smooth the test predictions with a 13-step trailing average and plot them.
# NOTE(review): 13 is hard-coded; presumably picked from the sweep output — confirm.
y_pred_mean = mean(y_pred,13)
plt.plot(y_pred_mean)

In [None]:
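# Results log. Presumably each line is: MAE of the raw predictions, best MAE
# after smoothing, and (in parentheses) the smoothing step that gave it.
# TODO confirm the meaning of the columns.
# without quantile : 2.202, 2.031 (8)
# with 5 quantiles : 2.171, 2.023 (8)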

