# Example Workbook Illustrating the Use of tsfresh

In [1]:
# Automatically reload external modules (see https://ipython.org/ipython-doc/3/config/extensions/autoreload.html for more information)
%load_ext autoreload
%autoreload 2

# Set up system path to include our "anoog" python package
import sys
sys.path.append('../src')

## Import Packages / Modules

In [2]:
# general imports
import os
import numpy as np
import pandas as pd

# our own library
import anoog

# plotting
import matplotlib.pyplot as plt
import plotly_express as px

# Learning
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# Feature Extraction (https://tsfresh.readthedocs.io)
from tsfresh import feature_extraction
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.transformers import RelevantFeatureAugmenter, FeatureAugmenter

## Loading Data

In [4]:
dataPath = '../data/Thema1'
seriesIDs = ['wood', 'plastic']

(sdf, mdf) = anoog.io.csv.load_tsfresh(dataPath, seriesIDs)

FileNotFoundError: [WinError 3] Das System kann den angegebenen Pfad nicht finden: '../data/Thema1\\wood'

## Inspecting Data

In [None]:
sdf.head()

In [None]:
sdf.tail()

In [None]:
mdf

## Plot Data

In [None]:
plotInterval = 100

px.line(sdf[0::plotInterval], x=sdf.index[0::plotInterval], y=['Audio', 'Voltage', 'Current', 'ID'])      # only plot every nth data point to keep performance in check

## Resample Data

In [None]:
resamplingInterval = '10ms'

rsdf = sdf.resample(resamplingInterval, label='right', closed='right', on='Time').mean()
rsdf.dropna(inplace=True)
rsdf.reset_index(inplace=True)

## Plot Resampled Data

In [None]:
plotInterval = 10

px.line(rsdf[0::plotInterval], x=rsdf.index[0::plotInterval], y=['Audio', 'Voltage', 'Current', 'ID'])      # only plot every nth data point to keep performance in check

## Use tsfresh to Generate / Extract Features from Timeseries Data

In [None]:
X_extracted = extract_features(rsdf, column_id="ID", column_sort="Time")
X_extracted

## Fill / Replace Invalid Values

In [None]:
# 2. Impute (replace Nan/Inf) features
X_imputed = impute(X_extracted)
X_imputed

## Prepare Target Labels

In [None]:
labelEnc = LabelEncoder()

y = pd.Series(labelEnc.fit_transform(mdf['Material'].values), index=mdf['ID'])

print(y)

In [None]:
# TODO: Split test and training data
y_train = y
X_train = X_imputed

## Filter / Select Relevant Features

In [None]:
X_train_filtered = select_features(X_train, y_train)
X_train_filtered

## Fetch Selected Feature Parameter

In [None]:
kind_to_fc_parameters = feature_extraction.settings.from_columns(X_train_filtered)
kind_to_fc_parameters # this map can be later used as parameter for the extract_features function to only calculate the relevant features selected during training

## tsfresh in sklearn Pipeline

In [None]:
# In a pipeline, the input features are simply a list of IDs
X = pd.DataFrame(index=y.index)

# Create pipeline
# Option 1: Pipeline with filtering / selecting relevant features
# pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='ID', column_sort='Time')),
#                      ('classifier', RandomForestClassifier())])
                     
# Option 2: Pipeline without filtering / selecting relevant features (feature selection only works for a reasonably sized dataset)
pipeline = Pipeline([('augmenter', FeatureAugmenter(column_id='ID', column_sort='Time', impute_function=impute)),
                     ('classifier', RandomForestClassifier())])

# Set sensor timeseries data
pipeline.set_params(augmenter__timeseries_container=rsdf)

In [None]:
# Cross validation
scores = cross_val_score(pipeline, X, y, cv = 9)
print(scores)