We will use [tsfresh](https://tsfresh.readthedocs.io/) to create features. The module takes flat dataframes, as described [in the docs](https://tsfresh.readthedocs.io/en/latest/text/data_formats.html).

We will take the UC Berkeley milling data set and create a flat dataframe from it.

In [1]:
import pathlib
from pathlib import Path
import scipy.io as sio
import numpy as np
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.data_prep_utils import MillingDataPrep

%matplotlib inline
%load_ext autoreload
%autoreload 2

Set folder paths.

In [2]:
root_dir = Path.cwd().parent.parent
print(root_dir)

folder_raw_data_milling = root_dir / 'tests/integration/fixtures'
# folder_processed_data_milling = root_dir / 'data/processed/milling'
df_label_path=folder_raw_data_milling / "labels_with_tool_class_truncated.csv"
print(folder_raw_data_milling)
df_label_path.exists()

/home/tim/Documents/feat-store
/home/tim/Documents/feat-store/tests/integration/fixtures


True

In [3]:
milldata = MillingDataPrep(root_dir / 'tests/integration/fixtures' / 'mill_truncated.mat', path_df_labels=df_label_path, window_size=64, stride=64, cut_drop_list=None)

In [4]:
df = milldata.create_xy_dataframe()


In [5]:
df.tail()

Unnamed: 0,cut_id,cut_no,case,time,ae_spindle,ae_table,vib_spindle,vib_table,smcdc,smcac,tool_class
8955,1_69,1,1,0.236,0.239258,0.179443,0.625,1.77002,7.182617,0.043945,0
8956,1_69,1,1,0.24,0.213013,0.175781,0.622559,1.691895,7.016602,0.48584,0
8957,1_69,1,1,0.244,0.22522,0.24353,0.672607,1.806641,6.938477,1.142578,0
8958,1_69,1,1,0.248,0.270386,0.296631,0.671387,1.884766,7.075195,1.328125,0
8959,1_69,1,1,0.252,0.323486,0.245972,0.740967,1.945801,7.011719,2.521973,0


In [10]:
df.dtypes

cut_id          object
cut_no           int64
case             int64
time           float32
ae_spindle     float32
ae_table       float32
vib_spindle    float32
vib_table      float32
smcdc          float32
smcac          float32
tool_class       int64
dtype: object

In [23]:
# save the dataframe to csv.gz
df.to_csv(root_dir / 'tests/integration/fixtures' / 'milling_truncated_results.csv.gz', compression='gzip', index=False)

In [25]:
df_gt = pd.read_csv(root_dir / 'tests/integration/fixtures' / 'milling_truncated_results.csv.gz', compression='gzip',)
df_gt.head()

Unnamed: 0.1,Unnamed: 0,cut_id,cut_no,case,time,ae_spindle,ae_table,vib_spindle,vib_table,smcdc,smcac,tool_class
0,0,0_0,0,1,0.0,0.219727,0.272827,0.733643,2.116699,6.84082,0.124512,0
1,1,0_0,0,1,0.004,0.246582,0.322266,0.778809,2.277832,6.660156,-0.561523,0
2,2,0_0,0,1,0.008,0.294189,0.283813,0.758057,2.34375,6.508789,-2.099609,0
3,3,0_0,0,1,0.012,0.323486,0.26001,0.726318,2.44873,6.542969,-2.731934,0
4,4,0_0,0,1,0.016,0.290527,0.253296,0.653076,2.546387,6.621094,-3.505859,0


In [11]:
df.shape

(8960, 11)

In [18]:
col_names_ordered = ['cut_id', 'cut_no', 'case', 'time', 'ae_spindle', 'ae_table', 'vib_spindle', 'vib_table', 'smcdc', 'smcac','tool_class']
col_dtype = [str, int, int, np.float32, np.float32, np.float32, np.float32, np.float32, np.float32, np.float32, int]
col_dtype_dict = dict(zip(col_names_ordered, col_dtype))

df_gt = pd.read_csv(folder_raw_data_milling / 'milling_truncated_results.csv.gz', compression='gzip',)
df_gt = df_gt.astype(col_dtype_dict)
df_gt.tail()

KeyError: 'Only a column name can be used for the key in a dtype mappings argument.'

In [13]:
df_gt.dtypes

cut_id          object
case             int64
time           float32
ae_spindle     float32
ae_table       float32
vib_spindle    float32
vib_table      float32
smcdc          float32
smcac          float32
tool_class       int64
dtype: object

In [14]:
from pandas.testing import assert_frame_equal
a = assert_frame_equal(df, df_gt)

AssertionError: DataFrame are different

DataFrame shape mismatch
[left]:  (8960, 11)
[right]: (8960, 10)

In [None]:
a

In [None]:
df.to_csv(folder_processed_data_milling / "milling_truncated.csv.gz", compression="gzip", index=False)