We will use [tsfresh](https://tsfresh.readthedocs.io/) to create features. The module takes flat dataframes, as described [in the docs](https://tsfresh.readthedocs.io/en/latest/text/data_formats.html).

We will take the UC Berkeley milling data set and create a flat dataframe from it.

In [1]:
import pathlib
from pathlib import Path
import scipy.io as sio
import numpy as np
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.data_prep_utils import MillingDataPrep

%matplotlib inline
%load_ext autoreload
%autoreload 2

Set folder paths.

In [2]:
# The project root is taken to be two directory levels above the current working directory.
root_dir = Path.cwd().parent.parent
print(root_dir)

# Raw and processed milling data folders, plus the label CSV inside the processed folder.
folder_raw_data_milling = root_dir.joinpath('data', 'raw', 'milling')
folder_processed_data_milling = root_dir.joinpath('data', 'processed', 'milling')
df_label_path = folder_processed_data_milling.joinpath("labels_with_tool_class.csv")
print(folder_raw_data_milling)

# Final expression is displayed as the cell output: True when the label file is on disk.
df_label_path.exists()

c:\_Python\feat-store
c:\_Python\feat-store\data\raw\milling


True

In [10]:
# Build the data-prep helper from the raw .mat file and the label CSV.
# NOTE(review): window_size == stride, so windows presumably do not overlap — confirm in MillingDataPrep.
milldata = MillingDataPrep(
    folder_raw_data_milling / 'mill.mat',
    path_df_labels=df_label_path,
    window_size=64,
    stride=64,
)

In [8]:
# Flat dataframe form of the milling data, produced by the MillingDataPrep instance.
df = milldata.create_xy_dataframe()
# Displayed as the cell output: (rows, columns).
df.shape

(740480, 10)

In [12]:
# Array form of the x/y data from the same MillingDataPrep instance.
x, y = milldata.create_xy_arrays()


In [13]:
# Show the shapes of the x and y arrays, one per line.
print(x.shape, y.shape, sep="\n")

(11570, 64, 6)
(11570, 64, 3)


In [9]:
# Inspect the last rows of the flat dataframe.
df.tail()

Unnamed: 0,cut_id,case,time,ae_spindle,ae_table,vib_spindle,vib_table,smcdc,smcac,tool_class
740475,166_65,166,0.236,0.261841,0.254517,0.2771,1.12793,9.995117,3.129883,1
740476,166_65,166,0.24,0.214844,0.178223,0.2771,1.030273,9.995117,1.601562,1
740477,166_65,166,0.244,0.181885,0.164185,0.275879,1.035156,9.995117,0.288086,1
740478,166_65,166,0.248,0.264282,0.26123,0.275879,1.218262,9.995117,-1.386719,1
740479,166_65,166,0.252,0.239258,0.211182,0.272217,1.015625,9.995117,-3.149414,1


In [29]:
# Column dtypes of `df`; the ground-truth frame is cast to match these before comparison.
df.dtypes

cut_id          object
case             int32
time           float32
ae_spindle     float32
ae_table       float32
vib_spindle    float32
vib_table      float32
smcdc          float32
smcac          float32
tool_class       int32
dtype: object

In [32]:
# NOTE(review): the earlier `df.shape` output was (740480, 10) but this cell shows (8960, 10).
# The step that truncated `df` is not visible in this notebook — restore it so that
# Restart Kernel -> Run All reproduces the frame compared against the ground truth.
df.shape

(8960, 10)

In [33]:
# Column order and dtypes that the ground-truth frame must share with `df`.
col_names_ordered = ['cut_id', 'case', 'time', 'ae_spindle', 'ae_table', 'vib_spindle', 'vib_table', 'smcdc', 'smcac','tool_class']

# np.int32 is spelled out instead of the builtin `int`: `int` maps to a platform- and
# NumPy-version-dependent width (int32 only on Windows with NumPy < 2, int64 otherwise),
# whereas `df.dtypes` reports int32 for `case` and `tool_class`. With `int`, the dtype check
# in assert_frame_equal would fail on any platform where `int` resolves to int64.
col_dtype = [str, np.int32, np.float32, np.float32, np.float32, np.float32, np.float32, np.float32, np.float32, np.int32]
col_dtype_dict = dict(zip(col_names_ordered, col_dtype))

# Load the stored ground-truth frame and cast it in one expression, so re-running
# the cell always starts again from the file on disk.
df_gt = pd.read_csv(
    folder_raw_data_milling / 'milling_truncated_results.csv.gz',
    compression='gzip',
).astype(col_dtype_dict)
df_gt.tail()

Unnamed: 0,cut_id,case,time,ae_spindle,ae_table,vib_spindle,vib_table,smcdc,smcac,tool_class
8955,1_69,1,0.236,0.239258,0.179443,0.625,1.77002,7.182617,0.043945,0
8956,1_69,1,0.24,0.213013,0.175781,0.622559,1.691895,7.016602,0.48584,0
8957,1_69,1,0.244,0.22522,0.24353,0.672607,1.806641,6.938477,1.142578,0
8958,1_69,1,0.248,0.270386,0.296631,0.671387,1.884766,7.075195,1.328125,0
8959,1_69,1,0.252,0.323486,0.245972,0.740967,1.945801,7.011719,2.521973,0


In [35]:
# Column dtypes of the ground-truth frame after the cast; these should match `df.dtypes`.
df_gt.dtypes

cut_id          object
case             int32
time           float32
ae_spindle     float32
ae_table       float32
vib_spindle    float32
vib_table      float32
smcdc          float32
smcac          float32
tool_class       int32
dtype: object

In [38]:
from pandas.testing import assert_frame_equal

# Raises AssertionError if `df` and `df_gt` differ; on success it returns None,
# so `a` is None and only the absence of an exception signals a match.
a = assert_frame_equal(left=df, right=df_gt)

In [39]:
# `a` holds the return value of assert_frame_equal; this cell displays no output.
a

In [25]:
# Write `df` to the processed folder as a gzip-compressed CSV, without the index column.
df.to_csv(
    folder_processed_data_milling / "milling_truncated.csv.gz",
    compression="gzip",
    index=False,
)