In [9]:
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import os
import re
from multiprocessing import Pool

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# The data directory lives two levels above the notebook's working directory.
path_data_dir = Path.cwd().parent.parent.joinpath("data")
print(path_data_dir)

# Raw inputs, and the per-cut CNC pickle splits nested beneath them.
folder_raw_data = path_data_dir.joinpath("raw")
folder_raw_data_cnc = path_data_dir.joinpath("raw", "cnc", "data_splits")

/home/tim/Documents/feat-store/data


In [67]:
# Collect every per-cut pickle file recorded for one tool.
tool_no = 54

# The listed files are named "<number>_<tool_no>_<number>.pickle", so the tool
# number is matched between two underscores. A raw f-string keeps the regex
# escape intact instead of relying on Python passing an unrecognised string
# escape through unchanged (which emits a warning on newer versions).
pattern_match = re.compile(rf"\w_{tool_no}_")
files = os.listdir(folder_raw_data_cnc)
file_list = [
    Path(folder_raw_data_cnc) / filename
    for filename in files
    if filename.endswith(".pickle") and pattern_match.search(filename) is not None
]

file_list

[PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540300905_54_3.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540299472_54_1.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540298755_54_0.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540300010_54_5.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540301084_54_8.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540299830_54_8.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540300010_54_7.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540299293_54_8.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540299472_54_7.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data_splits/1540298934_54_7.pickle'),
 PosixPath('/home/tim/Documents/feat-store/data/raw/cnc/data

In [215]:
col_selection = ["current_sub"]


def read_pickle(filename):
    """Load one pickled cut and return its selected columns.

    Parameters
    ----------
    filename : str or pathlib.Path
        Path to a pickle file that holds a pandas DataFrame. The file stem
        (name without extension) becomes the row identifier.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``time`` followed by the names listed in the
        module-level ``col_selection``. ``id`` holds the file stem on every
        row; ``time`` is the row position divided by 1000.0 (presumably
        seconds at a 1 kHz sample rate -- confirm against the data source).
    """
    # Accept plain strings as well as Path objects.
    filename = Path(filename)

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, "rb") as fh:
        df = pickle.load(fh)

    df["id"] = filename.stem
    df["time"] = np.arange(0, len(df)) / 1000.0

    return df[["id", "time"] + col_selection]
    

def main(folder_raw_data_cnc, tool_no=54, processes=2):
    """Load every pickled cut of one tool and stack them into one DataFrame.

    Parameters
    ----------
    folder_raw_data_cnc : str or pathlib.Path
        Folder containing the per-cut ``.pickle`` files.
    tool_no : int, default 54
        Tool number to select; a file is kept when its name contains
        ``_<tool_no>_`` preceded by a word character.
    processes : int, default 2
        Number of worker processes used to read the files.

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``read_pickle`` for each matching file,
        concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If the folder contains no matching pickle file.
    """
    # Raw f-string: \w must reach the regex engine, not the string parser.
    pattern_match = re.compile(rf"\w_{tool_no}_")
    folder = Path(folder_raw_data_cnc)
    file_list = [
        folder / filename
        for filename in os.listdir(folder)
        if filename.endswith(".pickle") and pattern_match.search(filename) is not None
    ]

    if not file_list:
        # pd.concat would fail with an opaque "No objects to concatenate";
        # fail early, with the same exception type, and say what was searched.
        raise ValueError(
            f"No .pickle files for tool {tool_no} found in {folder}"
        )

    # Read the files in parallel; map() returns the frames in input order.
    with Pool(processes=processes) as pool:
        df_list = pool.map(read_pickle, file_list)

    # Reduce the list of per-file frames to a single frame.
    return pd.concat(df_list, ignore_index=True)

In [222]:
# Build the combined frame for the default tool and report its size.
# NOTE(review): `df.head()` sits outside the guard, so it relies on the guarded
# branch having run (it did in this session -- the shape line was printed).
if __name__ == "__main__":
    df = main(folder_raw_data_cnc)
    print("Final df shape:", df.shape)

df.head()

Final df shape: (450816, 3)


Unnamed: 0,id,time,current_sub
0,1540300905_54_3,0.0,603
1,1540300905_54_3,0.001,666
2,1540300905_54_3,0.002,666
3,1540300905_54_3,0.003,745
4,1540300905_54_3,0.004,690


In [223]:
col_selection = ["current_sub"]


def add_info_col_from_cut_id(df, col_selection):
    """Derive ``unix_date``, ``index_no`` and ``tool_no`` from the ``id`` column.

    Each ``id`` is split on underscores: the first field becomes
    ``unix_date``, the second-to-last ``tool_no`` and the last ``index_no``,
    all cast to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``id`` column and the columns named in
        ``col_selection``. It is not modified.
    col_selection : list of str
        Extra columns to carry over into the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``id``, ``unix_date``, ``index_no``,
        ``tool_no`` followed by ``col_selection``; all other columns are
        dropped.
    """
    # Split once and reuse the pieces instead of re-splitting per column.
    id_parts = df["id"].str.split("_")

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    enriched = df.assign(
        unix_date=id_parts.str[0].astype(int),
        index_no=id_parts.str[-1].astype(int),
        tool_no=id_parts.str[-2].astype(int),
    )
    return enriched[["id", "unix_date", "index_no", "tool_no"] + list(col_selection)]

# Rebind `df` to the returned frame, which keeps only id, unix_date, index_no,
# tool_no and the selected signal columns (so `time` is no longer present).
df = add_info_col_from_cut_id(df, col_selection)
df.head()

Unnamed: 0,id,unix_date,index_no,tool_no,current_sub
0,1540300905_54_3,1540300905,3,54,603
1,1540300905_54_3,1540300905,3,54,666
2,1540300905_54_3,1540300905,3,54,666
3,1540300905_54_3,1540300905,3,54,745
4,1540300905_54_3,1540300905,3,54,690


In [224]:
# Load the high-level labels and reshape them to one row per (cut, failed tool).
# Built as a single chain so no column is ever assigned on a filtered slice
# (which triggers SettingWithCopyWarning).
df_labels = (
    pd.read_csv(path_data_dir / "processed/cnc/high_level_dummy.csv")
    # keep only the rows where "failed_tools" is not empty
    .dropna(subset=["failed_tools"])
    # "54 15" -> ["54", "15"], then one row per listed tool
    .assign(failed_tools=lambda labels: labels["failed_tools"].str.split(" "))
    .explode("failed_tools")
    .loc[:, ["unix_date", "failed", "failed_tools"]]
    .astype({"unix_date": int, "failed": int})
)

# Drop any rows where "failed_tools" is not a numeric value, then store the
# tool number as int so it can be compared with integer tool numbers.
df_labels = df_labels[df_labels["failed_tools"].str.isnumeric()].astype(
    {"failed_tools": int}
)

df_labels.head()

Unnamed: 0,unix_date,failed,failed_tools
0,1540298755,1,54
0,1540298755,1,15
1,1540298934,1,57
3,1540299293,3,54
3,1540299293,3,57


In [225]:
# Check the dtypes of the label columns after the integer casts in the cell above.
df_labels.dtypes

unix_date       int64
failed          int64
failed_tools    int64
dtype: object

In [226]:
# Attach the failure label to every sample whose (unix_date, tool_no) pair
# appears in the label table. Samples without a match get 0; the helper key
# column "failed_tools" is dropped again after the join.
dfm = (
    pd.merge(
        df,
        df_labels[["unix_date", "failed", "failed_tools"]],
        how="left",
        left_on=["unix_date", "tool_no"],
        right_on=["unix_date", "failed_tools"],
    )
    .drop(columns=["failed_tools"])
    .assign(failed=lambda merged: merged["failed"].fillna(0).astype(int))
)

# Write the labelled samples to a CSV in the working directory and preview them.
dfm.to_csv("cnc_data_with_labels.csv", index=False)
dfm.head()

Unnamed: 0,id,unix_date,index_no,tool_no,current_sub,failed
0,1540300905_54_3,1540300905,3,54,603,1
1,1540300905_54_3,1540300905,3,54,666,1
2,1540300905_54_3,1540300905,3,54,666,1
3,1540300905_54_3,1540300905,3,54,745,1
4,1540300905_54_3,1540300905,3,54,690,1


In [107]:
# NOTE(review): this cell reads a "failed_tools" column on `df`, but none of the
# cells above add that column to `df`, and its execution count is lower than
# theirs. It looks like a leftover from an earlier kernel state -- confirm
# before relying on it.
# Rows without a recorded failed tool are given the string "54".
df.loc[df["failed_tools"].isna(), "failed_tools"] = "54"
# convert each "failed_tools" string to a list
df["failed_tools"] = df["failed_tools"].copy().apply(lambda x: x.split(" "))

df.head()

Unnamed: 0,id,unix_date,index_no,time,tool_no,current_main,current_sub,failed,failed_tools
0,1540300905_54_3,1540300905,3,0.0,54,1,603,1,"[54, 15]"
1,1540300905_54_3,1540300905,3,0.001,54,1,666,1,"[54, 15]"
2,1540300905_54_3,1540300905,3,0.002,54,1,666,1,"[54, 15]"
3,1540300905_54_3,1540300905,3,0.003,54,2,745,1,"[54, 15]"
4,1540300905_54_3,1540300905,3,0.004,54,2,690,1,"[54, 15]"


In [58]:
# NOTE(review): this cell calls `.split` on "failed_tools", so it expects the
# raw space-separated strings. The label-preparation cell above already casts
# that column to int, so this appears to come from an earlier kernel state --
# confirm before re-running.
# select all rows in df_labels where "failed_tools" is not empty
df_labels_failed = df_labels[df_labels["failed_tools"].notna()].copy()

# convert each "failed_tools" string to a list
df_labels_failed["failed_tools"] = df_labels_failed["failed_tools"].copy().apply(lambda x: x.split(" "))

df_labels_failed.tail()

Unnamed: 0,unix_date,date,cut_dir,part,file_name,tools,len_cut,no_points,signals_names,failed,failed_tools,comment
5588,1574901434,2019-11-27 19:37,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$001.csv,1 2 3 6 8 11 13 17 22 23 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,1,"[54, 8]",
5589,1574901790,2019-11-27 19:43,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$002.csv,1 2 3 6 8 11 13 17 22 23 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,1,"[54, 8]","“7:50 PM tool 54, 8 wear”"
5590,1574902144,2019-11-27 19:49,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$003.csv,2 6 8 22 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,"[54, 8]",
5591,1574902862,2019-11-27 20:01,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$004.csv,1 2 3 6 8 11 13 22 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,"[54, 8]",
5592,1574903330,2019-11-27 20:08,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$005.csv,2 6 8 22 32 36 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,"[54, 8]",


In [59]:
# Expand each "failed_tools" list into one row per entry; the original row
# index is repeated for every entry. The result is only displayed, not stored.
df_labels_failed.explode('failed_tools')

Unnamed: 0,unix_date,date,cut_dir,part,file_name,tools,len_cut,no_points,signals_names,failed,failed_tools,comment
90,1540315198,2018-10-23 13:19,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_90.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,2,57,
91,1540315377,2018-10-23 13:22,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_91.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,3,57,“only tool change in this period was T5757 at ...
92,1540315794,2018-10-23 13:29,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_92.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,2,57,
93,1540316136,2018-10-23 13:35,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_93.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,2,57,
94,1540316477,2018-10-23 13:41,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_94.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,2,57,
...,...,...,...,...,...,...,...,...,...,...,...,...
5590,1574902144,2019-11-27 19:49,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$003.csv,2 6 8 22 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,8,
5591,1574902862,2019-11-27 20:01,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$004.csv,1 2 3 6 8 11 13 22 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,54,
5591,1574902862,2019-11-27 20:01,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$004.csv,1 2 3 6 8 11 13 22 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,8,
5592,1574903330,2019-11-27 20:08,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$005.csv,2 6 8 22 32 36 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,54,


# Fix the Datetime

In [69]:
file = "Data_KS-NPC-6FF_0.mat"

# Get the file's last-modified time as whole seconds since the epoch.
# getmtime is used because the goal is the modification time; getctime reports
# the last metadata change on Unix (and the creation time on Windows).
unixtime = int(os.path.getmtime(file))
unixtime

1654041803

In [70]:
# convert unixtime to readable date
# fromtimestamp() is called without a tz argument, so the result is rendered
# in the local timezone of the machine running the notebook.
from datetime import datetime
datetime.fromtimestamp(unixtime).strftime('%Y-%m-%d %H:%M:%S')


'2022-05-31 17:03:23'

In [66]:
# convert the local modified time to EST

# NOTE(review): `unixtime_est` is not assigned in any visible cell, so this
# cell only works with leftover kernel state; the conversion code itself
# appears to be missing -- confirm and restore it.
unixtime_est

1549490402

# Scratch

In [16]:
# Scratch: load only the first listed file. This rebinds `df`, replacing any
# frame built earlier under that name.
# NOTE(review): the recorded output shows the full raw column set, whereas
# read_pickle as defined above returns only id, time and col_selection -- the
# output presumably predates that definition.
df = read_pickle(file_list[0])
df.head()

Unnamed: 0,cut_signal,current_main,current_sub,power_main,power_sub,error_x,error_z,speed_main,speed_sub,tool_no
27432,1,1,825,0.007324,2.659912,0.0005,-0.385,0,-5002,54
27433,1,3,806,0.009766,2.680664,0.0005,-0.374,0,-5001,54
27434,1,1,802,0.020752,2.801514,0.0005,-0.364,0,-4998,54
27435,1,2,759,0.024414,2.807617,0.0005,-0.353,0,-5001,54
27436,1,2,718,0.024414,2.827148,0.0005,-0.343,0,-5000,54


In [17]:
# File name without its extension -- the value read_pickle stores in the "id" column.
file_list[0].stem

'1540309555_54_5'

In [15]:
# Distinct values of the "cut_signal" column in the loaded cut.
# NOTE(review): requires `df` to still carry "cut_signal"; read_pickle as
# defined above only returns it if it is listed in col_selection.
df['cut_signal'].unique()

array([1], dtype=int16)