In [1]:
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import os
from multiprocessing import Pool

%load_ext autoreload
%autoreload 2

In [2]:
path_data_dir = Path().cwd().parent.parent / "data"
print(path_data_dir)

folder_raw_data = path_data_dir / "raw"
folder_raw_data_cnc = path_data_dir / "raw/cnc" 

/home/tim/Documents/feat-store/data


In [3]:
files = os.listdir(folder_raw_data_cnc)
file_list = [
    Path(folder_raw_data_cnc) / filename
    for filename in files
    if filename.endswith(".pickle")
]

In [14]:
col_selection = ["tool_no", "current_main", "current_sub"]

def read_pickle(filename):
    'converts a filename to a pandas dataframe'
    df = pickle.load(open(filename, 'rb'))
    id = filename.stem
    df["id"] = id
    df['timestamp'] = id.split("_")[0]
    df['index_no'] = id.split("_")[-1]
    
    return df[["id", "timestamp", "index_no"] + col_selection]
    

def main(folder_interim_data):
    """Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """

    # get a list of file names
    files = os.listdir(folder_interim_data)
    file_list = [
        Path(folder_interim_data) / filename
        for filename in files
        if filename.endswith(".pickle")
    ]

    # set up your pool
    with Pool(processes=2) as pool:  # or whatever your hardware can support

        # have your pool map the file names to dataframes
        df_list = pool.map(read_pickle, file_list)

        # reduce the list of dataframes to a single dataframe
        combined_df = pd.concat(df_list, ignore_index=True)

        return combined_df

In [60]:
if __name__ == "__main__":
    df = main(folder_raw_data_cnc)
    print("Final df shape:", df.shape)

Final df shape: (38416, 6)


In [61]:
df["timestamp"] = df["timestamp"].astype(int)
df.head()

Unnamed: 0,id,timestamp,index_no,tool_no,current_main,current_sub
0,1540309555_54_5,1540309555,5,54,1,825
1,1540309555_54_5,1540309555,5,54,3,806
2,1540309555_54_5,1540309555,5,54,1,802
3,1540309555_54_5,1540309555,5,54,2,759
4,1540309555_54_5,1540309555,5,54,2,718


In [62]:
# load high-level labels
df_labels = pd.read_csv(path_data_dir / "processed/cnc/high_level_labels_MASTER_update2020-08-06_new-jan-may-data.csv")

# replace all NaNs in "failed" column with 0
df_labels.loc[df_labels["failed"].isna(), "failed"] = 0

# change dtype of "failed" column to int
df_labels["failed"] = df_labels["failed"].astype(int)
df_labels.head()

Unnamed: 0,unix_date,date,cut_dir,part,file_name,tools,len_cut,no_points,signals_names,failed,failed_tools,comment
0,1540298755,2018-10-23 08:45,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_0.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,0,,
1,1540298934,2018-10-23 08:48,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_1.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,0,,
2,1540299114,2018-10-23 08:51,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_2.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,0,,
3,1540299293,2018-10-23 08:54,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_3.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,0,,
4,1540299472,2018-10-23 08:57,/home/tim/Documents/Checkfluid-Project/data/ra...,KS-NPC-6FF,Data_KS-NPC-6FF_4.mat,1 3 5 8 11 13 15 17 21 22 23 32 36 37 54 57,177.0,177000,current_main current_sub cut_signal error_x er...,0,,


In [63]:
df = pd.merge(df, df_labels, left_on="timestamp", right_on="unix_date", how="inner")
df.head()

Unnamed: 0,id,timestamp,index_no,tool_no,current_main,current_sub,unix_date,date,cut_dir,part,file_name,tools,len_cut,no_points,signals_names,failed,failed_tools,comment


In [43]:
# select all rows in df_labels where "failed_tools" is not empty
df_labels_failed = df_labels[df_labels["failed_tools"].notna()].copy()

# convert each "failed_tools" string to a list
df_labels_failed["failed_tools"] = df_labels_failed["failed_tools"].copy().apply(lambda x: x.split(" "))

df_labels_failed.tail()

Unnamed: 0,unix_date,date,cut_dir,part,file_name,tools,len_cut,no_points,signals_names,failed,failed_tools,comment
5588,1574901434,2019-11-27 19:37,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$001.csv,1 2 3 6 8 11 13 17 22 23 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,1,"[54, 8]",
5589,1574901790,2019-11-27 19:43,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$002.csv,1 2 3 6 8 11 13 17 22 23 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,1,"[54, 8]","“7:50 PM tool 54, 8 wear”"
5590,1574902144,2019-11-27 19:49,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$003.csv,2 6 8 22 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,"[54, 8]",
5591,1574902862,2019-11-27 20:01,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$004.csv,1 2 3 6 8 11 13 22 32 36 37 51 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,"[54, 8]",
5592,1574903330,2019-11-27 20:08,/home/tim/Documents/Checkfluid-Project/data/ra...,191128,metal_1000hz$005.csv,2 6 8 22 32 36 54 57,352.0,352000,current_main current_sub cut_signal error_z sp...,2,"[54, 8]",


# Fix the Datetime

In [69]:
file = "Data_KS-NPC-6FF_0.mat"

# get the local modified time of the file
unixtime = int(os.path.getctime(file))
unixtime

1654041803

In [70]:
# convert unixtime to readable date
from datetime import datetime
datetime.fromtimestamp(unixtime).strftime('%Y-%m-%d %H:%M:%S')


'2022-05-31 17:03:23'

In [66]:
# convert the local modified time to EST

unixtime_est

1549490402

# Scratch

In [16]:
df = read_pickle(file_list[0])
df.head()

Unnamed: 0,cut_signal,current_main,current_sub,power_main,power_sub,error_x,error_z,speed_main,speed_sub,tool_no
27432,1,1,825,0.007324,2.659912,0.0005,-0.385,0,-5002,54
27433,1,3,806,0.009766,2.680664,0.0005,-0.374,0,-5001,54
27434,1,1,802,0.020752,2.801514,0.0005,-0.364,0,-4998,54
27435,1,2,759,0.024414,2.807617,0.0005,-0.353,0,-5001,54
27436,1,2,718,0.024414,2.827148,0.0005,-0.343,0,-5000,54


In [17]:
file_list[0].stem

'1540309555_54_5'

In [15]:
df['cut_signal'].unique()

array([1], dtype=int16)