In [6]:
from pathlib import Path
from dataclasses import dataclass
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys

# Make the project root importable so the `src` and `configs` packages resolve.
# Assumes the kernel's working directory is the 'notebooks/' folder, so the
# root is its parent — TODO confirm if the notebook is launched from elsewhere.
ROOT = Path.cwd().parent
# Guard the append: re-running this cell in a live kernel would otherwise add
# a duplicate sys.path entry on every execution.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.processing import preprocessing
from configs.path_config import GROUP1, EXTRACTED_DATA_DIR

In [7]:
@dataclass
class DataEntry:
    """One loaded file: a text label paired with its tabular contents."""
    # Label for the entry; LoadData stores the source file name here.
    info: str
    # Table for the entry; LoadData stores the pd.read_csv result here.
    data: pd.DataFrame

class LoadData:
    """Read a group of CSV files into an index-keyed dict of DataEntry objects.

    Attributes:
        data_dict: Maps each file's 0-based position in `group` to a DataEntry
            holding the file name (`info`) and the DataFrame read from it (`data`).
    """

    def __init__(self, root, group, subdir='group1'):
        """Load every file named in `group` from `<root>/<subdir>/`.

        Args:
            root: Base directory containing the extracted data. This argument
                used to be accepted but ignored in favour of the global
                EXTRACTED_DATA_DIR; it is now honoured. Passing None falls
                back to EXTRACTED_DATA_DIR.
            group: Iterable of CSV file names to read.
            subdir: Sub-directory of `root` that holds the files. Defaults to
                'group1', the value that was previously hard-coded.

        Raises:
            Any exception pd.read_csv raises for a missing or unreadable file.
        """
        # Only touch the global when the caller did not supply a root.
        base_dir = EXTRACTED_DATA_DIR if root is None else Path(root)
        self.data_dict = dict()
        for i, file in enumerate(group):
            path = base_dir / subdir / file
            self.data_dict[i] = DataEntry(info=file, data=pd.read_csv(path))


In [8]:
# Short alias for the preprocessing entry point imported from src.processing.
process_data = preprocessing.preprocessing_pipeline

# Arguments forwarded unchanged to the preprocessing pipeline for every file.
n_points = 15
std_multiplier = 3

# Load every CSV listed in GROUP1 (needs LoadData / DataEntry from the cell above).
group = GROUP1
load_data = LoadData(EXTRACTED_DATA_DIR, group)

# Index -> DataEntry mapping built by LoadData.
# NOTE(review): printing the dict dumps the repr of every DataFrame it holds.
strains = load_data.data_dict
print(f'Old strains: {strains}')

# Run the pipeline on each entry and store the result back on that same entry.
for i, strain in strains.items():
    print(f'Processing file: {strain.info}')

    # The whole DataFrame is handed to the pipeline. According to the original
    # author's note it overwrites the 'Strain' column — not verifiable from this cell.
    processed_data = process_data(strain.data, n_points, std_multiplier)

    # `strain` is the very object stored at strains[i], so this updates the dict entry.
    strain.data = processed_data

print(f'New strains: {strains}')

Old strains: {0: DataEntry(info='N-B_Far_Comp.txt_N13, B, 0.03.csv', data=           Time_index                 Time  Strain
0      20090605000000  2009-06-05 00:00:00     NaN
1      20090605040000  2009-06-05 04:00:00     NaN
2      20090605080000  2009-06-05 08:00:00     NaN
3      20090605120000  2009-06-05 12:00:00     NaN
4      20090605160000  2009-06-05 16:00:00     NaN
...               ...                  ...     ...
26334  20210611000000  2021-06-11 00:00:00    32.0
26335  20210611040000  2021-06-11 04:00:00    37.0
26336  20210611080000  2021-06-11 08:00:00    23.0
26337  20210611120000  2021-06-11 12:00:00    13.0
26338  20210611160000  2021-06-11 16:00:00     NaN

[26339 rows x 3 columns]), 1: DataEntry(info='N-B_Mid1_Comp.txt_N5, B, 0.02.csv', data=           Time_index                 Time  Strain
0      20090605000000  2009-06-05 00:00:00     NaN
1      20090605040000  2009-06-05 04:00:00     NaN
2      20090605080000  2009-06-05 08:00:00     NaN
3      20090605120000 

In [9]:
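# NOTE(review): everything in this cell is commented-out code for a `MIMII` dataset class
# (audio files with anomaly labels). Nothing in the cells above references it — TODO confirm
# it is unused and remove the cell, or move it to the notebook where it belongs.
# class MIMII(Dataset):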
#     def __init__(self, root, machine, train=True, transform=None, target_transform=None):
#         if train:
#             self.audio_path = os.path.join(root, f"dev_data_{machine}", machine.split("_")[0], "train")
#             self.audio_files = os.listdir(self.audio_path)
#             self.labels = [int(f.split("_")[0] == "anomaly") for f in self.audio_files]
#         else:
#             self.audio_path = os.path.join(root, f"dev_data_{machine}", machine.split("_")[0], "test")
#             self.audio_files = os.listdir(self.audio_path)
#             self.labels = [int(f.split("_")[0] == "anomaly") for f in self.audio_files]
#         self.transform = transform
#         self.target_transform = target_transform

#     def __len__(self):
#         return len(self.audio_files)

#     def __getitem__(self, idx):
#         file_path = os.path.join(self.audio_path, self.audio_files[idx])
#         label = self.labels[idx]
#         if self.transform:
#             f = self.transform(file_path)
#         else:
#             # default feature representation
#             f = file_to_features(file_path).astype(np.float32)
#         if self.target_transform:
#             label = self.target_transform(label)
#         return f, label