In [18]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

In [13]:
class EpilepsyDataset(Dataset):
    def __init__(self, path_to_data: str, verbose: bool=True):
        if os.path.exists(path_to_data) is False:
            raise ValueError('There is no such path')
            
        self.path = path_to_data
        self.folders_with_patients = os.listdir(self.path)
        self.verbose = verbose
        
        self.patients_data = []
        for patient in self.folders_with_patients:
            self.patients_data.append(os.listdir(os.path.join(self.path + patient)))
            
        self.folders_with_patients = self.folders_with_patients[0]
        self._sorted_data = self._get_sorted_data()
        
    def get_patients_data(self, patient: str) -> list[str]:
        return os.listdir(os.path.join(self.path, patient))
    
    def get_patients_names(self) -> list[str]:
        return os.listdir(self.path)
    
    def get_all_sensors_records_for_patient(self, patient: str) -> list[str]:
        if patient not in self.get_patients_names():
            raise ValueError('There is no such name')
         
        full_path_to_patient = os.path.join(self.path, patient) 
        
        return [name for name in os.listdir(full_path_to_patient) 
                if os.path.isdir(os.path.join(full_path_to_patient, name)) 
                and name[0] != '.']
    
    def _get_sorted_segments(self, sensor_folder: str) -> list[list]:
        """
        Get types of sensors from folder and sort its data by number of segment
        """
        
        delimiter = '_'
        full_path = os.path.join(self.path, sensor_folder)
        sensors_files_names = [file.split(delimiter) for file in os.listdir(full_path)]

        cropped_sensors_files_names = [file[:-2] for file in sensors_files_names]
        
        index_of_parameters = 3
        unique_parameters = set(file[index_of_parameters] for file in cropped_sensors_files_names)

        return_data = []
        for unique_item in unique_parameters:
            data = [file for file in sensors_files_names if unique_item in file]
            index_of_segments_number = 5
            sorted_list = sorted(data, key=lambda x: int(x[index_of_segments_number].split('.')[0]))
            return_data.append(['_'.join(file) for file in sorted_list])
    
        return return_data
    
    def _read_segment(self, path_to_segment: str):
        return pd.read_parquet(path_to_segment)
    
    def _upsample(self, data: np.array, sample_rate: float, new_sample_rate: float, mode: str = 'bicubic'):
        scale_factor = new_sample_rate / sample_rate
        upsampler = nn.Upsample(scale_factor, mode)
        return upsampler(data)
    
    def _get_sorted_data(self):
        """
        Return path to segments sorted by its index
        """
        self._dataset_data = []
        print(self.folders_with_patients)
        for patient in self.folders_with_patients:
            if self.verbose:
                print('Patient: {}'.format(patient))
                
            sensors = self.get_all_sensors_records_for_patient(patient)
            data = []
            for sensor in sensors:
                records = self._get_sorted_segments(os.path.join(patient, sensor))
                
                full_path_records = []
                for sensor_record in records:
                    full_path_records.append(list(map(lambda x: os.path.join(self.path, patient, sensor, x), sensor_record)))                                
                    data.append(full_path_records)
                    
            self._dataset_data.append(data)        

        return self._dataset_data
    
    def __len__(self):
        pass

    def __getitem__(self, idx):
        pass
    
    def _get_fullconnected_data_signals(self):
        """
        Get full connected segments data from all sensor from folders
        """
        combine_signals = {} 
        for patient in self._sorted_data:
            for sensor in patient:
                signal_combined_list =[]
                
                for signal in sensor:
                    dfs=[]
                    key_name = re.sub("_segment_\d+.parquet", "", signal[0])
                    key_name = re.sub("/.+/.+/", "", key_name)
                    for segment in signal:
                        df = pd.read_parquet(segment)

                        # Append the dataframe to the list
                        dfs.append(df)
                    combined_signal = pd.concat(dfs, axis=0)
                    print(key_name)
                    print(combined_signal)
                    combine_signals[key_name] = combined_signal
        return combine_signals
        

In [26]:
from dataset_formatter import DatasetFormatter 
import dataset_formatter
from importlib import reload
reload(dataset_formatter)
#print(dataset._sorted_data)
dataset =  DatasetFormatter('/workspace/data_seerpy/data_seerpy/data/')
print(dataset.folders_with_patients)

['MSEL_01575']


In [6]:
dataset.preprocess('/workspace/new_data/', 180)

Patient: MSEL_01575: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [04:36<00:00, 34.62s/it]


In [27]:
reload(dataset_formatter)
print(dataset.labels_set())

Patient: MSEL_01575: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41079/41079 [01:26<00:00, 474.62it/s]

          Patient Segment Label
0      MSEL_01575       0     0
1      MSEL_01575       1     0
2      MSEL_01575       2     0
3      MSEL_01575       3     0
4      MSEL_01575       4     0
...           ...     ...   ...
41074  MSEL_01575   41074     0
41075  MSEL_01575   41075     0
41076  MSEL_01575   41076     0
41077  MSEL_01575   41077     0
41078  MSEL_01575   41078     0

[41079 rows x 3 columns]





In [7]:
reload(dataset_formatter)
dataset.simple_normilization('/workspace/normalized_data/')


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92697/92697 [09:03<00:00, 170.57it/s]


In [50]:
os.listdir('/workspace/new_data')

['MSEL_01110-ICU_Acc z_12578.parquet',
 'MSEL_01842_Acc x_1685.parquet',
 'MSEL_01097_Acc Mag_10435.parquet',
 'MSEL_01808_Acc Mag_9601.parquet',
 'MSEL_01844_TEMP_3661.parquet',
 'MSEL_01808_TEMP_994.parquet',
 'MSEL_01097_Acc z_6921.parquet',
 'MSEL_01842_Acc y_7204.parquet',
 'MSEL_01575_Acc Mag_17681.parquet',
 'MSEL_00501_Acc Mag_11367.parquet',
 'MSEL_01838_Acc y_5609.parquet',
 'MSEL_01097_Acc x_5311.parquet',
 'MSEL_01844_BVP_1513.parquet',
 'MSEL_01808_HR_2874.parquet',
 'MSEL_01097_Acc Mag_12094.parquet',
 'MSEL_01838_Acc z_8302.parquet',
 'MSEL_01097_Acc y_4130.parquet',
 'MSEL_01110-ICU_HR_1853.parquet',
 'MSEL_01575_Acc z_13079.parquet',
 'MSEL_00501_Acc y_11874.parquet',
 'MSEL_01844_Acc y_6380.parquet',
 'MSEL_00172_BVP_1455.parquet',
 'MSEL_01842_Acc Mag_1615.parquet',
 'MSEL_01808_EDA_2017.parquet',
 'MSEL_01838_Acc x_14451.parquet',
 'MSEL_01097_Acc Mag_16802.parquet',
 'MSEL_01097_Acc z_13670.parquet',
 'MSEL_01838_Acc z_2061.parquet',
 'MSEL_01575_Acc Mag_18168.parq