In [1]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

In [16]:
class EpilepsyDataset(Dataset):
    def __init__(self, path_to_data: str, verbose: bool=True):
        if os.path.exists(path_to_data) is False:
            raise ValueError('There is no such path')
            
        self.path = path_to_data
        self.folders_with_patients = os.listdir(self.path)
        self.verbose = verbose
        
        self.patients_data = []
        for patient in self.folders_with_patients:
            self.patients_data.append(os.listdir(os.path.join(self.path + patient)))
            
        self.folders_with_patients = self.folders_with_patients[0]
        self._sorted_data = self._get_sorted_data()
        
    def get_patients_data(self, patient: str) -> list[str]:
        return os.listdir(os.path.join(self.path, patient))
    
    def get_patients_names(self) -> list[str]:
        return os.listdir(self.path)
    
    def get_all_sensors_records_for_patient(self, patient: str) -> list[str]:
        if patient not in self.get_patients_names():
            raise ValueError('There is no such name')
         
        full_path_to_patient = os.path.join(self.path, patient) 
        
        return [name for name in os.listdir(full_path_to_patient) 
                if os.path.isdir(os.path.join(full_path_to_patient, name)) 
                and name[0] != '.']
    
    def _get_sorted_segments(self, sensor_folder: str) -> list[list]:
        """
        Get types of sensors from folder and sort its data by number of segment
        """
        
        delimiter = '_'
        full_path = os.path.join(self.path, sensor_folder)
        sensors_files_names = [file.split(delimiter) for file in os.listdir(full_path)]

        cropped_sensors_files_names = [file[:-2] for file in sensors_files_names]
        
        index_of_parameters = 3
        unique_parameters = set(file[index_of_parameters] for file in cropped_sensors_files_names)

        return_data = []
        for unique_item in unique_parameters:
            data = [file for file in sensors_files_names if unique_item in file]
            index_of_segments_number = 5
            sorted_list = sorted(data, key=lambda x: int(x[index_of_segments_number].split('.')[0]))
            return_data.append(['_'.join(file) for file in sorted_list])
    
        return return_data
    
    def _read_segment(self, path_to_segment: str):
        return pd.read_parquet(path_to_segment)
    
    def _upsample(self, data: np.array, sample_rate: float, new_sample_rate: float, mode: str = 'bicubic'):
        scale_factor = new_sample_rate / sample_rate
        upsampler = nn.Upsample(scale_factor, mode)
        return upsampler(data)
    
    def _get_sorted_data(self):
        """
        Return path to segments sorted by its index
        """
        self._dataset_data = []
        print(self.folders_with_patients)
        for patient in self.folders_with_patients:
            if self.verbose:
                print('Patient: {}'.format(patient))
                
            sensors = self.get_all_sensors_records_for_patient(patient)
            data = []
            for sensor in sensors:
                records = self._get_sorted_segments(os.path.join(patient, sensor))
                
                full_path_records = []
                for sensor_record in records:
                    full_path_records.append(list(map(lambda x: os.path.join(self.path, patient, sensor, x), sensor_record)))                                
                    data.append(full_path_records)
                    
            self._dataset_data.append(data)        

        return self._dataset_data
    
    def __len__(self):
        pass

    def __getitem__(self, idx):
        pass
    
    def _get_fullconnected_data_signals(self):
        """
        Get full connected segments data from all sensor from folders
        """
        combine_signals = {} 
        for patient in self._sorted_data:
            for sensor in patient:
                signal_combined_list =[]
                
                for signal in sensor:
                    dfs=[]
                    key_name = re.sub("_segment_\d+.parquet", "", signal[0])
                    key_name = re.sub("/.+/.+/", "", key_name)
                    for segment in signal:
                        df = pd.read_parquet(segment)

                        # Append the dataframe to the list
                        dfs.append(df)
                    combined_signal = pd.concat(dfs, axis=0)
                    print(key_name)
                    print(combined_signal)
                    combine_signals[key_name] = combined_signal
        return combine_signals
        

In [18]:
import dataset_formatter
from dataset_formatter import DatasetFormatter 
from importlib import reload
reload(dataset_formatter)
#print(dataset._sorted_data)
dataset =  DatasetFormatter('/workspace/data_seerpy/data_seerpy/data/')

In [19]:
dataset.preprocess('/workspace/new_data/', 180)



Patient: MSEL_01575:   0%|                                                                                                                                               | 0/8 [00:00<?, ?it/s][A[A

Patient: MSEL_01575:  12%|████████████████▉                                                                                                                      | 1/8 [00:00<00:05,  1.17it/s][A[A

Patient: MSEL_01575:  25%|█████████████████████████████████▊                                                                                                     | 2/8 [00:01<00:05,  1.13it/s][A[A

Patient: MSEL_01575:  38%|██████████████████████████████████████████████████▋                                                                                    | 3/8 [00:02<00:04,  1.14it/s][A[A

Patient: MSEL_01575:  50%|███████████████████████████████████████████████████████████████████▌                                                                   | 4/8 [00:05<00:07,  1.85s/it][A[A

Pat

In [6]:
dataset.balance_by_downsaple("/workspace/labels.csv", "/workspace/", "balanced_label.csv", 1, 1300)

0    11597
1      112
Name: Label, dtype: int64
0    112
1    112
Name: Label, dtype: int64


In [4]:
dataset.simple_normalization('/workspace/normalized_data/')


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 708299/708299 [1:16:18<00:00, 154.70it/s]


In [5]:
len(os.listdir('/workspace/new_data'))

708299

In [14]:

part = pd.read_parquet('/workspace/normalized_data/MSEL_01808_Acc Mag_1252.parquet')
print(len(part))

23040


In [None]:
os.listdir('/workspace/normalized_data')

In [4]:
dataset.labels_set()


Patient: MSEL_01575:  34%|████████████████████████████████████████▋                                                                               | 835218/2464772 [3:01:34<5:54:16, 76.66it/s]


KeyboardInterrupt: 

In [None]:
dataset.balance_by_downsaple("/workspace/labels.csv", "/workspace/", "balanced_label.csv", 1)

In [67]:
sensors_signals_names = []
sensors_data_segments = []
signals_names, records = dataset._get_sorted_segments("/workspace/data_seerpy/data_seerpy/data/MSEL_00172/Empatica-EDA")
full_path_records = []
for signal_name, sensor_record in zip(signals_names, records):
    full_path_records.append(list(map(lambda x: os.path.join(dataset.path, 'MSEL_00172', 'Empatica-EDA', x), sensor_record)))                                
sensors_data_segments.append(full_path_records)
dfs = [pd.read_parquet(segment) for segment in sensors_data_segments[0]]
concatenated_df = pd.concat(dfs, ignore_index=True)
print(concatenated_df['time'][0])                
num_parts = int(len(concatenated_df) / (dataset.segment_time * dataset.frequency))
print(num_parts)  

1556307768007.8125
2981


In [69]:
filename = os.path.join('/workspace/data_seerpy/data_seerpy/data/MSEL_00172/MSEL_00172_labels.csv')
labels_file = pd.read_csv(filename)
duration = labels_file['duration'].iloc[0]
print('label.csv')
print("start_time: ", start_time)
print("End_time: ", duration + start_time)
print("Samples_count: ", int(duration /1000 * 128))
print("Sample_Rate: ", 128)
print('----------------------------------------------')
print('Concatenated .parquet')
print("start_time: ", concatenated_df['time'][0])  
print("End_time: ", concatenated_df['time'][len(concatenated_df)-1])
print("Samples_count: ", len(concatenated_df))
print("Sample_Rate: ",  len(concatenated_df) * 1000 / (concatenated_df['time'][len(concatenated_df)-1] - concatenated_df['time'][0]))
start_time = labels_file['startTime'].iloc[0]
segment_length = dataset.segment_time * dataset.frequency
total_segments = int(duration /1000 * 128 / (segment_length))

label.csv
start_time:  1556830134000
End_time:  1557082078113.09
Samples_count:  32248846
Sample_Rate:  128
----------------------------------------------
Concatenated .parquet
start_time:  1556307768007.8125
End_time:  1556559596125.0
Samples_count:  32197097
Sample_Rate:  127.85346354326064


In [26]:
def _get_sorted_segments(sensor_folder: str) -> (list, list[list]):
        """
        Get types of sensors from folder and sort its data by number of segment
        """
        
        delimiter = '_'
        full_path = sensor_folder
        sensors_files_names = [file.split(delimiter) for file in os.listdir(full_path)]

        cropped_sensors_files_names = [file[:-2] for file in sensors_files_names]
        
        index_of_parameters = 3
        unique_parameters = set(file[index_of_parameters] for file in cropped_sensors_files_names)

        return_data = []
        for unique_item in unique_parameters:
            data = [file for file in sensors_files_names if unique_item in file]
            index_of_segments_number = 5
            sorted_list = sorted(data, key=lambda x: int(x[index_of_segments_number].split('.')[0]))
            return_data.append(['_'.join(file) for file in sorted_list])
    
        return unique_parameters, return_data
_, eda = _get_sorted_segments('/workspace/data_seerpy/data_seerpy/data/MSEL_00172/Empatica-EDA/')
_, bvp = _get_sorted_segments('/workspace/data_seerpy/data_seerpy/data/MSEL_00172/Empatica-BVP/')

full_eda = [pd.read_parquet(os.path.join('/workspace/data_seerpy/data_seerpy/data/MSEL_00172/Empatica-EDA/', x)) for x in eda[0]]
full_bvp = [pd.read_parquet(os.path.join('/workspace/data_seerpy/data_seerpy/data/MSEL_00172/Empatica-BVP/', x)) for x in bvp[0]]

full_eda = pd.concat(full_eda)
full_bvp = pd.concat(full_bvp)



# Calculate the difference between current and previous timestamp in terms of days
full_bvp['time_diff'] = full_bvp['time'].diff()

# Find where the difference is greater than 1
holes = full_bvp['time_diff'] != 7.8125

# Print the holes
print(full_bvp[holes])

           time         data   time_diff
0  1.556308e+12  130615232.0         NaN
0  1.556466e+12  130615232.0  61132.8125


In [27]:

# Calculate the difference between current and previous timestamp in terms of days
full_eda['time_diff'] = full_eda['time'].diff()

# Find where the difference is greater than 1
holes = full_eda['time_diff'] != 7.8125
full_eda[holes] = full_eda[holes].fillna(0)

print(sum(full_eda[holes]['time_diff']))

289328.125


In [7]:
holes = [False, False, True, False, True, False, False]
last_hole_index = holes[holes].last_valid_index()

TypeError: list indices must be integers or slices, not list