In [1]:

from pathlib import Path
from pyphm.datasets.ims import ImsDataLoad
import pandas as pd
import os
import numpy as np
import time
import datetime
import csv
import multiprocessing as mp

%load_ext autoreload
%autoreload 2

In [2]:
# The project root is taken to be the parent of the current working directory.
root_dir = Path.cwd().parent
print(root_dir)

# Location of the data folder, directly under the project root.
path_data_raw_folder = root_dir / 'data'
print(path_data_raw_folder)

/home/tim/Documents/PyPHM
/home/tim/Documents/PyPHM/data


In [3]:
# Build the IMS loader on top of the data folder defined above.
# NOTE(review): download=False presumably means the data set is expected to
# already be on disk -- confirm against the ImsDataLoad documentation.
ims = ImsDataLoad(path_data_raw_folder, 'ims', download=False)

In [4]:
# Folder paths of the three runs, as exposed by the loader instance.
path_1st_folder, path_2nd_folder, path_3rd_folder = (
    ims.path_1st_folder,
    ims.path_2nd_folder,
    ims.path_3rd_folder,
)

# Sorted listing of the second run's folder; its first entry is kept as a
# sample file name.
file_list = os.listdir(path_2nd_folder)
file_list.sort()
file_name = file_list[0]

In [5]:
def process_raw_csv(file_info_dict) -> pd.DataFrame:
    """Load an individual sample (.csv file) of the IMS data set.

    Parameters
    ----------
    file_info_dict : dict
        Metadata describing the file to load. Expected keys:

        - "path_run_folder": ``pathlib.Path`` of the folder holding the file.
        - "file_name": name of the tab-delimited file inside that folder.
        - "sample_freq": sampling frequency used to build the time axis.
        - "col_names": one column name per signal column in the file.
        - "run_no": run number, stored in the "run" column.
        - "sample_index": index of the file within the run; combined with
          ``run_no`` into the "id" column as ``"{run_no}_{sample_index}"``.

    Returns
    -------
    pd.DataFrame
        One row per line of the file. Signal columns are float32; the
        additional columns are "id" (str), "run" (int), "file" (str) and
        "time_step" (float32).
    """

    path_run_folder = file_info_dict["path_run_folder"]
    file_name = file_info_dict["file_name"]
    sample_freq = file_info_dict["sample_freq"]
    col_names = file_info_dict["col_names"]
    run_no = file_info_dict["run_no"]
    sample_index = file_info_dict["sample_index"]

    # load the .csv file; ndmin=2 keeps a single-row file two-dimensional so
    # that len() below always counts rows (samples), never columns
    signals_array = np.loadtxt(
        path_run_folder / file_name, delimiter="\t", ndmin=2
    )
    n_samples = len(signals_array)

    # sample k is acquired at k / sample_freq; a linspace that includes the
    # end point n / sample_freq would stretch the spacing to
    # n / ((n - 1) * sample_freq)
    time_step_array = np.arange(n_samples) / sample_freq

    df = pd.DataFrame(signals_array, columns=col_names, dtype=np.float32)

    # scalar assignment broadcasts the same value to every row
    df["id"] = f"{run_no}_{sample_index}"
    df["run"] = run_no
    df["file"] = file_name
    df["time_step"] = time_step_array

    return df.astype({"id": str, "run": int, "file": str, "time_step": np.float32})

In [5]:
# Load every file of the first run in parallel and combine the results into a
# single dataframe.
path_run_folder = ims.path_1st_folder
col_names = ims.col_1st_names


# create a list of dictionaries containing the metadata for each file
file_info_list = [
    {
        "path_run_folder": path_run_folder,
        "file_name": file_name,
        "sample_freq": 20480.0,
        "col_names": col_names,
        "run_no": 1,
        "sample_index": i,
    }
    for i, file_name in enumerate(sorted(os.listdir(path_run_folder)))
]

with mp.Pool(processes=4) as pool:

    # from https://stackoverflow.com/a/36590187
    # process_raw_csv is a notebook-level function, so it is referenced
    # directly; there is no `self` in this scope
    df_run = pool.map(process_raw_csv, file_info_list)

# the per-file frames are already collected, so the pool is no longer needed
df = pd.concat(df_run, ignore_index=True)

# put the metadata columns first, followed by the signal columns
col_names_ordered = ["id", "run", "file", "time_step"] + col_names
df = df[col_names_ordered]



In [None]:
def load_run_as_df(
    run_no: int,
    n_jobs: int = None,
) -> pd.DataFrame:
    """Load one of the three IMS runs as a single dataframe.

    The column names and folder of the run are read from the notebook-level
    loader ``ims``, and each file is parsed by the notebook-level function
    ``process_raw_csv``. Both must be defined before this function is called.

    Parameters
    ----------
    run_no : int
        Run to load. 1 selects the first run and 2 the second; any other
        value selects the third run.
    n_jobs : int, optional
        Number of worker processes. When None, ``mp.cpu_count() - 2`` is
        used. Values below 1 are raised to 1.

    Returns
    -------
    pd.DataFrame
        All samples of the run, with the columns "id", "run", "file" and
        "time_step" first, followed by the signal columns of the run.
    """

    # `ims` is the loader instance created earlier in the notebook; this is a
    # plain function, so there is no `self` to read these attributes from
    if run_no == 1:
        col_names = ims.col_1st_names
        path_run_folder = ims.path_1st_folder
    elif run_no == 2:
        col_names = ims.col_2nd_names
        path_run_folder = ims.path_2nd_folder
    else:
        col_names = ims.col_3rd_names
        path_run_folder = ims.path_3rd_folder

    # get list of every file in the folder, sorted by name
    # NOTE(review): the original comment says this orders by ascending date,
    # which presumably relies on timestamp-like file names -- confirm
    file_list = sorted(os.listdir(path_run_folder))

    # create a list of dictionaries containing the metadata for each file
    file_info_list = [
        {
            "path_run_folder": path_run_folder,
            "file_name": file_name,
            "sample_freq": 20480.0,
            "col_names": col_names,
            "run_no": run_no,
            "sample_index": i,
        }
        for i, file_name in enumerate(file_list)
    ]

    # get number of cpu cores, always keeping at least one worker
    if n_jobs is None:
        n_jobs = mp.cpu_count() - 2
    n_jobs = max(n_jobs, 1)

    # load the dataframes in parallel
    with mp.Pool(processes=n_jobs) as pool:

        # from https://stackoverflow.com/a/36590187
        df_run = pool.map(process_raw_csv, file_info_list)

    df = pd.concat(df_run, ignore_index=True)

    # metadata columns first, then the signal columns
    col_names_ordered = ["id", "run", "file", "time_step"] + col_names

    return df[col_names_ordered]