In [1]:
import os
import glob
import pandas as pd
import numpy as np
import constants as const
from tqdm import tqdm

<h4>Dataset folder</h4>

In [2]:
CENTER_FOLDER = os.path.join(const.DATASET_FOLDER, const.CENTER)
LEFT_FOLDER = os.path.join(const.DATASET_FOLDER, const.LEFT)
RIGHT_FOLDER = os.path.join(const.DATASET_FOLDER, const.RIGHT)

In [3]:
list_center_sensor = os.listdir(CENTER_FOLDER)
list_left_sensor = os.listdir(LEFT_FOLDER)
list_right_sensor = os.listdir(RIGHT_FOLDER)

In [4]:
print(len(list_center_sensor))
print(len(list_left_sensor))
print(len(list_right_sensor))

504
496
494


In [5]:
intersect = list(set(list_center_sensor) & set(list_left_sensor) & set(list_right_sensor))

In [6]:
len(intersect)

490

In [7]:
intersect[0]

'T0_Id016020.actLabel'

In [8]:
list_center_sensor = sorted([os.path.join(CENTER_FOLDER, sess) for sess in intersect])
list_left_sensor = sorted([os.path.join(LEFT_FOLDER, sess) for sess in intersect])
list_right_sensor = sorted([os.path.join(RIGHT_FOLDER, sess) for sess in intersect])

In [9]:
list_center_sensor[0]

'/home/anhtt163/Documents/BKHN/materials/statistical_machine_learning/dataset/IneritialGaitActionDataset/CenterSensor/T0_Id000000.actLabel'

In [10]:
class Stats:
    def __init__(self):
        self.freq = const.FREQ
        self.mapping_labels = const.MAPPING_LABELS
        # containers
        self.dfs = {}
        self.dur_by_act = {const.CENTER: {}, const.LEFT: {}, const.RIGHT: {}}
        
    def set_dur(self, df, pos):
        df[const.GROUP] = (df[const.LABEL].diff().fillna(0) != 0).cumsum()
        for group, df in df.groupby([const.GROUP]):
            assert len(df[const.LABEL].unique()) == 1, "There are many labels in this group"
            label = self.mapping_labels[df.iloc[0][const.LABEL]]
            dur = len(df) / self.freq  # convert to sec
            if label not in self.dur_by_act[pos]:
                self.dur_by_act[pos][label] = [dur]
            else:
                self.dur_by_act[pos][label].append(dur)
    
    def stat_dur(self, func=np.mean):
        for pos, dict_acts in stats.dur_by_act.items():
            print(pos)
            for act, durs in dict_acts.items():
                print(f"{act}:\t {func(durs):.2f}s")
            print()
            
    def set_df(self, path, *df):
        self.dfs[path] = pd.concat(df, axis=1)
        

class Processor:
    def __init__(self, parent_folder):
        self.parent_folder = parent_folder

    @staticmethod
    def process_window(df, freq_threshold=const.FREQ_THRESHOLD):
        src_lbl = f"{const.CENTER}_{const.LABEL}"
        dst_lbl = const.LABEL
        
        # get frequency of each label
        lbl, cnt = np.unique(df[src_lbl], return_counts=True)
        freq = cnt / np.sum(cnt)
        max_idx = np.argmax(freq)
        # return None if not meet threshold
        if freq[max_idx] < freq_threshold:
            return None
        
        # get most frequent as label
        df[src_lbl] = lbl[max_idx]
        
        # get necessary cols only
        columns = [src_lbl] + [f"{pos}_{sensor}" for pos in const.POS for sensor in const.SENSORS ]
        df = df[columns].rename(columns={src_lbl: dst_lbl})
        
        return df

    @staticmethod
    def sliding_window(df, window_size, step):
        len_df = len(df)
        start_indexes = range(0, len_df, step)
        for start_index in start_indexes:
            end_index = start_index + window_size
            if end_index >= len_df:
                break
            yield df.iloc[start_index: end_index]

    def save_window(self, df, sess, counter):
        if df is not None:
            path = os.path.join(self.parent_folder, sess, f"{counter:08d}.csv")
            os.makedirs(os.path.dirname(path), exist_ok=True)
            df.to_csv(path, index=False)
    
    def run(self, window_size=const.WINDOW_SIZE, step=const.STEP):
        # TODO: add multiple threads
        for sess, df in tqdm(stats.dfs.items()):
            for counter, cut_df in enumerate(self.sliding_window(df=df, window_size=window_size, step=step)):
                self.save_window(df=self.process_window(cut_df), sess=sess, counter=counter)

In [11]:
stats = Stats()
for paths in zip(list_center_sensor, list_left_sensor, list_right_sensor):
    df_container = []
    # TODO: add multiple threads
    for path in paths:
        pos_path = os.path.dirname(path)
        pos = os.path.basename(pos_path)
        assert pos in const.POS, f"{pos} not in {const.POS}"
        df = pd.read_csv(path, skiprows=1, delimiter="\t")
        # do statistics
        stats.set_dur(df, pos)    
        # concat
        df.columns = [f"{pos}_{col}" for col in df.columns]
        df_container.append(df)
    
    session = os.path.basename(paths[0])
    stats.set_df(session, *df_container)

In [12]:
stats.stat_dur(func=np.mean)

CenterSensor
level_walk:	 5.25s
invalid:	 2.93s
upstairs:	 1.37s
slope_down:	 2.10s
slope_up:	 2.04s
downstairs:	 1.20s

LeftSensor
level_walk:	 4.11s
invalid:	 2.94s
upstairs:	 1.37s
slope_down:	 2.10s
slope_up:	 2.04s
downstairs:	 1.20s

RightSensor
level_walk:	 4.09s
invalid:	 2.93s
upstairs:	 1.37s
slope_down:	 2.10s
slope_up:	 2.04s
downstairs:	 1.20s



In [13]:
len(stats.dfs)

490

In [14]:
Processor(const.WINDOWS_FOLDER).run()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 490/490 [01:02<00:00,  7.79it/s]
