In [1]:
import glob
import json
import pandas as pd
import os
import functools


In [2]:
DATA_PATH = "./DATA"

In [3]:
class Annotest:
    SEP = "-"
    # FKEYS = ["app_code 0", "sub_id 1", "file_class 2", "file_ID 3", "comment 4", "file_format 5", "file_name 6", "file_path 7", "file_match 8"]
    fkeys = {
        "APP_CODE": "app_code",
        "SID":"sub_id",
        "FCLASS": "file_class",
        "FID": "file_ID",
        "COMNT": "comment",
        "FFMT": "file_format",
        "FNAME": "file_name",
        "FPATH": "file_path",
        "FMATCH": "file_match"
    }

    ANOT_FILE_TYPES = ["txt", "csv"]
    ANOT_LABELS = ["aa", "ee", "uu", "oo", "ii", "xx", "bb1", "bb2", "bb3", "bb4"]
    ALL_FILES_DF: pd.DataFrame
    ALL_METADATA_DF: pd.DataFrame
    ALL_AUDIO_ANOTE_DF: pd.DataFrame

    ALL_SUBJECTS_LIST: tuple[int, list]

    def __init__(self, dataset_path: str) -> None:
        all_files = glob.glob(f"{dataset_path}/*/pnoistor_*")

        self.ALL_FILES_DF = self.make_files_df(all_files)
        self.ALL_SUBJECTS_LIST = self.get_subject_list()
        
        self.ALL_METADATA_DF = self.make_metadata_df()
        self.ALL_AUDIO_ANOTE_DF = self.make_aud_anot_df()


    def make_files_df(self, all_files: list) -> pd.DataFrame:
        files = [self.file_dict(fp) for fp in all_files]
        return pd.DataFrame(files)

    def make_metadata_df(self) -> pd.DataFrame:
        meta_file_df = self.filter_file_df(self.META_FKEYS)

        meta_data = [
            mf.to_dict() | self.read_json(mf[self.fkeys["FPATH"]])
            for _, mf in meta_file_df.iterrows()
        ]

        return pd.DataFrame(meta_data).copy()
    
    def make_aud_anot_df(self):
        anot_file_df = self.filter_file_df(self.AUDIO_ANOT_FKEYS)

        aud_anot = [
            self.read_aud_anot(af)
            for _, af in anot_file_df.iterrows()
        ]
        return pd.concat(aud_anot).copy()

    def read_aud_anot(self, finfo: pd.Series):
        df = pd.read_csv(finfo[self.fkeys["FPATH"]], sep="\t", names=["start", "end", "label"])
        return pd.concat([finfo.to_frame().T]*len(df), ignore_index=True, axis=0).join(df)


    def read_json(self, fpath: str) -> dict:
        with open(fpath, "r") as m:
            return json.load(m)["subjectBiodata"]

    def file_dict(self, fpath: str) -> dict:
        _, fname = os.path.split(fpath)
        match_key = fname.split(".")[0]
        fitems = fname.replace(".", self.SEP).split(self.SEP) + [fname, fpath, match_key]
        fdict = dict(zip(self.fkeys.values(), fitems))

        return fdict

    def get_files_df(self) -> pd.DataFrame:
        return self.ALL_FILES_DF

    def get_subject_list(self) -> list:
        s = pd.unique(self.ALL_FILES_DF[self.fkeys["SID"]])
        return (len(s), s)

    def get_subject_file_df(self, sid: str) -> list:
        filt = self.ALL_FILES_DF[self.fkeys["SID"]] == sid
        return self.ALL_FILES_DF.loc[filt]

    def filter_file_df(self, keys: list) -> pd.DataFrame:

        filts = [self.ALL_FILES_DF[self.fkeys["FNAME"]].str.contains(k) for k in keys]
        filt = functools.reduce(lambda p, c: p & c, filts)

        return self.ALL_FILES_DF.loc[filt].copy()


    # Tests
    META_FKEYS = ["META", "json"]
    AUDIO_FKEYS = ["BA_", "wav"]
    AUDIO_ANOT_FKEYS = ["BA_", "txt"]
    PFT_FKEYS = ["PFT_", "pdf"]
    PFT_ANOT_FKEYS = ["PFT_", "csv"]
    
    def get_aud_anot_typos(self):
        a_df = self.make_aud_anot_df()
        filt = ~a_df["label"].isin(self.ANOT_LABELS)
        return a_df[filt]

    def get_anotless(self, k_file, k_anot):
        f1 = self.filter_file_df(k_file)
        f2 = self.filter_file_df(k_anot)

        fm = f1.merge(f2, how="left", on=[self.fkeys["FMATCH"]])
        k = self.fkeys["FNAME"]
        return fm.loc[fm[f"{k}_y"].isna()].copy()

    def get_anotless_aud(self):
        return self.get_anotless(self.AUDIO_FKEYS, self.AUDIO_ANOT_FKEYS)
    
    def get_anotless_pft(self):
        return self.get_anotless(self.PFT_FKEYS, self.PFT_ANOT_FKEYS)
    
    def check_aud_anot_labels(self):
        pass



pnoiannotest = Annotest(DATA_PATH)

# annotest.get_anotless_aud()

pnoiannotest.get_aud_anot_typos()


Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match,start,end,label
0,pnoistor_feb2023,axxvlsa_beb71fe9,LBA_before_RU,daf1,comnt,txt,pnoistor_feb2023-axxvlsa_beb71fe9-LBA_before_R...,./DATA/axxvlsa_beb71fe9/pnoistor_feb2023-axxvl...,pnoistor_feb2023-axxvlsa_beb71fe9-LBA_before_R...,2.5,39.3,bb


In [4]:
pnoiannotest.ALL_AUDIO_ANOTE_DF


Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match,start,end,label
0,pnoistor_feb2023,ugcnnpt_ba9e41ad,LBA_before_LU,d7de,comnt,txt,pnoistor_feb2023-ugcnnpt_ba9e41ad-LBA_before_L...,./DATA/ugcnnpt_ba9e41ad/pnoistor_feb2023-ugcnn...,pnoistor_feb2023-ugcnnpt_ba9e41ad-LBA_before_L...,2.5,39.3,bb1
0,pnoistor_feb2023,ugcnnpt_ba9e41ad,LBA_before_LL,5f2c,comnt,txt,pnoistor_feb2023-ugcnnpt_ba9e41ad-LBA_before_L...,./DATA/ugcnnpt_ba9e41ad/pnoistor_feb2023-ugcnn...,pnoistor_feb2023-ugcnnpt_ba9e41ad-LBA_before_L...,2.5,39.3,bb3
0,pnoistor_feb2023,ugcnnpt_ba9e41ad,LBA_after_LU,6229,comnt,txt,pnoistor_feb2023-ugcnnpt_ba9e41ad-LBA_after_LU...,./DATA/ugcnnpt_ba9e41ad/pnoistor_feb2023-ugcnn...,pnoistor_feb2023-ugcnnpt_ba9e41ad-LBA_after_LU...,2.5,39.3,bb1
0,pnoistor_feb2023,ugcnnpt_ba9e41ad,VBA_before,f217,comnt,txt,pnoistor_feb2023-ugcnnpt_ba9e41ad-VBA_before-f...,./DATA/ugcnnpt_ba9e41ad/pnoistor_feb2023-ugcnn...,pnoistor_feb2023-ugcnnpt_ba9e41ad-VBA_before-f...,2.5,6.5,aa
1,pnoistor_feb2023,ugcnnpt_ba9e41ad,VBA_before,f217,comnt,txt,pnoistor_feb2023-ugcnnpt_ba9e41ad-VBA_before-f...,./DATA/ugcnnpt_ba9e41ad/pnoistor_feb2023-ugcnn...,pnoistor_feb2023-ugcnnpt_ba9e41ad-VBA_before-f...,6.5,10.5,ee
...,...,...,...,...,...,...,...,...,...,...,...,...
74,pnoistor_feb2023,nxoeahg_0b94e38f,VBA_before,3b0a,comnt,txt,pnoistor_feb2023-nxoeahg_0b94e38f-VBA_before-3...,./DATA/nxoeahg_0b94e38f/pnoistor_feb2023-nxoea...,pnoistor_feb2023-nxoeahg_0b94e38f-VBA_before-3...,370.5,374.5,oo
75,pnoistor_feb2023,nxoeahg_0b94e38f,VBA_before,3b0a,comnt,txt,pnoistor_feb2023-nxoeahg_0b94e38f-VBA_before-3...,./DATA/nxoeahg_0b94e38f/pnoistor_feb2023-nxoea...,pnoistor_feb2023-nxoeahg_0b94e38f-VBA_before-3...,302.5,374.5,bb4
0,pnoistor_feb2023,nxoeahg_0b94e38f,LBA_before_LL,822f,comnt,txt,pnoistor_feb2023-nxoeahg_0b94e38f-LBA_before_L...,./DATA/nxoeahg_0b94e38f/pnoistor_feb2023-nxoea...,pnoistor_feb2023-nxoeahg_0b94e38f-LBA_before_L...,2.5,39.3,bb3
0,pnoistor_feb2023,nxoeahg_0b94e38f,LBA_before_RL,bf92,comnt,txt,pnoistor_feb2023-nxoeahg_0b94e38f-LBA_before_R...,./DATA/nxoeahg_0b94e38f/pnoistor_feb2023-nxoea...,pnoistor_feb2023-nxoeahg_0b94e38f-LBA_before_R...,2.5,39.3,bb4


In [8]:
# annotest.get_anotless()[annotest.get_anotless().value_counts() > 1]

pnoiannotest.get_anotless_aud() #["match_key"].value_counts().to_frame()

Unnamed: 0,app_code_x,sub_id_x,file_class_x,file_ID_x,comment_x,file_format_x,file_name_x,file_path_x,file_match,app_code_y,sub_id_y,file_class_y,file_ID_y,comment_y,file_format_y,file_name_y,file_path_y
2050,pnoistor_feb2023,yxyddvg_2b534070,LBA_after_LU,bfbf,comnt,wav,pnoistor_feb2023-yxyddvg_2b534070-LBA_after_LU...,./DATA/yxyddvg_2b534070/pnoistor_feb2023-yxydd...,pnoistor_feb2023-yxyddvg_2b534070-LBA_after_LU...,,,,,,,,
