In [87]:
import glob
import json
import pandas as pd
import os
import functools


In [88]:
# Root directory of the dataset, relative to the notebook's working directory.
# Annotest globs "<DATA_PATH>/*/pnoistor_*" beneath it.
DATA_PATH = "../../DATA/pnoistor_may2023"

In [112]:
class Annotest:
    """Index a dataset directory and run annotation consistency checks on it.

    On construction every file matching ``<dataset_path>/*/pnoistor_*`` is
    indexed into ``ALL_FILES_DF`` (one row per file). Metadata JSON files and
    audio annotation files are then loaded into ``ALL_METADATA_DF`` and
    ``ALL_AUDIO_ANOTE_DF``.

    File names are tokenised on ``SEP`` and ``"."``; the tokens are assigned
    positionally to: app_code, sub_id, file_class, file_ID, comment,
    file_format.
    """

    SEP = "-"

    # Column names of the file index, keyed by short aliases.
    fkeys = {
        "APP_CODE": "app_code",
        "SID":"sub_id",
        "FCLASS": "file_class",
        "FID": "file_ID",
        "COMNT": "comment",
        "FFMT": "file_format",
        "FNAME": "file_name",
        "FPATH": "file_path",
        "FMATCH": "file_match"
    }
    # Aliases (in token order) of the fields that are parsed out of the file name.
    _NAME_FIELDS = ("APP_CODE", "SID", "FCLASS", "FID", "COMNT", "FFMT")
    # Columns of a tab-separated audio annotation file.
    _ANOT_COLUMNS = ["start", "end", "label"]

    ANOT_FILE_TYPES = ["txt", "csv"]
    ANOT_LABELS = ["aa", "ee", "uu", "oo", "ii", "xx", "bb1", "bb2", "bb3", "bb4"]

    # File-name filters: each entry is a regular expression (pandas
    # ``str.contains``); a file is selected when ALL entries match its name.
    META_FKEYS = ["META", "json"]
    AUDIO_FKEYS = ["BA_", "wav|WAV"]
    AUDIO_ANOT_FKEYS = ["BA_", "txt"]
    PFT_FKEYS = ["PFT_", "pdf"]
    PFT_ANOT_FKEYS = ["PFT_", "tsv|csv"]

    REPORT_FOLDER = "./report"

    ALL_FILES_DF: pd.DataFrame
    ALL_METADATA_DF: pd.DataFrame
    ALL_AUDIO_ANOTE_DF: pd.DataFrame

    # (number of unique subject ids, array of the unique subject ids)
    ALL_SUBJECTS_LIST: tuple[int, list]

    def __init__(self, dataset_path: str) -> None:
        """Index ``dataset_path`` and load metadata and audio annotations.

        An empty or missing dataset yields empty frames rather than an error.
        """
        all_files = glob.glob(f"{dataset_path}/*/pnoistor_*")

        self.ALL_FILES_DF = self.make_files_df(all_files)
        self.ALL_SUBJECTS_LIST = self.get_subject_list()

        self.ALL_METADATA_DF = self.make_metadata_df()
        self.ALL_AUDIO_ANOTE_DF = self.make_aud_anot_df()

    def make_files_df(self, all_files: list) -> pd.DataFrame:
        """Build the file index: one row per path, columns ``fkeys.values()``."""
        files = [self.file_dict(fp) for fp in all_files]
        # Passing the columns explicitly keeps the schema when no file was found.
        return pd.DataFrame(files, columns=list(self.fkeys.values()))

    def make_metadata_df(self) -> pd.DataFrame:
        """Return one row per metadata file: its index row merged with the
        ``subjectBiodata`` object read from the JSON file."""
        meta_file_df = self.filter_file_df(self.META_FKEYS)

        meta_data = [
            mf.to_dict() | self.read_json(mf[self.fkeys["FPATH"]])
            for _, mf in meta_file_df.iterrows()
        ]

        return pd.DataFrame(meta_data).copy()

    def make_aud_anot_df(self):
        """Read every audio annotation file and stack the rows in one frame.

        Returns an empty frame with the expected columns when there is no
        annotation file (``pd.concat`` refuses an empty list).
        """
        anot_file_df = self.filter_file_df(self.AUDIO_ANOT_FKEYS)

        frames = [self.read_aud_anot(af) for _, af in anot_file_df.iterrows()]
        if not frames:
            columns = list(anot_file_df.columns) + self._ANOT_COLUMNS + ["line_number", "dur"]
            return pd.DataFrame(columns=columns)
        return pd.concat(frames)

    def read_aud_anot(self, finfo: pd.Series):
        """Read one tab-separated annotation file (start, end, label).

        Adds ``line_number`` (1-based row position) and ``dur`` (end - start),
        and prefixes every row with the file-index fields from ``finfo``.
        A file without any row yields an empty frame with the same columns.
        """
        try:
            df = pd.read_csv(finfo[self.fkeys["FPATH"]], sep="\t", names=self._ANOT_COLUMNS)
        except pd.errors.EmptyDataError:
            df = pd.DataFrame(columns=self._ANOT_COLUMNS)

        df["line_number"] = range(1, len(df) + 1)
        df["dur"] = df["end"] - df["start"]

        # Repeat the single file-info row once per annotation row; selecting
        # zero positions is valid, unlike concatenating zero frames.
        file_info = finfo.to_frame().T
        repeated = file_info.iloc[[0] * len(df)].reset_index(drop=True)
        return repeated.join(df)

    def read_json(self, fpath: str) -> dict:
        """Return the ``subjectBiodata`` object of the JSON file at ``fpath``."""
        # JSON interchange is UTF-8; do not depend on the platform default.
        with open(fpath, "r", encoding="utf-8") as m:
            return json.load(m)["subjectBiodata"]

    def file_dict(self, fpath: str) -> dict:
        """Parse a file path into a dict keyed by ``fkeys.values()``.

        The file name is tokenised on ``SEP`` and ``"."`` and the tokens fill
        the six name fields positionally. Missing tokens are left as ``None``
        and surplus tokens are ignored, so ``file_name``, ``file_path`` and
        ``file_match`` are always correct even for unexpected names.
        """
        _, fname = os.path.split(fpath)
        match_key = fname.split(".")[0]
        tokens = fname.replace(".", self.SEP).split(self.SEP)

        name_keys = [self.fkeys[field] for field in self._NAME_FIELDS]
        fdict = dict.fromkeys(name_keys)
        fdict.update(zip(name_keys, tokens))

        fdict[self.fkeys["FNAME"]] = fname
        fdict[self.fkeys["FPATH"]] = fpath
        fdict[self.fkeys["FMATCH"]] = match_key
        return fdict

    def get_files_df(self) -> pd.DataFrame:
        """Return the file index."""
        return self.ALL_FILES_DF

    def get_subject_list(self) -> tuple:
        """Return ``(count, unique_subject_ids)`` from the file index."""
        s = pd.unique(self.ALL_FILES_DF[self.fkeys["SID"]])
        return (len(s), s)

    def get_subject_file_df(self, sid: str) -> pd.DataFrame:
        """Return the file-index rows belonging to subject ``sid``."""
        filt = self.ALL_FILES_DF[self.fkeys["SID"]] == sid
        return self.ALL_FILES_DF.loc[filt]

    def filter_file_df(self, keys: list) -> pd.DataFrame:
        """Return a copy of the file-index rows whose file name matches every
        regular expression in ``keys`` (an empty ``keys`` selects all rows)."""
        names = self.ALL_FILES_DF[self.fkeys["FNAME"]]
        mask = pd.Series(True, index=self.ALL_FILES_DF.index, dtype=bool)
        for key in keys:
            mask &= names.str.contains(key, na=False).astype(bool)

        return self.ALL_FILES_DF.loc[mask].copy()

    # Tests

    def get_aud_anot_typos(self):
        """Return annotation rows whose label is not in ``ANOT_LABELS``.

        The annotation files are re-read from disk here instead of using
        ``ALL_AUDIO_ANOTE_DF``.
        """
        a_df = self.make_aud_anot_df()
        filt = ~a_df["label"].isin(self.ANOT_LABELS)
        return a_df[filt]

    def get_pairless_breath(self):
        """Return "ii"/"xx" rows of ``VBA_`` files that break the ii-xx pairing.

        Per file, the rows whose label contains "ii" or "xx" are taken in
        order; every even-positioned row must be "ii" directly followed by
        "xx". The even-positioned rows for which that does not hold are
        returned (with the pre-reset row index in column ``index``).
        """
        anot = self.ALL_AUDIO_ANOTE_DF
        fname_col = self.fkeys["FNAME"]

        is_breath = anot["label"].str.contains("ii|xx", na=False).astype(bool)
        is_vba = anot[fname_col].str.contains("VBA_", na=False).astype(bool)
        candidates = anot.loc[is_breath & is_vba]

        pairless = []
        # sort=False keeps files in order of first appearance.
        for _, events in candidates.groupby(fname_col, sort=False):
            events = events.reset_index()
            events["shift_cat_label"] = events["label"] + events["label"].shift(-1)
            pair_starts = events.iloc[::2]
            pairless.append(pair_starts[pair_starts["shift_cat_label"] != "iixx"])

        if not pairless:
            # Nothing to check: return an empty frame with the usual columns.
            empty = candidates.iloc[0:0].reset_index()
            empty["shift_cat_label"] = pd.Series(dtype=object)
            return empty

        return pd.concat(pairless)

    def get_anotless(self, k_file, k_anot):
        """Return files selected by ``k_file`` that have no file selected by
        ``k_anot`` with the same ``file_match`` key.

        Columns of the two sides carry the ``_x`` / ``_y`` merge suffixes.
        """
        f1 = self.filter_file_df(k_file)
        f2 = self.filter_file_df(k_anot)

        fm = f1.merge(f2, how="left", on=[self.fkeys["FMATCH"]])
        k = self.fkeys["FNAME"]
        # After a left merge the right-hand name is NaN where nothing matched.
        return fm.loc[fm[f"{k}_y"].isna()].copy()

    def get_anotless_aud(self):
        """Return audio files without a matching annotation file."""
        return self.get_anotless(self.AUDIO_FKEYS, self.AUDIO_ANOT_FKEYS)

    def get_anotless_pft(self):
        """Return PFT pdf files without a matching tsv/csv file."""
        return self.get_anotless(self.PFT_FKEYS, self.PFT_ANOT_FKEYS)

    def check_anots(self):
        """Run all checks and write one CSV per check into ``REPORT_FOLDER``."""
        # Use the configured folder (not a hard-coded path) and tolerate an
        # existing directory without a separate existence check.
        os.makedirs(self.REPORT_FOLDER, exist_ok=True)

        _a = self.get_anotless_aud()
        _a.to_csv(f"{self.REPORT_FOLDER}/missing_anotes.csv", index=False, columns=['app_code_x', 'sub_id_x', 'file_class_x', 'file_name_x'])

        _a2 = self.get_aud_anot_typos()
        _a2.to_csv(f"{self.REPORT_FOLDER}/anot_typos.csv", index=False, columns=['app_code', 'sub_id', 'file_name', 'label'])

        _a3 = self.get_pairless_breath()
        _a3.to_csv(f"{self.REPORT_FOLDER}/pairless_breath.csv", index=False, columns=['app_code', 'sub_id', 'file_name', 'label', 'line_number'])

        _b = self.get_anotless_pft()
        _b.to_csv(f"{self.REPORT_FOLDER}/missing_pft.csv", index=False, columns=['app_code_x', 'sub_id_x', 'file_class_x', 'file_name_x'])


        





# Index the dataset found under DATA_PATH.
pnoiannotest = Annotest(DATA_PATH)

# Run every check and write the CSV reports into Annotest.REPORT_FOLDER.
pnoiannotest.check_anots()


In [90]:
# True for annotation rows whose source file still exists on disk
# (despite the name); the display below shows the rows where it does not.
filt_nofile = pnoiannotest.ALL_AUDIO_ANOTE_DF["file_path"].map(os.path.exists)
pnoiannotest.ALL_AUDIO_ANOTE_DF.loc[~filt_nofile]

Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match,start,end,label,dur


In [91]:
# (number of unique subject ids, array of the unique subject ids)
# NOTE(review): the rendered output lists subject identifiers — consider clearing it before sharing.
pnoiannotest.ALL_SUBJECTS_LIST

(5,
 array(['mohammadroshan_269b6326', 'shivaganga_46c140d9',
        'anitajoby_8bf5d711', 'sagayamary_258e8be2', 'valli_82ee8284'],
       dtype=object))

In [92]:
# Keep only the rows labelled with "ii"/"xx" (across all annotation files) and
# compare each row with its successor: concatenated labels and the gap between
# this row's end and the next row's start.
filt = pnoiannotest.ALL_AUDIO_ANOTE_DF["label"].str.contains("ii|xx")
df = (
    pnoiannotest.ALL_AUDIO_ANOTE_DF.loc[filt]
    .reset_index()
    .assign(
        shift_cat_label=lambda rows: rows["label"] + rows["label"].shift(-1),
        shift_dif_dur=lambda rows: rows["start"].shift(-1) - rows["end"],
    )
)

# Distinct label pairs seen at the even positions.
df["shift_cat_label"].iloc[::2].unique()

array(['iixx'], dtype=object)

In [105]:
# Rows of VBA_ annotation files where an "ii" is not directly followed by "xx".
pnoiannotest.get_pairless_breath()

Unnamed: 0,index,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match,start,end,label,dur,shift_cat_label


In [94]:

# For every "ii" row, print whether the row right after it is labelled "xx".
# Comparing against the shifted column avoids indexing past the end of the
# frame: a trailing "ii" without a successor prints False instead of raising
# IndexError.
labels = df["label"]
followed_by_xx = labels.shift(-1) == "xx"
for is_paired in followed_by_xx[labels == "ii"]:
    print(is_paired)



True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [95]:
# Number of distinct subject ids. The value is not rendered, because a cell
# only displays its last expression.
len(pd.unique(pnoiannotest.ALL_FILES_DF["sub_id"]))

# The complete file index.
pnoiannotest.ALL_FILES_DF

Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match
0,pnoistor_may2023,mohammadroshan_269b6326,LBA_before_RL,fc9a,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...
1,pnoistor_may2023,mohammadroshan_269b6326,LBA_before_LU,6f06,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...
2,pnoistor_may2023,mohammadroshan_269b6326,LBA_before_RU,58c7,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...
3,pnoistor_may2023,mohammadroshan_269b6326,VBA_after,5903,comnt,WAV,pnoistor_may2023-mohammadroshan_269b6326-VBA_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-VBA_a...
4,pnoistor_may2023,mohammadroshan_269b6326,PFT_after,5b4f,comnt,pdf,pnoistor_may2023-mohammadroshan_269b6326-PFT_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-PFT_a...
...,...,...,...,...,...,...,...,...,...
72,pnoistor_may2023,valli_82ee8284,LBA_before_LL,4b19,comnt,wav,pnoistor_may2023-valli_82ee8284-LBA_before_LL-...,../../DATA/pnoistor_may2023/valli_82ee8284/pno...,pnoistor_may2023-valli_82ee8284-LBA_before_LL-...
73,pnoistor_may2023,valli_82ee8284,LBA_before_LU,6a5c,comnt,wav,pnoistor_may2023-valli_82ee8284-LBA_before_LU-...,../../DATA/pnoistor_may2023/valli_82ee8284/pno...,pnoistor_may2023-valli_82ee8284-LBA_before_LU-...
74,pnoistor_may2023,valli_82ee8284,PFT_before,4748,comnt,pdf,pnoistor_may2023-valli_82ee8284-PFT_before-474...,../../DATA/pnoistor_may2023/valli_82ee8284/pno...,pnoistor_may2023-valli_82ee8284-PFT_before-474...
75,pnoistor_may2023,valli_82ee8284,META,18a6,comnt,json,pnoistor_may2023-valli_82ee8284-META-18a6-comn...,../../DATA/pnoistor_may2023/valli_82ee8284/pno...,pnoistor_may2023-valli_82ee8284-META-18a6-comnt


In [96]:
# Audio files (wav/WAV) that have no annotation txt file with the same file_match key.
pnoiannotest.get_anotless_aud()


Unnamed: 0,app_code_x,sub_id_x,file_class_x,file_ID_x,comment_x,file_format_x,file_name_x,file_path_x,file_match,app_code_y,sub_id_y,file_class_y,file_ID_y,comment_y,file_format_y,file_name_y,file_path_y
0,pnoistor_may2023,mohammadroshan_269b6326,LBA_before_RL,fc9a,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,,,,,,,,
1,pnoistor_may2023,mohammadroshan_269b6326,LBA_before_LU,6f06,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,,,,,,,,
2,pnoistor_may2023,mohammadroshan_269b6326,LBA_before_RU,58c7,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,,,,,,,,
3,pnoistor_may2023,mohammadroshan_269b6326,VBA_after,5903,comnt,WAV,pnoistor_may2023-mohammadroshan_269b6326-VBA_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-VBA_a...,,,,,,,,
4,pnoistor_may2023,mohammadroshan_269b6326,LBA_after_LL,b239,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,,,,,,,,
5,pnoistor_may2023,mohammadroshan_269b6326,LBA_after_RU,1cbe,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,,,,,,,,
6,pnoistor_may2023,mohammadroshan_269b6326,LBA_before_LL,d11e,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_b...,,,,,,,,
7,pnoistor_may2023,mohammadroshan_269b6326,VBA_before,ac57,comnt,WAV,pnoistor_may2023-mohammadroshan_269b6326-VBA_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-VBA_b...,,,,,,,,
8,pnoistor_may2023,mohammadroshan_269b6326,LBA_after_RL,7c2d,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,,,,,,,,
9,pnoistor_may2023,mohammadroshan_269b6326,LBA_after_LU,3793,comnt,wav,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-LBA_a...,,,,,,,,


In [97]:
# PFT pdf files that have no tsv/csv file with the same file_match key.
pnoiannotest.get_anotless_pft()

Unnamed: 0,app_code_x,sub_id_x,file_class_x,file_ID_x,comment_x,file_format_x,file_name_x,file_path_x,file_match,app_code_y,sub_id_y,file_class_y,file_ID_y,comment_y,file_format_y,file_name_y,file_path_y
0,pnoistor_may2023,mohammadroshan_269b6326,PFT_after,5b4f,comnt,pdf,pnoistor_may2023-mohammadroshan_269b6326-PFT_a...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-PFT_a...,,,,,,,,
1,pnoistor_may2023,mohammadroshan_269b6326,PFT_before,95b3,comnt,pdf,pnoistor_may2023-mohammadroshan_269b6326-PFT_b...,../../DATA/pnoistor_may2023/mohammadroshan_269...,pnoistor_may2023-mohammadroshan_269b6326-PFT_b...,,,,,,,,
2,pnoistor_may2023,shivaganga_46c140d9,PFT_before,87de,comnt,pdf,pnoistor_may2023-shivaganga_46c140d9-PFT_befor...,../../DATA/pnoistor_may2023/shivaganga_46c140d...,pnoistor_may2023-shivaganga_46c140d9-PFT_befor...,,,,,,,,
3,pnoistor_may2023,shivaganga_46c140d9,PFT_after,a0ed,comnt,pdf,pnoistor_may2023-shivaganga_46c140d9-PFT_after...,../../DATA/pnoistor_may2023/shivaganga_46c140d...,pnoistor_may2023-shivaganga_46c140d9-PFT_after...,,,,,,,,
6,pnoistor_may2023,sagayamary_258e8be2,PFT_after,ecd3,comnt,pdf,pnoistor_may2023-sagayamary_258e8be2-PFT_after...,../../DATA/pnoistor_may2023/sagayamary_258e8be...,pnoistor_may2023-sagayamary_258e8be2-PFT_after...,,,,,,,,
7,pnoistor_may2023,sagayamary_258e8be2,PFT_before,ae9c,comnt,pdf,pnoistor_may2023-sagayamary_258e8be2-PFT_befor...,../../DATA/pnoistor_may2023/sagayamary_258e8be...,pnoistor_may2023-sagayamary_258e8be2-PFT_befor...,,,,,,,,
8,pnoistor_may2023,valli_82ee8284,PFT_after,1734,comnt,pdf,pnoistor_may2023-valli_82ee8284-PFT_after-1734...,../../DATA/pnoistor_may2023/valli_82ee8284/pno...,pnoistor_may2023-valli_82ee8284-PFT_after-1734...,,,,,,,,
9,pnoistor_may2023,valli_82ee8284,PFT_before,4748,comnt,pdf,pnoistor_may2023-valli_82ee8284-PFT_before-474...,../../DATA/pnoistor_may2023/valli_82ee8284/pno...,pnoistor_may2023-valli_82ee8284-PFT_before-474...,,,,,,,,
