# Pnoi Anotest

> Notebook for validating the Pnoi dataset and its corresponding annotations, reporting any potential errors or inconsistencies. This script aids in ensuring the quality and integrity of the dataset by identifying issues that may require attention.

### File Nomenclature

![File Nomenclature](../../media/pnoi-file_nomenclature_md.png)


In [49]:
import pandas as pd
import functools
import shutil
import json
import glob
import os

mkdir = lambda p: 0 if os.path.exists(p) else (os.makedirs(p), 1)[1]


### STATE VARIABLES

In [50]:
IS_DRY_RUN = False
IS_DUMMY_PATH = False

### PATHS

In [51]:
EXP_VER = "vF"

REPORTS = "0_pnoi-reports"

ANNOTEST = f"anotest_{EXP_VER}"

DATA_PATH = "DATA_DUMMY" if IS_DUMMY_PATH else "DATA_PNOISTOR"

REPORT_FOLDER = f"../{REPORTS}/{ANNOTEST}"; mkdir(REPORT_FOLDER)

0

### Anotest Class

In [52]:
class Anotest:
    VER = "*"
    SEP = "-"
    META_SEP = "_"
    EXT_SEP = "."
    ANOT_LABELS = ["aa", "ee", "uu", "oo", "ii", "xx", "bb1", "bb2", "bb3", "bb4"]

    fkeys = {
        "APP_CODE": "app_code",
        "SID":"sub_id",
        "FCLASS": "file_class",
        "FID": "file_ID",
        "COMNT": "comment",
        "FFMT": "file_format",
        "FNAME": "file_name",
        "FPATH": "file_path",
        "FMATCH": "file_match"
    }

    def __init__(self, path: str, ver="*") -> None:
        self.VER = ver

        all_files = glob.glob(f"{path}/pnoistor_{self.VER}/*/pnoistor_*")

        self.ALL_FILES_DF = self.make_files_df(all_files)


    def file_dict(self, fpath: str) -> dict:
        _, fname = os.path.split(fpath)

        match_key = fname.split(self.EXT_SEP)[0]

        fitems = fname.replace(self.EXT_SEP, self.SEP, 1).split(self.SEP) 
        fitems += [fname, fpath, match_key]
        fdict = dict(zip(self.fkeys.values(), fitems))
        return fdict
    
    def make_files_df(self, all_files: list) -> pd.DataFrame:
        files = [self.file_dict(fp) for fp in all_files]
        return pd.DataFrame(files)


    # Getters
    def get_subject_list(self) -> list: 
        return self.ALL_FILES_DF["sub_id"].unique().tolist()
    
    def get_subject_files(self, sid: str) -> pd.DataFrame:
        return self.ALL_FILES_DF[self.ALL_FILES_DF["sub_id"] == sid]
    
    def get_subject_metadata(self, sid: str) -> pd.DataFrame:
        return self.ALL_METADATA_DF[self.ALL_METADATA_DF["sub_id"] == sid]
    
    def filter_file_df(self, keys: list) -> pd.DataFrame:

        df = self.ALL_FILES_DF.copy()
        filts = [df[self.fkeys["FNAME"]].str.contains(k) for k in keys]
        filt = functools.reduce(lambda p, c: p & c, filts)

        return df.loc[filt]

    # Checkers
    # ---------------------------------------------------------------------------------------------------------------
    report_columns = ['app_code_data', 'sub_id_data', 'file_name_data', 'file_path_data']

    def check_anotless_audio(self, do_export=False) -> pd.DataFrame:

        AUDIO_FKEYS = ["BA_", "wav|WAV"]
        AUDIO_ANOT_FKEYS = ["BA_", "txt"]

        audio_df = self.filter_file_df(AUDIO_FKEYS)
        anote_df = self.filter_file_df(AUDIO_ANOT_FKEYS)

        fm = audio_df.merge(anote_df, how="left", on=[self.fkeys["FMATCH"]], suffixes=("_data", "_anot"))

        k = self.fkeys["FNAME"]

        result = fm.loc[fm[f"{k}_anot"].isna()][self.report_columns]
        
        # export
        if do_export:
            result.to_csv(f"{REPORT_FOLDER}/anotless_audio.csv", index=False)
        
        return result
    
    def check_anotless_pft(self, do_export=False) -> pd.DataFrame:

        PFT_FKEYS = ["PFT_", "pdf"]
        PFT_ANOT_FKEYS = ["PFT_", "tsv|csv"]

        audio_df = self.filter_file_df(PFT_FKEYS)
        anote_df = self.filter_file_df(PFT_ANOT_FKEYS)

        fm = audio_df.merge(anote_df, how="left", on=[self.fkeys["FMATCH"]], suffixes=("_data", "_anot"))

        k = self.fkeys["FNAME"]

        result = fm.loc[fm[f"{k}_anot"].isna()][self.report_columns]
        
        # export
        if do_export:
            result.to_csv(f"{REPORT_FOLDER}/anotless_pft.csv", index=False)
        
        return result
    
    def check_missing_metadata(self, do_export=False) -> pd.DataFrame:
        META_FKEYS = ["META", ".meta.json"]

        metadata_df = self.filter_file_df(META_FKEYS)

        all_subjects = self.get_subject_list()
        _subjects = metadata_df["sub_id"].unique().tolist()

        missing_subjects = list(set(all_subjects) - set(_subjects))
        
        result = missing_subjects
        
        # export
        if do_export:
            with open(f"{REPORT_FOLDER}/missing_metadata.json", "w") as f:
                json.dump(missing_subjects, f)

        return result
    
    def check_pairless_breath(self, do_export=False):

        AUDIO_ANOT_FKEYS = ["BA_", "txt"]

        audio_anot_file_df = self.filter_file_df(AUDIO_ANOT_FKEYS)
        aud_anot = [
            self.read_aud_anot(af)
            for _, af in audio_anot_file_df.iterrows()
        ]
        
        audio_anot_df = pd.concat(aud_anot).copy()

        filt = ~audio_anot_df["label"].isin(self.ANOT_LABELS)

        pairless = [audio_anot_df[filt]]
        for file_name in audio_anot_df["file_name"].unique().tolist():
            
            filt = (audio_anot_df["file_name"] == file_name) 
            filt &= (audio_anot_df["label"].str.contains("ii|xx")) 

            df = audio_anot_df.loc[filt].reset_index()
            df["shift_cat_label"] = df["label"] + df["label"].shift(-1)
            df["shift_dif_dur"] = df["end"].shift(-1) - df["start"]

            df_even = df.iloc[::2]
            result = df_even[df_even["shift_cat_label"] != 'iixx']
            pairless.append(result.head(1))

        result = pd.concat(pairless)
        
        # export
        if do_export:
            result.to_csv(f"{REPORT_FOLDER}/pairless_breath.csv", index=False)
        
        return result
    # ---------------------------------------------------------------------------------------------------------------
    

    
    def read_aud_anot(self, finfo: pd.Series):
        df = pd.read_csv(finfo[self.fkeys["FPATH"]], sep="\t", names=["start", "end", "label"])
        df["line_number"] = range(1, len(df) + 1)
        df["dur"] = df["end"] - df["start"]
        return pd.concat([finfo.to_frame().T]*len(df), ignore_index=True, axis=0).join(df)
    
    

anotest = Anotest(DATA_PATH, ver="*")
anotest.ALL_FILES_DF.iloc[1].to_dict()


{'app_code': 'pnoistor_oct07',
 'sub_id': 'johnkiranborugada_e390538c',
 'file_class': 'PFT_before',
 'file_ID': 'f866',
 'comment': 'carefusion',
 'file_format': 'tsv',
 'file_name': 'pnoistor_oct07-johnkiranborugada_e390538c-PFT_before-f866-carefusion.tsv',
 'file_path': 'DATA_PNOISTOR/pnoistor_oct07/johnkiranborugada_e390538c/pnoistor_oct07-johnkiranborugada_e390538c-PFT_before-f866-carefusion.tsv',
 'file_match': 'pnoistor_oct07-johnkiranborugada_e390538c-PFT_before-f866-carefusion'}

In [53]:
anotest.check_anotless_audio(do_export=True)

Unnamed: 0,app_code_data,sub_id_data,file_name_data,file_path_data
35,pnoistor_oct07,sharmisthachakrabarti_80531fda,pnoistor_oct07-sharmisthachakrabarti_80531fda-...,DATA_PNOISTOR/pnoistor_oct07/sharmisthachakrab...
36,pnoistor_oct07,sharmisthachakrabarti_80531fda,pnoistor_oct07-sharmisthachakrabarti_80531fda-...,DATA_PNOISTOR/pnoistor_oct07/sharmisthachakrab...
37,pnoistor_oct07,sharmisthachakrabarti_80531fda,pnoistor_oct07-sharmisthachakrabarti_80531fda-...,DATA_PNOISTOR/pnoistor_oct07/sharmisthachakrab...
38,pnoistor_oct07,sharmisthachakrabarti_80531fda,pnoistor_oct07-sharmisthachakrabarti_80531fda-...,DATA_PNOISTOR/pnoistor_oct07/sharmisthachakrab...
39,pnoistor_oct07,sharmisthachakrabarti_80531fda,pnoistor_oct07-sharmisthachakrabarti_80531fda-...,DATA_PNOISTOR/pnoistor_oct07/sharmisthachakrab...
90,pnoistor_feb2023,oshinsaha_357a6a6b,pnoistor_feb2023-oshinsaha_357a6a6b-LBA_before...,DATA_PNOISTOR/pnoistor_feb2023/oshinsaha_357a6...
91,pnoistor_feb2023,oshinsaha_357a6a6b,pnoistor_feb2023-oshinsaha_357a6a6b-LBA_before...,DATA_PNOISTOR/pnoistor_feb2023/oshinsaha_357a6...
92,pnoistor_feb2023,oshinsaha_357a6a6b,pnoistor_feb2023-oshinsaha_357a6a6b-VBA_before...,DATA_PNOISTOR/pnoistor_feb2023/oshinsaha_357a6...
93,pnoistor_feb2023,oshinsaha_357a6a6b,pnoistor_feb2023-oshinsaha_357a6a6b-LBA_before...,DATA_PNOISTOR/pnoistor_feb2023/oshinsaha_357a6...
94,pnoistor_feb2023,oshinsaha_357a6a6b,pnoistor_feb2023-oshinsaha_357a6a6b-LBA_before...,DATA_PNOISTOR/pnoistor_feb2023/oshinsaha_357a6...


In [54]:
anotest.check_anotless_pft(do_export=True)

Unnamed: 0,app_code_data,sub_id_data,file_name_data,file_path_data
1,pnoistor_oct07,jesurajabandekar_7273cc8f,pnoistor_oct07-jesurajabandekar_7273cc8f-PFT_b...,DATA_PNOISTOR/pnoistor_oct07/jesurajabandekar_...


In [55]:
anotest.check_missing_metadata(do_export=True)

[]

In [56]:
anotest.check_pairless_breath(do_export=True)

Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match,start,end,label,line_number,dur,index,shift_cat_label,shift_dif_dur
