# Pnoi Anotest

> Notebook for validating the Pnoi dataset and its corresponding annotations, reporting any potential errors or inconsistencies. This script aids in ensuring the quality and integrity of the dataset by identifying issues that may require attention.

### File Nomenclature

![File Nomenclature](../../media/pnoi-file_nomenclature_md.png)


In [79]:
import pandas as pd
import functools
import shutil
import json
import glob
import os

mkdir = lambda p: 0 if os.path.exists(p) else (os.makedirs(p), 1)[1]


### STATE VARIABLES

In [80]:
IS_DRY_RUN = False
IS_DUMMY_PATH = False

### PATHS

In [81]:
DATA_PATH = "DATA_DUMMY" if IS_DUMMY_PATH else "DATA_PNOISTOR"

REPORT_FOLDER = "./report/anotest"; mkdir(REPORT_FOLDER)

0

### Anotest Class

In [82]:
class Anotest:
    VER = "*"
    SEP = "-"
    META_SEP = "_"
    EXT_SEP = "."
    ANOT_LABELS = ["aa", "ee", "uu", "oo", "ii", "xx", "bb1", "bb2", "bb3", "bb4"]

    fkeys = {
        "APP_CODE": "app_code",
        "SID":"sub_id",
        "FCLASS": "file_class",
        "FID": "file_ID",
        "COMNT": "comment",
        "FFMT": "file_format",
        "FNAME": "file_name",
        "FPATH": "file_path",
        "FMATCH": "file_match"
    }

    def __init__(self, path: str, ver="*") -> None:
        self.VER = ver

        all_files = glob.glob(f"{path}/pnoistor_{self.VER}/*/pnoistor_*")

        self.ALL_FILES_DF = self.make_files_df(all_files)


    def file_dict(self, fpath: str) -> dict:
        _, fname = os.path.split(fpath)

        match_key = fname.split(self.EXT_SEP)[0]

        fitems = fname.replace(self.EXT_SEP, self.SEP, 1).split(self.SEP) 
        fitems += [fname, fpath, match_key]
        fdict = dict(zip(self.fkeys.values(), fitems))
        return fdict
    
    def make_files_df(self, all_files: list) -> pd.DataFrame:
        files = [self.file_dict(fp) for fp in all_files]
        return pd.DataFrame(files)


    # Getters
    def get_subject_list(self) -> list: 
        return self.ALL_FILES_DF["sub_id"].unique().tolist()
    
    def get_subject_files(self, sid: str) -> pd.DataFrame:
        return self.ALL_FILES_DF[self.ALL_FILES_DF["sub_id"] == sid]
    
    def get_subject_metadata(self, sid: str) -> pd.DataFrame:
        return self.ALL_METADATA_DF[self.ALL_METADATA_DF["sub_id"] == sid]
    
    def filter_file_df(self, keys: list) -> pd.DataFrame:

        df = self.ALL_FILES_DF.copy()
        filts = [df[self.fkeys["FNAME"]].str.contains(k) for k in keys]
        filt = functools.reduce(lambda p, c: p & c, filts)

        return df.loc[filt]

    # Checkers
    report_columns = ['app_code_data', 'sub_id_data', 'file_name_data', 'file_path_data']

    def check_anotless_audio(self) -> pd.DataFrame:

        AUDIO_FKEYS = ["BA_", "wav|WAV"]
        AUDIO_ANOT_FKEYS = ["BA_", "txt"]

        audio_df = self.filter_file_df(AUDIO_FKEYS)
        anote_df = self.filter_file_df(AUDIO_ANOT_FKEYS)

        fm = audio_df.merge(anote_df, how="left", on=[self.fkeys["FMATCH"]], suffixes=("_data", "_anot"))

        k = self.fkeys["FNAME"]

        return fm.loc[fm[f"{k}_anot"].isna()][self.report_columns]
    
    def check_anotless_pft(self) -> pd.DataFrame:

        PFT_FKEYS = ["PFT_", "pdf"]
        PFT_ANOT_FKEYS = ["PFT_", "tsv|csv"]

        audio_df = self.filter_file_df(PFT_FKEYS)
        anote_df = self.filter_file_df(PFT_ANOT_FKEYS)

        fm = audio_df.merge(anote_df, how="left", on=[self.fkeys["FMATCH"]], suffixes=("_data", "_anot"))

        k = self.fkeys["FNAME"]

        return fm.loc[fm[f"{k}_anot"].isna()][self.report_columns]
    
    def check_missing_metadata(self) -> pd.DataFrame:
        META_FKEYS = ["META", ".meta.json"]

        metadata_df = self.filter_file_df(META_FKEYS)

        all_subjects = self.get_subject_list()
        _subjects = metadata_df["sub_id"].unique().tolist()

        missing_subjects = list(set(all_subjects) - set(_subjects))

    
        return missing_subjects
    

    
    def read_aud_anot(self, finfo: pd.Series):
        df = pd.read_csv(finfo[self.fkeys["FPATH"]], sep="\t", names=["start", "end", "label"])
        df["line_number"] = range(1, len(df) + 1)
        df["dur"] = df["end"] - df["start"]
        return pd.concat([finfo.to_frame().T]*len(df), ignore_index=True, axis=0).join(df)
    
    def get_pairless_breath(self):

        AUDIO_ANOT_FKEYS = ["BA_", "txt"]

        audio_anot_file_df = self.filter_file_df(AUDIO_ANOT_FKEYS)
        aud_anot = [
            self.read_aud_anot(af)
            for _, af in audio_anot_file_df.iterrows()
        ]
        
        audio_anot_df = pd.concat(aud_anot).copy()

        filt = ~audio_anot_df["label"].isin(self.ANOT_LABELS)

        pairless = [audio_anot_df[filt]]
        for file_name in audio_anot_df["file_name"].unique().tolist():
            
            filt = (audio_anot_df["file_name"] == file_name) 
            filt &= (audio_anot_df["label"].str.contains("ii|xx")) 

            df = audio_anot_df.loc[filt].reset_index()
            df["shift_cat_label"] = df["label"] + df["label"].shift(-1)
            df["shift_dif_dur"] = df["end"].shift(-1) - df["start"]

            df_even = df.iloc[::2]
            pairless_df = df_even[df_even["shift_cat_label"] != 'iixx']
            pairless.append(pairless_df.head(1))

        return pd.concat(pairless)

anotest = Anotest(DATA_PATH, ver="*")
anotest.ALL_FILES_DF.iloc[1].to_dict()


{'app_code': 'pnoistor_jun2023',
 'sub_id': 'maitribrambhatt_540264cd',
 'file_class': 'META',
 'file_ID': '4122',
 'comment': 'comnt',
 'file_format': 'meta.json',
 'file_name': 'pnoistor_jun2023-maitribrambhatt_540264cd-META-4122-comnt.meta.json',
 'file_path': 'DATA_PNOISTOR/pnoistor_jun2023/maitribrambhatt_540264cd/pnoistor_jun2023-maitribrambhatt_540264cd-META-4122-comnt.meta.json',
 'file_match': 'pnoistor_jun2023-maitribrambhatt_540264cd-META-4122-comnt'}

In [83]:
anotest.check_anotless_audio()

Unnamed: 0,app_code_data,sub_id_data,file_name_data,file_path_data
0,pnoistor_jun2023,maitribrambhatt_540264cd,pnoistor_jun2023-maitribrambhatt_540264cd-LBA_...,DATA_PNOISTOR/pnoistor_jun2023/maitribrambhatt...
1,pnoistor_jun2023,maitribrambhatt_540264cd,pnoistor_jun2023-maitribrambhatt_540264cd-LBA_...,DATA_PNOISTOR/pnoistor_jun2023/maitribrambhatt...
2,pnoistor_jun2023,maitribrambhatt_540264cd,pnoistor_jun2023-maitribrambhatt_540264cd-LBA_...,DATA_PNOISTOR/pnoistor_jun2023/maitribrambhatt...
3,pnoistor_jun2023,maitribrambhatt_540264cd,pnoistor_jun2023-maitribrambhatt_540264cd-VBA_...,DATA_PNOISTOR/pnoistor_jun2023/maitribrambhatt...
4,pnoistor_jun2023,maitribrambhatt_540264cd,pnoistor_jun2023-maitribrambhatt_540264cd-LBA_...,DATA_PNOISTOR/pnoistor_jun2023/maitribrambhatt...
5,pnoistor_jun2023,aditis_96917e0d,pnoistor_jun2023-aditis_96917e0d-LBA_before_RL...,DATA_PNOISTOR/pnoistor_jun2023/aditis_96917e0d...
6,pnoistor_jun2023,aditis_96917e0d,pnoistor_jun2023-aditis_96917e0d-LBA_before_LL...,DATA_PNOISTOR/pnoistor_jun2023/aditis_96917e0d...
7,pnoistor_jun2023,aditis_96917e0d,pnoistor_jun2023-aditis_96917e0d-LBA_before_RU...,DATA_PNOISTOR/pnoistor_jun2023/aditis_96917e0d...
8,pnoistor_jun2023,aditis_96917e0d,pnoistor_jun2023-aditis_96917e0d-LBA_before_LU...,DATA_PNOISTOR/pnoistor_jun2023/aditis_96917e0d...
9,pnoistor_jun2023,aditis_96917e0d,pnoistor_jun2023-aditis_96917e0d-VBA_before-56...,DATA_PNOISTOR/pnoistor_jun2023/aditis_96917e0d...


In [84]:
anotest.check_anotless_pft()

Unnamed: 0,app_code_data,sub_id_data,file_name_data,file_path_data
0,pnoistor_jun2023,maitribrambhatt_540264cd,pnoistor_jun2023-maitribrambhatt_540264cd-PFT_...,DATA_PNOISTOR/pnoistor_jun2023/maitribrambhatt...
1,pnoistor_jun2023,aditis_96917e0d,pnoistor_jun2023-aditis_96917e0d-PFT_before-ca...,DATA_PNOISTOR/pnoistor_jun2023/aditis_96917e0d...
2,pnoistor_jun2023,ashwinraikar_4f6053c3,pnoistor_jun2023-ashwinraikar_4f6053c3-PFT_bef...,DATA_PNOISTOR/pnoistor_jun2023/ashwinraikar_4f...
6,pnoistor_dec01,manju_e54090da,pnoistor_dec01-manju_e54090da-PFT_before-9d28-...,DATA_PNOISTOR/pnoistor_dec01/manju_e54090da/pn...
7,pnoistor_dec01,manju_e54090da,pnoistor_dec01-manju_e54090da-PFT_after-c118-c...,DATA_PNOISTOR/pnoistor_dec01/manju_e54090da/pn...
25,pnoistor_oct07,sharmisthachakrabarti_80531fda,pnoistor_oct07-sharmisthachakrabarti_80531fda-...,DATA_PNOISTOR/pnoistor_oct07/sharmisthachakrab...
29,pnoistor_oct07,jesurajabandekar_7273cc8f,pnoistor_oct07-jesurajabandekar_7273cc8f-PFT_b...,DATA_PNOISTOR/pnoistor_oct07/jesurajabandekar_...


In [85]:
anotest.check_missing_metadata()

[]

In [86]:
anotest.get_pairless_breath()

Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match,start,end,label,line_number,dur,index,shift_cat_label,shift_dif_dur
