In [7]:
import os
import glob
import json
import shutil
import functools
import pandas as pd


In [8]:
# Glob pattern for the dataset root folder(s). Annotest appends
# "/*/pnoistor_*" to this value to enumerate the individual files.
DATA_PATH = "./DATA_/pnoistor_*"

In [9]:
def mkdir(p):
    """Create directory ``p`` together with any missing parent directories.

    Args:
        p: Path of the directory to create.

    Returns:
        0 if ``p`` already existed, 1 if it was created by this call.
    """
    if os.path.exists(p):
        return 0
    # os.makedirs (not os.mkdir) so nested targets such as "./report/annotest"
    # work on a fresh checkout where "./report" does not exist yet.
    os.makedirs(p, exist_ok=True)
    return 1

In [10]:
class Annotest:
    """Index dataset files by the fields encoded in their names and report
    annotation problems (missing annotation files, unknown labels, unpaired
    breath labels).

    File names are expected to look like
    ``<app_code>-<sub_id>-<file_class>-<file_ID>-<comment>.<file_format>``.
    """

    # Separator between the fields encoded in a file name.
    SEP = "-"

    # Column names of the per-file table, keyed by a short alias.
    # The first six are parsed from the file name; the last three are derived
    # from the full name / path (see ``file_dict``).
    fkeys = {
        "APP_CODE": "app_code",
        "SID":"sub_id",
        "FCLASS": "file_class",
        "FID": "file_ID",
        "COMNT": "comment",
        "FFMT": "file_format",
        "FNAME": "file_name",
        "FPATH": "file_path",
        "FMATCH": "file_match"
    }

    ANOT_FILE_TYPES = ["txt", "csv"]
    # Labels accepted in audio annotation files; anything else is reported
    # by ``get_aud_anot_typos``.
    ANOT_LABELS = ["aa", "ee", "uu", "oo", "ii", "xx", "bb1", "bb2", "bb3", "bb4"]
    # Columns of a tab-separated audio annotation file, in file order.
    ANOT_COLUMNS = ["start", "end", "label"]

    ALL_FILES_DF: pd.DataFrame
    ALL_METADATA_DF: pd.DataFrame
    ALL_AUDIO_ANOTE_DF: pd.DataFrame

    # (number of subjects, unique subject ids as returned by pd.unique)
    ALL_SUBJECTS_LIST: tuple[int, list]

    # Regex fragments that must ALL occur in a file name for the file to
    # belong to the given category (see ``filter_file_df``).
    META_FKEYS = ["META", "json"]
    AUDIO_FKEYS = ["BA_", "wav|WAV"]
    AUDIO_ANOT_FKEYS = ["BA_", "txt"]
    PFT_FKEYS = ["PFT_", "pdf"]
    PFT_ANOT_FKEYS = ["PFT_", "tsv|csv"]

    # Folder that ``check_anots`` writes its CSV reports into.
    REPORT_FOLDER = "./report/annotest"

    def __init__(self, dataset_path: str) -> None:
        """Scan ``dataset_path`` and build the file, metadata and annotation tables.

        Args:
            dataset_path: Dataset root (may itself contain glob wildcards);
                files are searched with ``{dataset_path}/*/pnoistor_*``.
        """
        # glob returns paths in arbitrary, OS-dependent order; sort so the
        # tables and the generated reports are reproducible between runs.
        all_files = sorted(glob.glob(f"{dataset_path}/*/pnoistor_*"))

        self.ALL_FILES_DF = self.make_files_df(all_files)
        self.ALL_SUBJECTS_LIST = self.get_subject_list()

        self.ALL_METADATA_DF = self.make_metadata_df()
        self.ALL_AUDIO_ANOTE_DF = self.make_aud_anot_df()

    def make_files_df(self, all_files: list) -> pd.DataFrame:
        """Return one row per path in ``all_files`` with the ``fkeys`` columns."""
        files = [self.file_dict(fp) for fp in all_files]
        # Passing ``columns`` keeps the schema even when no file was found, so
        # the column lookups done by the other methods do not raise KeyError.
        return pd.DataFrame(files, columns=list(self.fkeys.values()))

    def make_metadata_df(self) -> pd.DataFrame:
        """Return the metadata-file rows extended with each file's JSON biodata.

        On a key clash the value read from the JSON file wins.
        """
        meta_file_df = self.filter_file_df(self.META_FKEYS)

        meta_data = [
            mf.to_dict() | self.read_json(mf[self.fkeys["FPATH"]])
            for _, mf in meta_file_df.iterrows()
        ]

        return pd.DataFrame(meta_data)

    def make_aud_anot_df(self):
        """Return all audio annotation rows, each prefixed with its file's columns.

        Returns an empty frame with the expected columns when there are no
        annotation rows at all (instead of raising from ``pd.concat``).
        """
        anot_file_df = self.filter_file_df(self.AUDIO_ANOT_FKEYS)

        frames = [self.read_aud_anot(af) for _, af in anot_file_df.iterrows()]
        # Skip files without rows: they add nothing to the result.
        frames = [frame for frame in frames if not frame.empty]
        if not frames:
            return pd.DataFrame(
                columns=[*self.fkeys.values(), *self.ANOT_COLUMNS, "line_number", "dur"]
            )
        # Index is deliberately not reset: each file keeps its own 0..n-1 index.
        return pd.concat(frames)

    def read_aud_anot(self, finfo: pd.Series):
        """Read one tab-separated annotation file (``start``, ``end``, ``label``).

        Args:
            finfo: A row of the file table; ``finfo["file_path"]`` is read.

        Returns:
            A frame with the file columns repeated on every row, followed by
            ``start``, ``end``, ``label``, ``line_number`` (1-based row number)
            and ``dur`` (``end - start``). A file without rows gives an empty
            frame with the same columns.
        """
        try:
            df = pd.read_csv(
                finfo[self.fkeys["FPATH"]], sep="\t", names=list(self.ANOT_COLUMNS)
            )
        except pd.errors.EmptyDataError:
            # Zero-byte annotation file: treat it as "no annotations".
            df = pd.DataFrame({
                "start": pd.Series(dtype=float),
                "end": pd.Series(dtype=float),
                "label": pd.Series(dtype=object),
            })
        df["line_number"] = range(1, len(df) + 1)
        df["dur"] = df["end"] - df["start"]

        # Repeat the file info once per annotation row. Building it from a list
        # of dicts (rather than concatenating len(df) one-row frames) also
        # works when len(df) == 0.
        file_cols = pd.DataFrame([finfo.to_dict()] * len(df), columns=list(finfo.index))
        return file_cols.join(df)

    def read_json(self, fpath: str) -> dict:
        """Return the ``"subjectBiodata"`` object stored in the JSON file ``fpath``."""
        # Explicit UTF-8: do not depend on the platform's default encoding.
        with open(fpath, "r", encoding="utf-8") as m:
            return json.load(m)["subjectBiodata"]

    def file_dict(self, fpath: str) -> dict:
        """Split a file path into the ``fkeys`` fields.

        ``file_name``, ``file_path`` and ``file_match`` are always set from the
        path itself; ``file_format`` is the text after the last dot. The
        remaining fields are the ``SEP``-separated tokens of the name, in
        order: missing tokens become ``None`` and surplus tokens stay in
        ``comment``. For a name with exactly five tokens and one extension the
        result is the plain positional split.

        Args:
            fpath: Path of the file.

        Returns:
            Dict with one entry per value of ``fkeys``.
        """
        _, fname = os.path.split(fpath)
        match_key = fname.split(".")[0]

        stem, dot, ext = fname.rpartition(".")
        if not dot:
            # No extension at all: the whole name is the stem.
            stem, ext = fname, None

        stem_keys = [
            self.fkeys[alias] for alias in ("APP_CODE", "SID", "FCLASS", "FID", "COMNT")
        ]
        # maxsplit keeps any surplus separators inside the last field instead
        # of shifting every following column by one.
        tokens = stem.replace(".", self.SEP).split(self.SEP, len(stem_keys) - 1)

        fdict = dict.fromkeys(self.fkeys.values())
        fdict.update(zip(stem_keys, tokens))
        fdict[self.fkeys["FFMT"]] = ext
        fdict[self.fkeys["FNAME"]] = fname
        fdict[self.fkeys["FPATH"]] = fpath
        fdict[self.fkeys["FMATCH"]] = match_key

        return fdict

    def get_files_df(self) -> pd.DataFrame:
        """Return the table of all indexed files."""
        return self.ALL_FILES_DF

    def get_subject_list(self) -> tuple:
        """Return ``(count, ids)`` for the unique subject ids in the file table."""
        s = pd.unique(self.ALL_FILES_DF[self.fkeys["SID"]])
        return (len(s), s)

    def get_subject_file_df(self, sid: str) -> pd.DataFrame:
        """Return the file-table rows whose subject id equals ``sid``."""
        filt = self.ALL_FILES_DF[self.fkeys["SID"]] == sid
        return self.ALL_FILES_DF.loc[filt]

    def filter_file_df(self, keys: list) -> pd.DataFrame:
        """Return a copy of the file rows whose name matches every entry of ``keys``.

        Args:
            keys: Non-empty list of regular expressions (``str.contains``
                semantics), e.g. ``["BA_", "wav|WAV"]``.
        """
        filts = [self.ALL_FILES_DF[self.fkeys["FNAME"]].str.contains(k) for k in keys]
        filt = functools.reduce(lambda p, c: p & c, filts)

        return self.ALL_FILES_DF.loc[filt].copy()

    # Tests

    def get_aud_anot_typos(self):
        """Return the annotation rows whose label is not in ``ANOT_LABELS``."""
        a_df = self.ALL_AUDIO_ANOTE_DF
        filt = ~a_df["label"].isin(self.ANOT_LABELS)
        return a_df[filt]

    def get_pairless_breath(self):
        """Return "ii"/"xx" rows of VBA_ files that do not start an "ii","xx" pair.

        Per file, the rows whose label contains "ii" or "xx" are taken in file
        order and read as consecutive pairs; the first row of every pair that
        is not "ii" followed by "xx" is returned. Two helper columns are added:
        ``shift_cat_label`` (label + next label) and ``shift_dif_dur`` (next
        start - this end). Because pairs are positional, one stray label also
        flags the pairs after it in the same file.
        """
        anot = self.ALL_AUDIO_ANOTE_DF
        fname_col = self.fkeys["FNAME"]

        # Both masks are the same for every file, so build them once.
        # na=False: a missing label is simply not a breath label.
        is_breath = anot["label"].str.contains("ii|xx", na=False)
        is_vba = anot[fname_col].str.contains("VBA_", na=False)
        breath = anot.loc[is_breath & is_vba]

        pairless = []
        # sort=False keeps files in their order of first appearance.
        for _, rows in breath.groupby(fname_col, sort=False):
            df = rows.reset_index()
            df["shift_cat_label"] = df["label"] + df["label"].shift(-1)
            df["shift_dif_dur"] = df["start"].shift(-1) - df["end"]

            # Even positions are the first element of each pair.
            df_even = df.iloc[::2]
            pairless.append(df_even[df_even["shift_cat_label"] != 'iixx'])

        if not pairless:
            # Nothing to concatenate: return an empty frame with the same columns.
            return breath.reset_index().assign(
                shift_cat_label=pd.Series(dtype=object),
                shift_dif_dur=pd.Series(dtype=float),
            )
        return pd.concat(pairless)

    def get_anotless(self, k_file, k_anot):
        """Return the files selected by ``k_file`` that have no partner selected by ``k_anot``.

        Files are paired on ``file_match`` (the name up to its first dot). The
        result comes from a left merge, so its columns carry ``_x`` (file) and
        ``_y`` (annotation) suffixes.
        """
        f1 = self.filter_file_df(k_file)
        f2 = self.filter_file_df(k_anot)

        fm = f1.merge(f2, how="left", on=[self.fkeys["FMATCH"]])
        k = self.fkeys["FNAME"]
        return fm.loc[fm[f"{k}_y"].isna()].copy()

    def get_anotless_aud(self):
        """Return the audio files that have no annotation text file."""
        return self.get_anotless(self.AUDIO_FKEYS, self.AUDIO_ANOT_FKEYS)

    def get_anotless_pft(self):
        """Return the PFT pdf files that have no tsv/csv annotation file."""
        return self.get_anotless(self.PFT_FKEYS, self.PFT_ANOT_FKEYS)

    def check_anots(self):
        """Run every check and write one CSV per check into ``REPORT_FOLDER``."""
        # REPORT_FOLDER is nested, so the parent folders must be created too;
        # exist_ok makes repeated runs safe.
        os.makedirs(self.REPORT_FOLDER, exist_ok=True)

        _a = self.get_anotless_aud()
        _a.to_csv(f"{self.REPORT_FOLDER}/missing_anotes.csv", index=False, columns=['app_code_x', 'sub_id_x', 'file_class_x', 'file_name_x'])

        _a2 = self.get_aud_anot_typos()
        _a2.to_csv(f"{self.REPORT_FOLDER}/anot_typos.csv", index=False, columns=['app_code', 'sub_id', 'file_name', 'label', 'line_number'])

        _a3 = self.get_pairless_breath()
        _a3.to_csv(f"{self.REPORT_FOLDER}/pairless_breath.csv", index=False, columns=['app_code', 'sub_id', 'file_name', 'label', 'line_number'])

        _b = self.get_anotless_pft()
        _b.to_csv(f"{self.REPORT_FOLDER}/missing_pft.csv", index=False, columns=['app_code_x', 'sub_id_x', 'file_class_x', 'file_name_x'])


        





# Index every file under DATA_PATH and load the metadata / audio annotations.
pnoiannotest = Annotest(DATA_PATH)

# Run all checks and write the CSV reports into Annotest.REPORT_FOLDER.
pnoiannotest.check_anots()


In [11]:
# Re-run the checks; this overwrites the CSV reports written by the previous cell.
pnoiannotest.check_anots()


In [12]:
def mkdir(p):
    """Create directory ``p`` together with any missing parent directories.

    Args:
        p: Path of the directory to create.

    Returns:
        0 if ``p`` already existed, 1 if it was created by this call.
    """
    if os.path.exists(p):
        return 0
    # os.makedirs (not os.mkdir) so nested targets such as "./to_anote/pft"
    # work when "./to_anote" does not exist yet.
    os.makedirs(p, exist_ok=True)
    return 1

In [13]:
def export_toanot(row, to_anote_folder="./to_anote/pft"):
    """Copy the file referenced by ``row["file_path_x"]`` into ``to_anote_folder``.

    Args:
        row: Mapping / Series with a ``"file_path_x"`` entry (a row of the
            frame returned by ``Annotest.get_anotless_*``).
        to_anote_folder: Destination folder; created, including parents, when
            it does not exist.

    Returns:
        None.
    """
    # makedirs + exist_ok: the destination is nested and this function is
    # called once per row, so creation must be idempotent.
    os.makedirs(to_anote_folder, exist_ok=True)

    shutil.copy(row["file_path_x"], to_anote_folder)


# Copy every PFT file that has no annotation file into the to-annotate folder.
# apply() is used only for its side effect, so the displayed result is a Series of None.
pnoiannotest.get_anotless_pft().apply(export_toanot, axis=1)


0     None
1     None
2     None
6     None
7     None
25    None
29    None
dtype: object

In [14]:
def export_toanot(row, to_anote_folder="./to_anote/audio"):
    """Copy the file referenced by ``row["file_path_x"]`` into ``to_anote_folder``.

    Args:
        row: Mapping / Series with a ``"file_path_x"`` entry (a row of the
            frame returned by ``Annotest.get_anotless_*``).
        to_anote_folder: Destination folder; created, including parents, when
            it does not exist.

    Returns:
        None.
    """
    # makedirs + exist_ok: the destination is nested and this function is
    # called once per row, so creation must be idempotent.
    os.makedirs(to_anote_folder, exist_ok=True)

    shutil.copy(row["file_path_x"], to_anote_folder)


# Copy every audio file that has no annotation file into the to-annotate folder.
# apply() is used only for its side effect, so the displayed result is a Series of None.
pnoiannotest.get_anotless_aud().apply(export_toanot, axis=1)

0      None
1      None
2      None
3      None
4      None
5      None
6      None
7      None
8      None
9      None
10     None
11     None
12     None
13     None
14     None
45     None
46     None
47     None
48     None
49     None
50     None
51     None
52     None
53     None
78     None
107    None
119    None
120    None
121    None
122    None
123    None
139    None
140    None
141    None
142    None
143    None
191    None
192    None
196    None
dtype: object

In [15]:
# Sanity check: does every annotation row point at a file that exists on disk?
# NOTE: despite its name, filt_nofile is True where the file EXISTS; it is
# negated on the next line to select the rows whose file is missing.
filt_nofile = pnoiannotest.ALL_AUDIO_ANOTE_DF.apply(lambda x: os.path.exists(x["file_path"]), axis=1)
pnoiannotest.ALL_AUDIO_ANOTE_DF[~filt_nofile]

Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path,file_match,start,end,label,line_number,dur


In [16]:
# (number of subjects, array of unique subject ids) as built by get_subject_list().
# NOTE(review): the rendered output lists subject identifiers - clear it before sharing the notebook.
pnoiannotest.ALL_SUBJECTS_LIST


(33,
 array(['maitribrambhatt_540264cd', 'aditis_96917e0d',
        'ashwinraikar_4f6053c3', 'sagarkumar_b77e6507',
        'vijayaomkar_b866b679', 'saikeerthanaarun_3364bc1a',
        'kumarchowdam_53f32e31', 'sannashoukat_5213fe84', 'manju_e54090da',
        'rayniva_f3e542e1', 'anitajoby_8bf5d711', 'sujatan_bdd161b6',
        'royjoseph_4136005e', 'kamakshi_e7a7aae6', 'nagaraju_0b600fb0',
        'sananaushad_80e84b51', 'shreyamgupta_78aa423a',
        'oshinsaha_357a6a6b', 'darshanvshettar_843c416b',
        'atifahmed_ebb5032c', 'lokeshk_90b4871a',
        'sharmisthachakrabarti_80531fda', 'amartyaveer_81b8f33c',
        'rahulkumar_710a4b74', 'jesurajabandekar_7273cc8f',
        'muralikadambi_3895019a', 'nanjazhakig_7876351b',
        'johnkiranborugada_e390538c', 'saurabhkumar_92bef9b7',
        'mohammadroshan_269b6326', 'shivaganga_46c140d9',
        'sagayamary_258e8be2', 'valli_82ee8284'], dtype=object))