In [3]:
import pandas as pd
import functools
import shutil
import json
import glob
import os

mkdir = lambda p: 0 if os.path.exists(p) else (os.makedirs(p), 1)[1]

### STATE VARIABLES

In [4]:
IS_DRY_RUN = False
IS_DUMMY_PATH = False

### DATA PATHS

In [6]:
# DATA_PATH = "report/audio_files_v5"

EXP_VER = "v12"
DATA_PATH = f"report_{EXP_VER}/asquire_data"
REPORT_FOLDER = f"report_{EXP_VER}/anotest"; mkdir(REPORT_FOLDER)

0

In [1]:
class DataStaticInfo:

    VER = "*"
    SEP = "_"
    META_SEP = "-"
    EXT_SEP = "."
    ANOT_HEADER = ['start', 'end', 'label']
    ANOT_LABELS = ['cc', 'ss', 'aa', 'yy', 'ee', 'ii', 'xx', 'zz', 'uu', 'oo', 'ii-n']
    ANOT_TAG = "anot--"

    def get_anot_tag(self, tag):
        return f"{self.ANOT_TAG}{tag}"

    fkeys = {
        "APP_CODE": "app_code", # 0
        "SID":"sub_id", # 1
        "FCLASS": "file_class", # 2
        "FCIDX": "file_xindex", # 3
        "SCORE": "score", # 4
        "FFMT": "file_format", # 5
        "FNAME": "file_name", # 6
        "FPATH": "file_path", # 7
        "FMATCH": "file_match" # 8
    }

### CLASS: ANNOTEST -> 
- anotless audio files
- missing metadata files
- pairless breath


In [7]:
class Anotest(DataStaticInfo):


    def __init__(self, path: str, ver="*") -> None:
        self.VER = ver

        all_files = glob.glob(f"{path}/*/{ver}")

        self.ALL_FILES_DF = self.make_files_df(all_files)



    def file_dict(self, fpath: str) -> dict:
        sub_path, fname = os.path.split(fpath)
        _, sub_id = os.path.split(sub_path)

        match_key = fname.split(self.EXT_SEP)[0]

        fitems = fname.replace(self.EXT_SEP, self.SEP, 1).split(self.SEP) 

        fitems[1] = sub_id
        fitems += [fname, fpath, match_key]
        
        fdict = dict(zip(self.fkeys.values(), fitems))
        return fdict
    
    def make_files_df(self, all_files: list) -> pd.DataFrame:
        files = [self.file_dict(fp) for fp in all_files]

        all_files_df = pd.DataFrame(files)

        all_files_df.to_csv(os.path.join(REPORT_FOLDER, "asq_all_files.csv"))

        return all_files_df


    # Getters
    def get_subject_list(self) -> list: 
        return self.ALL_FILES_DF["sub_id"].unique().tolist()
    
    def get_subject_files(self, sid: str) -> pd.DataFrame:
        return self.ALL_FILES_DF[self.ALL_FILES_DF["sub_id"] == sid]
    
    def get_subject_metadata(self, sid: str) -> pd.DataFrame:
        return self.ALL_METADATA_DF[self.ALL_METADATA_DF["sub_id"] == sid]
    
    def filter_file_df(self, keys: list) -> pd.DataFrame:

        df = self.ALL_FILES_DF.copy()
        filts = [df[self.fkeys["FNAME"]].str.contains(k) for k in keys]
        filt = functools.reduce(lambda p, c: p & c, filts)

        return df.loc[filt]

    # Checkers
    report_columns = ['app_code_data', 'sub_id_data', 'file_name_data', 'file_path_data']

    def check_anotless_audio(self) -> pd.DataFrame:

        AUDIO_FKEYS = ["wav"]
        AUDIO_ANOT_FKEYS = ["txt"]

        audio_df = self.filter_file_df(AUDIO_FKEYS)
        anote_df = self.filter_file_df(AUDIO_ANOT_FKEYS)

        fm = audio_df.merge(anote_df, how="left", on=[self.fkeys["FMATCH"]], suffixes=("_data", "_anot"))

        k = self.fkeys["FNAME"]
        anotless_df = fm.loc[fm[f"{k}_anot"].isna()][self.report_columns]

        anotless_df.to_csv(os.path.join(REPORT_FOLDER, "asq_anotless_audio_files.csv"))

        return anotless_df
    
    def check_missing_metadata(self) -> pd.DataFrame:
        META_FKEYS = ["meta-data", "json"]

        metadata_df = self.filter_file_df(META_FKEYS)

        all_subjects = self.get_subject_list()
        _subjects = metadata_df["sub_id"].unique().tolist()

        missing_subjects = list(set(all_subjects) - set(_subjects))

        if len(missing_subjects) == 0:
            print("No subject is missing metadata")

        lines = ""
        with open(os.path.join(REPORT_FOLDER, "asq_no-metadata_subs.txt"), 'w') as f:

            lines = [ './' + os.path.join(DATA_PATH, sub) + "\n" for sub in missing_subjects]
            f.writelines(lines)

        [print(l) for l in lines]
    
    def read_aud_anot(self, finfo: pd.Series):
        df = pd.read_csv(finfo[self.fkeys["FPATH"]], sep="\t", names=["start", "end", "label"])
        df["line_number"] = range(1, len(df) + 1)
        df["dur"] = df["end"] - df["start"]

        return pd.concat([finfo.to_frame().T]*len(df), ignore_index=True, axis=0).join(df)
    
    def get_aud_anot_df(self):
        AUDIO_ANOT_FKEYS = ["txt"]

        audio_anot_file_df = self.filter_file_df(AUDIO_ANOT_FKEYS)
        aud_anot = [
            self.read_aud_anot(af)
            for _, af in audio_anot_file_df.iterrows()
        ]
        
        audio_anot_df = pd.concat(aud_anot).copy()

        return audio_anot_df

    def get_pairless_breath(self):
        
        audio_anot_df = self.get_aud_anot_df()

        filt = ~audio_anot_df["label"].isin(self.ANOT_LABELS)

        pairless = [audio_anot_df[filt]] # pre-add foreign labels
        for file_name in audio_anot_df["file_name"].unique().tolist():
            
            filt = (audio_anot_df["file_name"] == file_name) 
            filt &= (audio_anot_df["label"].str.contains("ii|xx")) 

            df = audio_anot_df.loc[filt].reset_index()
            df["shift_cat_label"] = df["label"] + df["label"].shift(-1)
            df["shift_dif_dur"] = df["end"].shift(-1) - df["start"]

            df_even = df.iloc[::2]
            pairless_df = df_even[df_even["shift_cat_label"] != 'iixx']
            pairless.append(pairless_df.head(1))

        pairless_breath = pd.concat(pairless)

        pairless_breath.to_csv(os.path.join(REPORT_FOLDER, "asq_pairless_breath.csv"))


        return pairless_breath

anotest = Anotest(DATA_PATH, ver="*")

In [170]:
anotest.check_anotless_audio()

Unnamed: 0,app_code_data,sub_id_data,file_name_data,file_path_data
39,webapp-asquire-mox-koi,aditi-c09f732f,webapp-asquire-mox-koi_aditi-c09f732f_~noise~_...,report_v12/asquire_data/aditi-c09f732f/webapp-...
40,webapp-asquire-mox-koi,aditi-c09f732f,webapp-asquire-mox-koi_aditi-c09f732f_sent_0_0...,report_v12/asquire_data/aditi-c09f732f/webapp-...
41,webapp-asquire-mox-koi,aditi-c09f732f,webapp-asquire-mox-koi_aditi-c09f732f_breath_2...,report_v12/asquire_data/aditi-c09f732f/webapp-...
42,webapp-asquire-mox-koi,aditi-c09f732f,webapp-asquire-mox-koi_aditi-c09f732f_cough_3_...,report_v12/asquire_data/aditi-c09f732f/webapp-...
52,webapp-asquire-mox-koi,qwer-0177a438,webapp-asquire-mox-koi_qwer-0177a438_aa_4_0.wav,report_v12/asquire_data/qwer-0177a438/webapp-a...
...,...,...,...,...
1658,webapp-asquire-mox-koi,sharanya-9e75eccd,webapp-asquire-mox-koi_sharanya-9e75eccd_uu_6_...,report_v12/asquire_data/sharanya-9e75eccd/weba...
1659,webapp-asquire-mox-koi,jeevan-2e24c903,webapp-asquire-mox-koi_jeevan-2e24c903_cough_3...,report_v12/asquire_data/jeevan-2e24c903/webapp...
1660,webapp-asquire-mox-koi,jeevan-2e24c903,webapp-asquire-mox-koi_jeevan-2e24c903_~noise~...,report_v12/asquire_data/jeevan-2e24c903/webapp...
1661,webapp-asquire-mox-koi,jeevan-2e24c903,webapp-asquire-mox-koi_jeevan-2e24c903_sent_0_...,report_v12/asquire_data/jeevan-2e24c903/webapp...


In [171]:
class Anotest_reformat:
    VER = "*"
    SEP = "_"
    META_SEP = "-"
    EXT_SEP = "."
    ANOT_LABELS = ['cc', 'ss', 'aa', 'yy', 'ee', 'ii', 'xx', 'zz', 'uu', 'oo', 'ii-n']

    fkeys = {
        "APP_CODE": "app_code", # 0
        "SID":"sub_id", # 1
        "FCLASS": "file_class", # 2
        "FCIDX": "file_xindex", # 3
        "SCORE": "score", # 4
        "FFMT": "file_format", # 5
        "FNAME": "file_name", # 6
        "FPATH": "file_path", # 7
        "FMATCH": "file_match" # 8
    }

    def __init__(self, path: str, ver="*") -> None:
        self.VER = ver

        all_files = glob.glob(f"{path}/*/{ver}")

        self.ALL_FILES_DF = self.make_files_df(all_files)


    def file_dict(self, fpath: str) -> dict:
        sub_path, fname = os.path.split(fpath)
        path, sub_id = os.path.split(sub_path)

        match_key = fname.split(self.EXT_SEP)[0]

        fitems = fname.replace(self.EXT_SEP, self.SEP, 1).split(self.SEP) 

        

        # if "asquire" not in fitems[0]:
        #     '''webapp-kruthikasv_a2ac52c9-breath-0-asquire-mox.wav'''

        #     fitems = fname.replace(self.EXT_SEP, self.META_SEP, 1).split(self.META_SEP)
        #     fitems[0] += f"-{fitems[-3]}-{fitems[-2]}"
        #     fitems[1] = sub_id.replace("_", "-")
        #     # fitems[2] = fclass
        #     # fitems[3] = fidx
        #     fitems[4] = "NA"
        #     fitems.pop(-2)
        #     # fitems[5] = fext
        #     fitems += [fname, fpath, match_key]
        #     fname = "_".join(fitems[:6])
        #     fname = "_".join(fitems[:5]) + '.' + fitems[5]
        #     print(fname)

        #     npath = os.path.join(sub_path, fname)
        #     old_path = fpath

        #     self.rename(old_path, npath)

        # elif "meta-data" in fname:
        #     # ext = fitems.pop()
        #     # fitems += ["na", "na", ext, fname, fpath, match_key]

        #     fname = "_".join(fitems[:6])
        #     fname = "_".join(fitems[:5]) + '.' + ext
        #     print(fname)

        #     npath = os.path.join(sub_path, fname)
        #     old_path = fpath

        #     # self.rename(old_path, npath)

    
        fitems[1] = sub_id
        fitems += [fname, fpath, match_key]
        
        
        
        fdict = dict(zip(self.fkeys.values(), fitems))
        return fdict
    
    def make_files_df(self, all_files: list) -> pd.DataFrame:
        files = [self.file_dict(fp) for fp in all_files]
        return pd.DataFrame(files)
    
    def rename(self, old_filepath: str, new_filepath: str) -> None:
        os.rename(old_filepath, new_filepath)


    # Getters
    def get_subject_list(self) -> list: 
        return self.ALL_FILES_DF["sub_id"].unique().tolist()
    
    def get_subject_files(self, sid: str) -> pd.DataFrame:
        return self.ALL_FILES_DF[self.ALL_FILES_DF["sub_id"] == sid]
    
    def get_subject_metadata(self, sid: str) -> pd.DataFrame:
        return self.ALL_METADATA_DF[self.ALL_METADATA_DF["sub_id"] == sid]
    
    def filter_file_df(self, keys: list) -> pd.DataFrame:

        df = self.ALL_FILES_DF.copy()
        filts = [df[self.fkeys["FNAME"]].str.contains(k) for k in keys]
        filt = functools.reduce(lambda p, c: p & c, filts)

        return df.loc[filt]

    # Checkers
    report_columns = ['app_code_data', 'sub_id_data', 'file_name_data', 'file_path_data']

    def check_anotless_audio(self) -> pd.DataFrame:

        AUDIO_FKEYS = ["wav"]
        AUDIO_ANOT_FKEYS = ["txt"]

        audio_df = self.filter_file_df(AUDIO_FKEYS)
        anote_df = self.filter_file_df(AUDIO_ANOT_FKEYS)

        fm = audio_df.merge(anote_df, how="left", on=[self.fkeys["FMATCH"]], suffixes=("_data", "_anot"))

        k = self.fkeys["FNAME"]

        return fm.loc[fm[f"{k}_anot"].isna()][self.report_columns]
    
    def check_missing_metadata(self) -> pd.DataFrame:
        META_FKEYS = ["meta-data", "json"]

        metadata_df = self.filter_file_df(META_FKEYS)

        all_subjects = self.get_subject_list()
        _subjects = metadata_df["sub_id"].unique().tolist()

        missing_subjects = list(set(all_subjects) - set(_subjects))

        if len(missing_subjects) == 0:
            print("No subject is missing metadata")
    
        return missing_subjects
    
    def read_aud_anot(self, finfo: pd.Series):
        df = pd.read_csv(finfo[self.fkeys["FPATH"]], sep="\t", names=["start", "end", "label"])
        df["line_number"] = range(1, len(df) + 1)
        df["dur"] = df["end"] - df["start"]
        return pd.concat([finfo.to_frame().T]*len(df), ignore_index=True, axis=0).join(df)
    
    def get_aud_anot_df(self):
        AUDIO_ANOT_FKEYS = ["txt"]

        audio_anot_file_df = self.filter_file_df(AUDIO_ANOT_FKEYS)
        aud_anot = [
            self.read_aud_anot(af)
            for _, af in audio_anot_file_df.iterrows()
        ]
        
        audio_anot_df = pd.concat(aud_anot).copy()

        return audio_anot_df

    def get_pairless_breath(self):

        AUDIO_ANOT_FKEYS = ["BA_", "txt"]

        audio_anot_file_df = self.filter_file_df(AUDIO_ANOT_FKEYS)
        aud_anot = [
            self.read_aud_anot(af)
            for _, af in audio_anot_file_df.iterrows()
        ]
        
        audio_anot_df = pd.concat(aud_anot).copy()

        filt = ~audio_anot_df["label"].isin(self.ANOT_LABELS)

        pairless = [audio_anot_df[filt]]
        for file_name in audio_anot_df["file_name"].unique().tolist():
            
            filt = (audio_anot_df["file_name"] == file_name) 
            filt &= (audio_anot_df["label"].str.contains("ii|xx")) 

            df = audio_anot_df.loc[filt].reset_index()
            df["shift_cat_label"] = df["label"] + df["label"].shift(-1)
            df["shift_dif_dur"] = df["end"].shift(-1) - df["start"]

            df_even = df.iloc[::2]
            pairless_df = df_even[df_even["shift_cat_label"] != 'iixx']
            pairless.append(pairless_df.head(1))



        return pd.concat(pairless)

# anotest = Anotest(DATA_PATH, ver="*")
# anotest.ALL_FILES_DF.to_csv("test.csv")

In [172]:
anotest.check_missing_metadata()

./report_v12/asquire_data/khushi-a4d0638c

./report_v12/asquire_data/dhairyagupta-4487f377

./report_v12/asquire_data/gauri-b4dcd86a

./report_v12/asquire_data/doll-808ed97a

./report_v12/asquire_data/giridhar-2ccb7108

./report_v12/asquire_data/sharanya-9e75eccd

./report_v12/asquire_data/athiban-51fa28b5

./report_v12/asquire_data/saikeerthana-6ce37f43

./report_v12/asquire_data/cooldude-a1b88e9b

./report_v12/asquire_data/hereiskusal-52795f7d

./report_v12/asquire_data/bugsbunny-13f848d8

./report_v12/asquire_data/prajaathiban-758e9950

./report_v12/asquire_data/sarthaksengupta-b2b291cb

./report_v12/asquire_data/shruthi-b4e8e18d

./report_v12/asquire_data/aditi-c09f732f

./report_v12/asquire_data/jeevan-2e24c903

./report_v12/asquire_data/kingbugsbunny-32a3bde1

./report_v12/asquire_data/amar-c29d4707

./report_v12/asquire_data/rekha-de0e48cb

./report_v12/asquire_data/murugeswari-d700dff3

./report_v12/asquire_data/varun-8f78ac35

./report_v12/asquire_data/qwer-0177a438

./report_

In [173]:
anot_df = anotest.get_aud_anot_df()

In [174]:
anot_df

Unnamed: 0,app_code,sub_id,file_class,file_xindex,score,file_format,file_name,file_path,file_match,start,end,label,line_number,dur
0,webapp-asquire-mox,bhargavee-70c0073e,ss,11,2,txt,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2.txt,report_v12/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2,1.148886,7.382659,ss,1,6.233773
1,webapp-asquire-mox,bhargavee-70c0073e,ss,11,2,txt,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2.txt,report_v12/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2,9.659157,15.488692,ss,2,5.829535
2,webapp-asquire-mox,bhargavee-70c0073e,ss,11,2,txt,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2.txt,report_v12/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2,17.446054,22.658595,ss,3,5.212541
0,webapp-asquire-mox,bhargavee-70c0073e,cough,10,0,txt,webapp-asquire-mox_bhargavee-70c0073e_cough_10...,report_v12/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_cough_10_0,2.041492,2.503881,cc,1,0.462389
1,webapp-asquire-mox,bhargavee-70c0073e,cough,10,0,txt,webapp-asquire-mox_bhargavee-70c0073e_cough_10...,report_v12/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_cough_10_0,5.060108,5.592292,cc,2,0.532184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,webapp-asquire-mox,aditisatvika-dd09b820,aa,2,10,txt,webapp-asquire-mox_aditisatvika-dd09b820_aa_2_...,report_v12/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_aa_2_10,11.487568,21.274378,aa,2,9.786810
2,webapp-asquire-mox,aditisatvika-dd09b820,aa,2,10,txt,webapp-asquire-mox_aditisatvika-dd09b820_aa_2_...,report_v12/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_aa_2_10,23.631568,32.135351,aa,3,8.503783
0,webapp-asquire-mox,aditisatvika-dd09b820,ee,3,9,txt,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_...,report_v12/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_9,2.262486,17.569622,ee,1,15.307136
1,webapp-asquire-mox,aditisatvika-dd09b820,ee,3,9,txt,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_...,report_v12/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_9,19.266486,29.129514,ee,2,9.863028


In [175]:
anotest.get_pairless_breath()

Unnamed: 0,app_code,sub_id,file_class,file_xindex,score,file_format,file_name,file_path,file_match,start,end,label,line_number,dur,index,shift_cat_label,shift_dif_dur
2,webapp-asquire-mox,saniya-c3426e28,breath,0,0.0,txt,webapp-asquire-mox_saniya-c3426e28_breath_0_0.txt,report_v12/asquire_data/saniya-c3426e28/webapp...,webapp-asquire-mox_saniya-c3426e28_breath_0_0,4.463903,5.852141,ii-n,3,1.388238,2.0,ii-nxx,3.087632
0,webapp-asquire-mox,niharika-a8738865,breath,2,0.0,txt,webapp-asquire-mox_niharika-a8738865_breath_2_...,report_v12/asquire_data/niharika-a8738865/weba...,webapp-asquire-mox_niharika-a8738865_breath_2_0,1.162314,2.265924,ii-n,1,1.10361,0.0,ii-nxx,2.066335
0,webapp-asquire-mox,surabhi-3bce259b,breath,0,,txt,webapp-asquire-mox_surabhi-3bce259b_breath_0_N...,report_v12/asquire_data/surabhi-3bce259b/webap...,webapp-asquire-mox_surabhi-3bce259b_breath_0_NA,2.313243,4.492832,ii-n,1,2.179589,0.0,ii-nxx,4.318054
0,webapp-asquire-mox,sanjanak-17449261,breath,5,,txt,webapp-asquire-mox_sanjanak-17449261_breath_5_...,report_v12/asquire_data/sanjanak-17449261/weba...,webapp-asquire-mox_sanjanak-17449261_breath_5_NA,1.199773,3.817459,ii-n,1,2.617686,0.0,ii-nxx,6.726
0,webapp-asquire-mox,pavan-39ee4a48,breath,10,10.0,txt,webapp-asquire-mox_pavan-39ee4a48_breath_10_10...,report_v12/asquire_data/pavan-39ee4a48/webapp-...,webapp-asquire-mox_pavan-39ee4a48_breath_10_10,1.812097,3.685622,ii-n,1,1.873525,0.0,ii-nxx,4.852735
0,webapp-asquire-mox,amitraj-a4abaf74,breath,0,0.0,txt,webapp-asquire-mox_amitraj-a4abaf74_breath_0_0...,report_v12/asquire_data/amitraj-a4abaf74/webap...,webapp-asquire-mox_amitraj-a4abaf74_breath_0_0,2.944605,5.069578,ii-n,1,2.124973,0.0,ii-nxx,5.221363
0,webapp-asquire-mox,roopantj-c5e4e909,breath,2,0.0,txt,webapp-asquire-mox_roopantj-c5e4e909_breath_2_...,report_v12/asquire_data/roopantj-c5e4e909/weba...,webapp-asquire-mox_roopantj-c5e4e909_breath_2_0,1.261881,2.070551,ii-n,1,0.80867,0.0,ii-nxx,2.585968
0,webapp-asquire-mox,roopantj-c5e4e909,aa,4,0.0,txt,webapp-asquire-mox_roopantj-c5e4e909_aa_4_0.txt,report_v12/asquire_data/roopantj-c5e4e909/weba...,webapp-asquire-mox_roopantj-c5e4e909_aa_4_0,1.632649,3.727881,ii-n,1,2.095232,0.0,ii-nxx,4.789102
0,webapp-asquire-mox,gyamaryana-e2c7bce8,breath,2,0.0,txt,webapp-asquire-mox_gyamaryana-e2c7bce8_breath_...,report_v12/asquire_data/gyamaryana-e2c7bce8/we...,webapp-asquire-mox_gyamaryana-e2c7bce8_breath_2_0,1.288995,4.371373,ii,1,3.082378,0.0,iiii,5.268064
8,webapp-asquire-mox,sahil-ccb6707f,breath,20,5.0,txt,webapp-asquire-mox_sahil-ccb6707f_breath_20_5.txt,report_v12/asquire_data/sahil-ccb6707f/webapp-...,webapp-asquire-mox_sahil-ccb6707f_breath_20_5,11.344541,13.212519,ii-n,9,1.867978,8.0,ii-nxx,3.784475


In [176]:
anot_df["label"].unique()

array(['ss', 'cc', 'yy', 'oo', 'ii', 'xx', 'aa', 'uu', 'ee', 'zz', 'ii-n'],
      dtype=object)