# Create dataset for Asquire experiments

Prepare asquire_corpus_muster.csv file that contains detailes of entire asquire-data

In [2]:
from tqdm.autonotebook import tqdm
import soundfile as sf
import pandas as pd
import numpy as np
import subprocess
import functools
import librosa
import shutil
import glob
import json
import os

from lib.asqlib import *

  from tqdm.autonotebook import tqdm


## STATE VARIABLES

In [3]:
IS_DRY_RUN = False
IS_DUMMY_PATH = False

## PATHS

In [4]:
DATA_PATH = "ASQUIRE_DATA"
# DATA_PATH = "report_v10/asquire_data"


REPORT_FOLDER = f"report_{EXP_VER}"; mkdir(REPORT_FOLDER)

0

### CLASS: DATA FILE EXTRACTOR -> ALL_FILES_DF

In [5]:
class DataExtractFiles(DataStaticInfo):

    ALL_FILES_DF: pd.DataFrame

    def __init__(self, path: str, ver: str="*") -> None:

        self.DATA_PATH = path

        all_files = glob.glob(f"{path}/*/{ver}")

        self.ALL_FILES_DF = self.make_files_df(all_files)

        print("all file info extracted")


    # extract info from file name
    def file_dict(self, fpath: str) -> dict:
        '''
        nomeclature: 
        appcode_subject-id_file-class_file-index_score.file-format

        ex: 
        webapp-asquire-mox_aayushmanraaz-a78a2d2f_breath_0_0.wav
        '''

        _, fname = os.path.split(fpath)

        fitems = fname.replace(self.EXT_SEP, self.SEP, 1).split(self.SEP) 

        match_key = os.path.splitext(fname)[0]

        fitems += [fname, fpath, match_key]

        fdict = dict(zip(self.fkeys.values(), fitems))

        return fdict
    
    # make files dataframe
    def make_files_df(self, all_files: list) -> pd.DataFrame:
        files = [self.file_dict(fp) for fp in all_files]
        return pd.DataFrame(files)
    
    # get file extention types
    def get_file_extention(self) -> np.ndarray:
        return self.ALL_FILES_DF[self.fkeys["FFMT"]].unique()
    
    # get all subject list
    def get_subject_list(self) -> np.ndarray:
        return self.ALL_FILES_DF[self.fkeys["SID"]].unique()
    
    def get_all_data_info(self) -> str:
        cols = self.ALL_FILES_DF.columns.to_list()
        n_cols = self.ALL_FILES_DF.shape[1]
        rows = self.ALL_FILES_DF.shape[0]


        o = subprocess.run(["du", "-hs", self.DATA_PATH], 
                           stdout=subprocess.PIPE, 
                           stderr=subprocess.PIPE, 
                           check=True, text=True)

        info = [
            f"{n_cols} columns",
            f"{cols}",
            f"{rows} files",
            f"{len(self.get_subject_list())} subjects",
            f"size: {o.stdout}",
        ]
        print("\n".join(info))
    


### CLASS: SOURCE DATA FILE - ANOTES MATCHING -> SOURCE_FILES_DF

In [6]:
class DataSourceFiles(DataExtractFiles):

    # source data file name regex pattern
    src_data_fname_regx = r"^webapp-asquire-(?!.*\.meta\.).+\.(wav|json|)$"

    # source data file format to anot format mapping
    src_anot_map = {'wav': 'txt', "json": "meta.json"}

    ALL_FILES_DF: pd.DataFrame
    SOURCE_FILES_DF: pd.DataFrame

    def __init__(self, path: str, ver: str="*") -> None:
        super().__init__(path, ver)

        # self.ALL_FILES_DF = all_file_df

        self.SOURCE_FILES_DF = self.pair_src_anot_files()

    def make_source_files_df(self) -> pd.DataFrame:
        filt = self.ALL_FILES_DF.loc[:, self.fkeys["FNAME"]].str.match(self.src_data_fname_regx)

        df = self.ALL_FILES_DF[filt]
        return df
    
    def anot_path_join(self, p: pd.Series) -> str:
        fpath = os.path.join(os.path.split(p[self.fkeys['FPATH']])[0], p[self.get_anot_tag(self.fkeys['FNAME'])])

        return fpath if os.path.exists(fpath) else False
    
    def pair_src_anot_files(self) -> pd.DataFrame:
        anot_format_mapping = { self.fkeys["FFMT"]: self.src_anot_map }

        source_files_DF = self.make_source_files_df()
    
        anot_file_format_DF = source_files_DF.replace(anot_format_mapping) # anotation file extension

        # rename file_format to anot--file-format
        anot_fmt_column = self.get_anot_tag(self.fkeys['FFMT'])
        anot_file_format_DF.rename(columns={self.fkeys["FFMT"]: anot_fmt_column}, inplace=True)
        anot_file_format_DF = anot_file_format_DF[[anot_fmt_column, self.fkeys["FMATCH"]]]
       
        
        # merge source and anotation files
        src_data_files_DF = pd.merge(source_files_DF, anot_file_format_DF, on=self.fkeys["FMATCH"], how="left")
        anot_fname_column = self.get_anot_tag(self.fkeys['FNAME'])

        src_data_files_DF[anot_fname_column] = src_data_files_DF[self.fkeys["FMATCH"]] + '.' + src_data_files_DF[anot_fmt_column]


        anot_fpath_column = self.get_anot_tag(self.fkeys['FPATH'])
        
        anote_filepath_df = src_data_files_DF.apply(self.anot_path_join, axis=1).to_frame(name=anot_fpath_column)

        src_data_files_DF = pd.concat([src_data_files_DF, anote_filepath_df], axis=1)


        return src_data_files_DF
    
    def get_src_data_info(self) -> str:
        cols = self.SOURCE_FILES_DF.columns.to_list()
        n_cols = self.SOURCE_FILES_DF.shape[1]
        rows = self.SOURCE_FILES_DF.shape[0]

        info = [
            f"{n_cols} columns",
            f"{cols}",
            f"{rows} files",
            f"{len(self.get_subject_list())} subjects",
        ]

        print("\n".join(info))

### CLASS: EXTRACT META DATA -> SOURCE_FILES_META_DF

In [7]:
class ExtractMETAData(DataSourceFiles):

    IS_DRY_RUN: bool = True

    k_BIODATA = "bio"
    k_SURVEYDATA = "survey"
    na_FILL = "-"

    FCLASS_META = "meta-data"

    k_QIDX = "qn"

    SOURCE_FILES_DF: pd.DataFrame
    SOURCE_FILES_wMETA_DF: pd.DataFrame
    
    def __init__(self, path: str, ver: str='*', is_dry_run: bool = True) -> None:
        super().__init__(path, ver)

        self.IS_DRY_RUN = is_dry_run
        self.SOURCE_FILES_wMETA_DF = self.make_meta_df()

    def read_metadata(self, finfo):
        file_path = finfo[self.fkeys["FPATH"]]

        with open(file_path, 'r') as fp:
            # print(f"./{file_path}")
            meta_data = json.load(fp)

            bio_data: dict = meta_data[self.k_BIODATA] # refer json file for structure

            bio_data[self.fkeys["SID"]] = meta_data["userId"].replace('_', '-') # make subject id sep uniform
            survey_data_a = meta_data[self.k_SURVEYDATA]

            bio_data_dict = [{ self.k_QIDX: k, "Q": f"{k}?", "A": v } for k, v in sorted(bio_data.items())]

            survey_data_dict = [{ self.k_QIDX: str(q["qno"]), "Q": q["question"], "A": q["answer"] } for q in survey_data_a]

            survey_data_a_DF = pd.DataFrame(bio_data_dict + survey_data_dict)

            survey_data_o_DF = survey_data_a_DF
            survey_data_o_DF.fillna(self.na_FILL, inplace=True)

            if not self.IS_DRY_RUN:
                nfile_path = file_path.replace(".json", f".{self.src_anot_map['json']}")
                survey_data_o_DF.set_index(self.k_QIDX).to_json(nfile_path, orient='index')
                # print(f"exported: ./{nfile_path}")

            
            return (bio_data, survey_data_o_DF)
        
    def make_meta_df(self) -> pd.DataFrame:
        filt = self.SOURCE_FILES_DF[self.fkeys["FCLASS"]] == self.FCLASS_META
        src_META_data_DF = self.SOURCE_FILES_DF[filt]

        all_META_data_DF = pd.DataFrame([self.read_metadata(row)[0] for _, row in src_META_data_DF.iterrows()])

        print("all meta data exported")

        src_data_files_DF = self.SOURCE_FILES_DF.copy()
        src_data_files_DF = src_data_files_DF.merge(all_META_data_DF, how="left", on=[self.fkeys["SID"]])

        return src_data_files_DF

    def get_meta_data_info(self) -> str:
        cols = self.SOURCE_FILES_wMETA_DF.columns.to_list()
        n_cols = self.SOURCE_FILES_wMETA_DF.shape[1]
        rows = self.SOURCE_FILES_wMETA_DF.shape[0]

        info = [
            f"{n_cols} columns",
            f"{cols}",
            f"{rows} files",
            f"{len(self.get_subject_list())} subjects",
        ]

        print("\n".join(info))


### CLASS: EXTRACT ALL ANOT LABELS -> ALL_AUD_ANOTE_LABEL_DF

In [8]:
class AnotDF(ExtractMETAData):

    ALL_AUD_ANOTE_LABEL_DF: pd.DataFrame
    ALL_AUD_ANOTE_LABEL_CSV_FNAME = f"all_asq_aud_anotes"

    def __init__(self, path: str, ver: str='*', is_dry_run: bool = True) -> None:
        super().__init__(path, ver, is_dry_run)

    def filter_file_df(self, keys: list) -> pd.DataFrame:

        df = self.SOURCE_FILES_wMETA_DF.copy()
        filts = [df[self.fkeys['FNAME']].str.contains(k) for k in keys]
        filt = functools.reduce(lambda p, c: p & c, filts)

        return df.loc[filt]
    
    def read_aud_anot(self, finfo: pd.Series, pbar: tqdm):
        # test
        # print(finfo[f"anot--{self.fkeys['FPATH']}"])

        pbar.set_description("processing: ")

        df = pd.read_csv(finfo[self.get_anot_tag(self.fkeys['FPATH'])], 
                         names=self.ANOT_HEADER,
                         sep="\t", 
                         )
        
        df["line_number"] = range(1, len(df) + 1)
        df["dur"] = df["end"] - df["start"]

        anot_df = pd.concat([finfo.to_frame().T]*len(df), ignore_index=True, axis=0).join(df)


        pbar.update(1)

        return anot_df

    def get_aud_anot_df(self):

        df = self.SOURCE_FILES_wMETA_DF

        filt = df[self.fkeys['FFMT']] == "wav"
        filt &= ~(df[self.get_anot_tag(self.fkeys['FPATH'])] == False)

        audio_anot_file_df = df.loc[filt].copy()

        with tqdm(total=len(audio_anot_file_df), position=0, leave=True) as pbar:
            aud_anot = [
                self.read_aud_anot(af, pbar)
                for _, af in tqdm(audio_anot_file_df.iterrows())
            ]

        audio_anot_df = pd.concat(aud_anot).copy()


        export_path = os.path.join(REPORT_FOLDER, 
                                   f"{self.ALL_AUD_ANOTE_LABEL_CSV_FNAME}.csv")
        
        audio_anot_df.to_csv(export_path, index=False)

        self.ALL_AUD_ANOTE_LABEL_DF = audio_anot_df

        return audio_anot_df
    
    def get_anot_label_info(self) -> str:
        if self.ALL_AUD_ANOTE_LABEL_DF is None:
            self.get_aud_anot_df()
        
        cols = self.ALL_AUD_ANOTE_LABEL_DF.columns.to_list()
        labels = self.ALL_AUD_ANOTE_LABEL_DF["label"].unique().tolist()
        n_cols = self.ALL_AUD_ANOTE_LABEL_DF.shape[1]
        rows = self.ALL_AUD_ANOTE_LABEL_DF.shape[0]

        info = [
            f"{n_cols} columns",
            f"{cols}",
            f"{len(labels)} labels, {labels}",
            f"{rows} bounds",
            f"{len(self.get_subject_list())} subjects",
        ]

        print("\n".join(info))


### CLASS: FINAL DATA PROCESS -> SOURCE_FILES_META_DF

In [9]:
class FinalDataProcess(ExtractMETAData):
    merge_cols = [
    'sub_id',
    'age', 'weight', 
    'height', 'gender',
    ]

    drop_cols = ["file_format", "file_name", "file_class", "file_xindex", "file_match"]

    col_order = [
    'sub_id', 'age', 'gender', 'height', 'weight',
    
    'meta-data--file_path', 'anot--meta-data--file_path',
    
    'breath--file_path', 'anot--breath--file_path',
    
    'cough--file_path', 'anot--cough--file_path',
    
    'aa--file_path', 'anot--aa--file_path',
    
    'ee--file_path', 'anot--ee--file_path',
    
    'oo--file_path', 'anot--oo--file_path',
    
    'ss--file_path', 'anot--ss--file_path',
    
    'uu--file_path', 'anot--uu--file_path',
    
    'yee--file_path', 'anot--yee--file_path',
    
    'zz--file_path', 'anot--zz--file_path',
    
    'sent--file_path', 'anot--sent--file_path',
    
    '~noise~--file_path', 'anot--~noise~--file_path']

    MASTER_DATA_DF: pd.DataFrame
    MASTER_DATA_EXPORT_FNAME: str  = "asquire_corpus_muster"

    def __init__(self, path: str, ver: str='*', is_dry_run: bool = True) -> None:
        
        super().__init__(path, ver, is_dry_run)


        print("dataset export vesion: ", EXP_VER)

        self.MASTER_DATA_DF = self.merge_fclass_dfs()

        n_data_path = f"{REPORT_FOLDER}/{self.AUDIO_EXPORT_FOLDER}"
        asqdata_anote_labels = AnotDF(n_data_path, ver="*", is_dry_run=True)
        asqdata_anote_labels.get_aud_anot_df()

    def merge_fclass_dfs(self):

        main_df = self.SOURCE_FILES_wMETA_DF.copy()
        main_df.drop(columns=[self.fkeys['APP_CODE']], inplace=True)

        fclass_DFs = self.seperate_fclass(main_df)

        df_merged = functools.reduce(lambda  l, r: pd.merge(l, r, on=self.merge_cols, how='outer'), fclass_DFs)

        df_cols = list(df_merged.columns.delete(df_merged.columns.str.contains("|".join(self.drop_cols))))

        df_merged = df_merged[df_cols]

        df_merged = df_merged.fillna(False).replace(False, "-")

        df_merged.sort_values(["gender", "age"], inplace=True)

        rm_cols = df_merged.columns.str.contains("|".join(["score"]))
        df_merged = df_merged.drop(columns=df_merged.columns[rm_cols])

        df_merged.reset_index(inplace=True, drop=True)
        df_merged.index.name = "index"

        df_merged = df_merged[self.col_order]

        export_path = os.path.join(REPORT_FOLDER, 
                                   f"{self.MASTER_DATA_EXPORT_FNAME}.csv")
        df_merged.to_csv(export_path, index=True)

        return df_merged
    
    def seperate_fclass(self, df):

        fclass_DFs = []
        fclasses = df[self.fkeys["FCLASS"]].unique()
        fclasses = sorted(fclasses)
        # fclasses.remove('meta-data')

        print(fclasses)
        for fclass in fclasses:

            filt = df[self.fkeys["FCLASS"]] == fclass
            fclass_df = df[filt]

            if len(fclass_df) > 0:
                print("processing: ", fclass,  end=" ")
                fclass_df =  self.merge_audio_files(fclass_df)
                print("done")
            
            
            fclass_df.columns = fclass_df.columns.str.replace("file", f"{fclass}--file")
            fclass_df.columns = fclass_df.columns.str.replace("score", f"{fclass}--score")

            fclass_DFs.append(fclass_df)

            #test
            # break
        
        self.FCLASS_DFs = fclass_DFs

        return fclass_DFs
    
    FCLASS_DFs: []

    FS = 44100
    AUDIO_EXPORT_FOLDER = f"asquire_data"

    def merge_audio_files(self, fclass_df: pd.DataFrame):

        subjects = fclass_df[self.fkeys["SID"]].unique()

        merged_fclass_dfs = []
        
        with tqdm(total=len(subjects), position=0, leave=True) as pbar:


            for sub_id in subjects:

                # subject export folder
                export_folder = os.path.join(REPORT_FOLDER, 
                                             f"{self.AUDIO_EXPORT_FOLDER}", sub_id)
                mkdir(export_folder)

                filt = fclass_df[self.fkeys["SID"]] == sub_id
                sub_df: pd.DataFrame = fclass_df[filt]
                
                # merge multiple audio files
                fmt = set(sub_df[self.fkeys["FFMT"]].unique())
                n_sub_rv = sub_df.iloc[-1]

                
            
                if ("wav" in fmt) and (len(fmt) == 1):

                    # rename with last filename
                    sub_df = sub_df.sort_values(self.fkeys["FCIDX"])
                    fname = os.path.splitext(sub_df.iloc[-1][self.fkeys["FNAME"]])[0]
                    export_fpath = f"{export_folder}/{fname}"


                    n_sub_rv[self.fkeys['FPATH']] = '-'
                    n_sub_rv[self.get_anot_tag(self.fkeys['FPATH'])] = '-'

                    if not self.IS_DRY_RUN:

                        extended_aud_signal, extend_anot_dfs = self.extend_audio_files(sub_df)


                        if len(extended_aud_signal) > 0:
                            extended_aud_signal = (np.nan_to_num(extended_aud_signal))

                            extended_aud_signal = librosa.util.normalize(extended_aud_signal)
                            sf.write(f"{export_fpath}.wav", extended_aud_signal, self.FS)

                            n_sub_rv[self.fkeys['FPATH']] = f"{export_fpath}.wav"


                        if len(extend_anot_dfs) > 0:
                            extended_anot_df = pd.concat(extend_anot_dfs)
                            extended_anot_df.to_csv(f"{export_fpath}.txt", index=False, header=False, sep="\t")

                            n_sub_rv[self.get_anot_tag(self.fkeys['FPATH'])] = f"{export_fpath}.txt"


                else: # copy non-audio files into the new folder
                    
                    if not self.IS_DRY_RUN:
                        src_fpath = sub_df.iloc[0][self.get_anot_tag(self.fkeys['FPATH'])]

                        fname = sub_df.iloc[0][self.get_anot_tag(self.fkeys['FNAME'])]
                        export_fpath = os.path.join(export_folder, fname)
                        
                        shutil.copy(src_fpath, export_fpath)
                        n_sub_rv[self.get_anot_tag(self.fkeys['FPATH'])] = f"{export_fpath}"

                        src_fpath = sub_df.iloc[0][self.fkeys['FPATH']]

                        fname = sub_df.iloc[0][self.fkeys['FNAME']]
                        export_fpath = os.path.join(export_folder, fname)

                        shutil.copy(src_fpath, export_fpath)
                        n_sub_rv[self.fkeys['FPATH']] = f"{export_fpath}"


                
                merged_fclass_dfs.append(n_sub_rv)
                
                pbar.update(1)
        
        merged_fclass_df = pd.DataFrame(merged_fclass_dfs) # if len(merged_fclass_dfs) > 0 else fclass_df

        
        return merged_fclass_df

    def extend_audio_files(self, sub_df: pd.DataFrame):
        extend_duration = 0
        extend_anot_dfs = []
        extended_aud_signal = np.ndarray([])

        # if len(sub_df) > 2: print(sub_df["sub_id"])

        for _, file in sub_df.iterrows():

            # EXTEND AUDIO SIGNAL
            data_fpath = file[self.fkeys["FPATH"]]
            if os.path.exists(f"{data_fpath}"):
                signal = librosa.load(data_fpath, sr=self.FS)[0]
                extended_aud_signal = np.append(extended_aud_signal, signal)

            # EXTEND ANNOTATION TIME STAMPS
            anot_fpath = file[self.get_anot_tag(self.fkeys['FPATH'])]
            if os.path.exists(f"{anot_fpath}"):

                # UPDATE ANNOTATION TIME STAMPS
                anot_df = pd.read_csv(anot_fpath, 
                                    names=self.ANOT_HEADER, 
                                    sep="\t", header=None)

                anot_df["start"] +=  extend_duration
                anot_df["end"] += extend_duration

                # UPDATE DURATION AFTER EXTENDING SIGNAL
                extend_duration += (signal.shape[0] / self.FS)

                extend_anot_dfs.append(anot_df)

        extended_aud_signal = (np.nan_to_num(extended_aud_signal))
        extended_aud_signal = librosa.util.normalize(extended_aud_signal)

        return extended_aud_signal, extend_anot_dfs

    def get_muster_info(self) -> str:
        cols = self.MASTER_DATA_DF.columns.to_list()
        n_cols = self.MASTER_DATA_DF.shape[1]
        rows = self.MASTER_DATA_DF.shape[0]

        info = [
            f"{n_cols} columns",
            f"{cols}",
            f"{rows} data points",
            f"{len(self.get_subject_list())} subjects",
        ]

        print("\n".join(info))

In [10]:
asqdata_muster_file = FinalDataProcess(DATA_PATH, ver="*", is_dry_run=False)
asqdata_muster_file.MASTER_DATA_DF

all file info extracted
all meta data exported
dataset export vesion:  v12
['aa', 'breath', 'cough', 'ee', 'meta-data', 'oo', 'sent', 'ss', 'uu', 'yee', 'zz', '~noise~']
processing:  aa 

  0%|          | 0/186 [00:00<?, ?it/s]

done
processing:  breath 

  0%|          | 0/193 [00:00<?, ?it/s]

done
processing:  cough 

  0%|          | 0/215 [00:00<?, ?it/s]

done
processing:  ee 

  0%|          | 0/185 [00:00<?, ?it/s]

done
processing:  meta-data 

  0%|          | 0/203 [00:00<?, ?it/s]

done
processing:  oo 

  0%|          | 0/176 [00:00<?, ?it/s]

done
processing:  sent 

  0%|          | 0/28 [00:00<?, ?it/s]

done
processing:  ss 

  0%|          | 0/167 [00:00<?, ?it/s]

done
processing:  uu 

  0%|          | 0/174 [00:00<?, ?it/s]

done
processing:  yee 

  0%|          | 0/168 [00:00<?, ?it/s]

done
processing:  zz 

  0%|          | 0/162 [00:00<?, ?it/s]

done
processing:  ~noise~ 

  0%|          | 0/27 [00:00<?, ?it/s]

done
all file info extracted
all meta data exported


  0%|          | 0/1466 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Unnamed: 0_level_0,sub_id,age,gender,height,weight,meta-data--file_path,anot--meta-data--file_path,breath--file_path,anot--breath--file_path,cough--file_path,...,uu--file_path,anot--uu--file_path,yee--file_path,anot--yee--file_path,zz--file_path,anot--zz--file_path,sent--file_path,anot--sent--file_path,~noise~--file_path,anot--~noise~--file_path
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,qwer-0177a438,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,bugsbunny-13f848d8,-,-,-,-,-,-,report_v12/asquire_data/bugsbunny-13f848d8/web...,-,report_v12/asquire_data/bugsbunny-13f848d8/web...,...,report_v12/asquire_data/bugsbunny-13f848d8/web...,-,report_v12/asquire_data/bugsbunny-13f848d8/web...,-,report_v12/asquire_data/bugsbunny-13f848d8/web...,-,report_v12/asquire_data/bugsbunny-13f848d8/web...,-,report_v12/asquire_data/bugsbunny-13f848d8/web...,-
2,doll-808ed97a,-,-,-,-,-,-,report_v12/asquire_data/doll-808ed97a/webapp-a...,-,report_v12/asquire_data/doll-808ed97a/webapp-a...,...,report_v12/asquire_data/doll-808ed97a/webapp-a...,-,report_v12/asquire_data/doll-808ed97a/webapp-a...,-,report_v12/asquire_data/doll-808ed97a/webapp-a...,-,report_v12/asquire_data/doll-808ed97a/webapp-a...,-,report_v12/asquire_data/doll-808ed97a/webapp-a...,-
3,varun-8f78ac35,-,-,-,-,-,-,report_v12/asquire_data/varun-8f78ac35/webapp-...,-,report_v12/asquire_data/varun-8f78ac35/webapp-...,...,report_v12/asquire_data/varun-8f78ac35/webapp-...,-,report_v12/asquire_data/varun-8f78ac35/webapp-...,-,report_v12/asquire_data/varun-8f78ac35/webapp-...,-,report_v12/asquire_data/varun-8f78ac35/webapp-...,-,report_v12/asquire_data/varun-8f78ac35/webapp-...,-
4,murugeswari-d700dff3,-,-,-,-,-,-,report_v12/asquire_data/murugeswari-d700dff3/w...,-,report_v12/asquire_data/murugeswari-d700dff3/w...,...,report_v12/asquire_data/murugeswari-d700dff3/w...,-,report_v12/asquire_data/murugeswari-d700dff3/w...,-,report_v12/asquire_data/murugeswari-d700dff3/w...,-,report_v12/asquire_data/murugeswari-d700dff3/w...,-,report_v12/asquire_data/murugeswari-d700dff3/w...,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,harisk-6f82d610,58,m,175,88,report_v12/asquire_data/harisk-6f82d610/webapp...,report_v12/asquire_data/harisk-6f82d610/webapp...,report_v12/asquire_data/harisk-6f82d610/webapp...,report_v12/asquire_data/harisk-6f82d610/webapp...,report_v12/asquire_data/harisk-6f82d610/webapp...,...,report_v12/asquire_data/harisk-6f82d610/webapp...,report_v12/asquire_data/harisk-6f82d610/webapp...,-,-,report_v12/asquire_data/harisk-6f82d610/webapp...,report_v12/asquire_data/harisk-6f82d610/webapp...,-,-,-,-
228,arhaam-350fc3e6,6,m,114,30,report_v12/asquire_data/arhaam-350fc3e6/webapp...,report_v12/asquire_data/arhaam-350fc3e6/webapp...,report_v12/asquire_data/arhaam-350fc3e6/webapp...,report_v12/asquire_data/arhaam-350fc3e6/webapp...,report_v12/asquire_data/arhaam-350fc3e6/webapp...,...,report_v12/asquire_data/arhaam-350fc3e6/webapp...,report_v12/asquire_data/arhaam-350fc3e6/webapp...,-,-,-,-,-,-,-,-
229,ravisk-cd186836,60,m,177,83,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,report_v12/asquire_data/ravisk-cd186836/webapp...,-,-,-,-
230,srinivas-b89febe6,60,m,172,85,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,report_v12/asquire_data/srinivas-b89febe6/weba...,-,-,-,-


In [11]:
asqdata_muster_file.get_all_data_info()

9 columns
['app_code', 'sub_id', 'file_class', 'file_xindex', 'score', 'file_format', 'file_name', 'file_path', 'file_match']
4324 files
232 subjects
size: 0	ASQUIRE_DATA



In [12]:
asqdata_muster_file.get_muster_info()

29 columns
['sub_id', 'age', 'gender', 'height', 'weight', 'meta-data--file_path', 'anot--meta-data--file_path', 'breath--file_path', 'anot--breath--file_path', 'cough--file_path', 'anot--cough--file_path', 'aa--file_path', 'anot--aa--file_path', 'ee--file_path', 'anot--ee--file_path', 'oo--file_path', 'anot--oo--file_path', 'ss--file_path', 'anot--ss--file_path', 'uu--file_path', 'anot--uu--file_path', 'yee--file_path', 'anot--yee--file_path', 'zz--file_path', 'anot--zz--file_path', 'sent--file_path', 'anot--sent--file_path', '~noise~--file_path', 'anot--~noise~--file_path']
232 data points
232 subjects


In [13]:
label = "ii|xx"
filt = asqdata_muster_file.ALL_AUD_ANOTE_LABEL_DF["label"].str.contains(label)
asqdata_muster_file.ALL_AUD_ANOTE_LABEL_DF[filt]["dur"].describe()

AttributeError: 'FinalDataProcess' object has no attribute 'ALL_AUD_ANOTE_LABEL_DF'

In [None]:
asqdata_muster_file.ALL_AUD_ANOTE_LABEL_DF["label"].value_counts()

label
cc      1278
xx       771
ii       711
ee       558
aa       537
oo       501
uu       452
zz       429
ss       412
yy       400
ii-n      64
Name: count, dtype: int64

In [None]:
filt = asqdata_muster_file.MASTER_DATA_DF.columns.str.contains("file_path")
# filt &= ~asqdata_muster_file.MASTER_DATA_DF.columns.str.contains("anot--")
c = asqdata_muster_file.MASTER_DATA_DF.columns[filt].str.replace("--file_path", "")
[print(c, end=', ') for c in c]
print()

meta-data, anot--meta-data, breath, anot--breath, cough, anot--cough, aa, anot--aa, ee, anot--ee, oo, anot--oo, ss, anot--ss, uu, anot--uu, yee, anot--yee, zz, anot--zz, sent, anot--sent, ~noise~, anot--~noise~, 


In [None]:
c = asqdata_muster_file.ALL_AUD_ANOTE_LABEL_DF['label'].unique().tolist()
[print(c, end=', ') for c in c]
print()

AttributeError: 'FinalDataProcess' object has no attribute 'ALL_AUD_ANOTE_LABEL_DF'

## TEST

In [None]:
asqdata_all_files = DataExtractFiles(DATA_PATH, ver="*")
asqdata_all_files.get_all_data_info()

asqdata_all_files.ALL_FILES_DF.head(3)

all file info extracted
9 columns
['app_code', 'sub_id', 'file_class', 'file_xindex', 'score', 'file_format', 'file_name', 'file_path', 'file_match']
3553 files
232 subjects
size: 4.3G	report_v10/asquire_data



Unnamed: 0,app_code,sub_id,file_class,file_xindex,score,file_format,file_name,file_path,file_match
0,webapp-asquire-mox,bhargavee-70c0073e,oo,8,4,wav,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.wav,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4
1,webapp-asquire-mox,bhargavee-70c0073e,ss,11,2,txt,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2.txt,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_ss_11_2
2,webapp-asquire-mox,bhargavee-70c0073e,cough,10,0,txt,webapp-asquire-mox_bhargavee-70c0073e_cough_10...,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_cough_10_0


In [None]:
asqdata_source_files = DataSourceFiles(DATA_PATH, ver="*")
asqdata_source_files.get_src_data_info()

asqdata_source_files.SOURCE_FILES_DF.head(3)


all file info extracted
12 columns
['app_code', 'sub_id', 'file_class', 'file_xindex', 'score', 'file_format', 'file_name', 'file_path', 'file_match', 'anot--file_format', 'anot--file_name', 'anot--file_path']
1884 files
232 subjects


Unnamed: 0,app_code,sub_id,file_class,file_xindex,score,file_format,file_name,file_path,file_match,anot--file_format,anot--file_name,anot--file_path
0,webapp-asquire-mox,bhargavee-70c0073e,oo,8,4,wav,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.wav,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4,txt,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.txt,report_v10/asquire_data/bhargavee-70c0073e/web...
1,webapp-asquire-mox,bhargavee-70c0073e,yee,9,3,wav,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3.wav,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3,txt,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3.txt,report_v10/asquire_data/bhargavee-70c0073e/web...
2,webapp-asquire-mox,bhargavee-70c0073e,cough,10,0,wav,webapp-asquire-mox_bhargavee-70c0073e_cough_10...,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_cough_10_0,txt,webapp-asquire-mox_bhargavee-70c0073e_cough_10...,report_v10/asquire_data/bhargavee-70c0073e/web...


In [None]:
asqdata_meta_files = ExtractMETAData(DATA_PATH, ver="*", is_dry_run=True)
asqdata_meta_files.get_meta_data_info()

asqdata_meta_files.SOURCE_FILES_wMETA_DF.head(3)

all file info extracted
all meta data exported
16 columns
['app_code', 'sub_id', 'file_class', 'file_xindex', 'score', 'file_format', 'file_name', 'file_path', 'file_match', 'anot--file_format', 'anot--file_name', 'anot--file_path', 'age', 'gender', 'height', 'weight']
1884 files
232 subjects


Unnamed: 0,app_code,sub_id,file_class,file_xindex,score,file_format,file_name,file_path,file_match,anot--file_format,anot--file_name,anot--file_path,age,gender,height,weight
0,webapp-asquire-mox,bhargavee-70c0073e,oo,8,4,wav,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.wav,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4,txt,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.txt,report_v10/asquire_data/bhargavee-70c0073e/web...,21,f,168,48
1,webapp-asquire-mox,bhargavee-70c0073e,yee,9,3,wav,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3.wav,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3,txt,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3.txt,report_v10/asquire_data/bhargavee-70c0073e/web...,21,f,168,48
2,webapp-asquire-mox,bhargavee-70c0073e,cough,10,0,wav,webapp-asquire-mox_bhargavee-70c0073e_cough_10...,report_v10/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_cough_10_0,txt,webapp-asquire-mox_bhargavee-70c0073e_cough_10...,report_v10/asquire_data/bhargavee-70c0073e/web...,21,f,168,48


In [None]:
asqdata_anote_labels = AnotDF('report_v11/asquire_data', ver="*", is_dry_run=True)


all file info extracted
all meta data exported


In [None]:
asqdata_anote_labels.get_aud_anot_df()

1466it [00:06, 238.18it/s]██▉| 1465/1466 [00:06<00:00, 225.79it/s]
processing: : 100%|██████████| 1466/1466 [00:06<00:00, 238.06it/s]


Unnamed: 0,app_code,sub_id,file_class,file_xindex,score,file_format,file_name,file_path,file_match,anot--file_format,...,anot--file_path,age,gender,height,weight,start,end,label,line_number,dur
0,webapp-asquire-mox,bhargavee-70c0073e,oo,8,4,wav,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.wav,report_v11/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4,txt,...,report_v11/asquire_data/bhargavee-70c0073e/web...,21,f,168,48,1.526465,5.969838,oo,1,4.443373
1,webapp-asquire-mox,bhargavee-70c0073e,oo,8,4,wav,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.wav,report_v11/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4,txt,...,report_v11/asquire_data/bhargavee-70c0073e/web...,21,f,168,48,6.861535,11.486270,oo,2,4.624735
2,webapp-asquire-mox,bhargavee-70c0073e,oo,8,4,wav,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4.wav,report_v11/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_oo_8_4,txt,...,report_v11/asquire_data/bhargavee-70c0073e/web...,21,f,168,48,12.529103,16.020324,oo,3,3.491221
0,webapp-asquire-mox,bhargavee-70c0073e,yee,9,3,wav,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3.wav,report_v11/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3,txt,...,report_v11/asquire_data/bhargavee-70c0073e/web...,21,f,168,48,0.804811,7.611211,yy,1,6.806400
1,webapp-asquire-mox,bhargavee-70c0073e,yee,9,3,wav,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3.wav,report_v11/asquire_data/bhargavee-70c0073e/web...,webapp-asquire-mox_bhargavee-70c0073e_yee_9_3,txt,...,report_v11/asquire_data/bhargavee-70c0073e/web...,21,f,168,48,8.576984,13.428843,yy,2,4.851859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,webapp-asquire-mox,aditisatvika-dd09b820,ss,7,9,wav,webapp-asquire-mox_aditisatvika-dd09b820_ss_7_...,report_v11/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_ss_7_9,txt,...,report_v11/asquire_data/aditisatvika-dd09b820/...,16,f,170,40,14.950703,26.865730,ss,2,11.915027
2,webapp-asquire-mox,aditisatvika-dd09b820,ss,7,9,wav,webapp-asquire-mox_aditisatvika-dd09b820_ss_7_...,report_v11/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_ss_7_9,txt,...,report_v11/asquire_data/aditisatvika-dd09b820/...,16,f,170,40,29.142486,41.436973,ss,3,12.294487
0,webapp-asquire-mox,aditisatvika-dd09b820,ee,3,9,wav,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_...,report_v11/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_9,txt,...,report_v11/asquire_data/aditisatvika-dd09b820/...,16,f,170,40,2.262486,17.569622,ee,1,15.307136
1,webapp-asquire-mox,aditisatvika-dd09b820,ee,3,9,wav,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_...,report_v11/asquire_data/aditisatvika-dd09b820/...,webapp-asquire-mox_aditisatvika-dd09b820_ee_3_9,txt,...,report_v11/asquire_data/aditisatvika-dd09b820/...,16,f,170,40,19.266486,29.129514,ee,2,9.863028
