# Load Pnoi Sync data

In [79]:
import pandas as pd
import os


def mkdir(p):
    """Create directory *p* unless something already exists at that path.

    Args:
        p: Path of the directory to create. Parent directories are not
            created; they must already exist.

    Returns:
        1 if the directory was created by this call, 0 if the path already
        existed.
    """
    try:
        os.mkdir(p)
    except FileExistsError:
        # Attempting the creation directly avoids the window between an
        # exists-check and the mkdir call in which another process could
        # create the path.
        return 0
    return 1


## Paths to Dataset

In [80]:
EXP_VER = "v9"  # Experiment version baked into the sync csv name below

REPORTS = "reports"  # Path to the reports folder
mkdir(REPORTS)  # Create the reports folder when it is missing

PNOI_CORPUS_CSV_NAME = "pnoicorpus_muster.csv"  # Name of the corpus csv file
PNOI_CORPUS_CSV_PATH = f"{REPORTS}/{PNOI_CORPUS_CSV_NAME}"  # Path to the corpus csv file

PNOI_SYNC_CSV_NAME = f"pnoi_sync_aud_{EXP_VER}.csv"  # Name of the sync-audio csv file
PNOI_SYNC_CSV_PATH = f"{REPORTS}/{PNOI_SYNC_CSV_NAME}"  # Path to the sync-audio csv file

# NOTE(review): EXP_VER is rebound only after the names above were built, so
# PNOI_SYNC_CSV_NAME / PNOI_SYNC_CSV_PATH still refer to the "v9" file.
# Confirm that this is intended.
EXP_VER = "vB"


In [81]:
class DataStaticInfo:
    """Namespace of static string constants shared by the data helpers."""

    # Single-character constants (wildcard and separators). Their consumers
    # are not visible in this part of the file.
    VER = "*"
    SEP = "-"
    META_SEP = "_"
    EXT_SEP = "."

    # Annotation label vocabulary.
    ANOT_LABELS = [
        "aa", "ee", "uu", "oo", "ii", "xx",
        "bb1", "bb2", "bb3", "bb4",
    ]

    # Short upper-case key -> column/field name.
    fkeys = dict(
        APP_CODE="app_code",
        SID="sub_id",
        FCLASS="file_class",
        FID="file_ID",
        COMNT="file_comment",
        FFMT="file_format",
        FNAME="file_name",
        FPATH="file_path",
        FMATCH="file_match",
    )

class AudStaticData(DataStaticInfo):
    """Audio/annotation string constants layered on ``DataStaticInfo``."""

    EMPTY_VAL = "-"
    FNAME_SEP = "-"
    ANOT_SEP = "\t"  # Tab character

    # Column-name keys.
    FS_k = "fs"
    DUR_k = "dur"
    BEGIN_k = "begin"
    END_k = "end"
    LABEL_k = "label"
    ANOTE_COLS = [BEGIN_k, END_k, LABEL_k]

    LUNG_LOCS = ["LU", "RU", "LL", "RL"]

    LBA_k = "LBA"
    VBA_k = "VBA"
    BA_k = "BA"

    # Tag prefixes and the two file-path column names.
    AUD_TAG = "aud--"
    ANOT_TAG = "anot--"
    AUDIO_FPATH_k = "audio--file_path"
    ANOT_FPATH_k = "anot--file_path"


In [82]:
class AnotFrame(AudStaticData):
    """Expand a sync table into one row per annotation segment.

    The sync CSV is read once; for every row, the annotation file referenced
    by that row is parsed and each of its segments is paired with a copy of
    the row's columns.
    """

    # Sync table exactly as read from the CSV.
    PNOI_SYNC_DF: pd.DataFrame
    # Annotation-level table produced by make_dataframe().
    PNOI_SYNC_ANOT_DF: pd.DataFrame

    def __init__(self, sync_aud_csv_path: str) -> None:
        """Load the sync CSV and build the annotation-level frame.

        Args:
            sync_aud_csv_path: Path of the sync CSV. Each row must carry the
                annotation file path under the ``ANOT_FPATH_k`` column.
        """
        self.PNOI_SYNC_DF = pd.read_csv(sync_aud_csv_path)
        self.PNOI_SYNC_ANOT_DF = self.make_dataframe()

    def extract_anots(self, row: pd.Series) -> pd.DataFrame:
        """Read one annotation file and attach the sync row to each segment.

        Args:
            row: One row of the sync table.

        Returns:
            A frame with the ``ANOTE_COLS`` columns, a ``DUR_k`` column
            (end minus begin) and every field of *row* repeated per segment.
        """
        anot_path = row[self.ANOT_FPATH_k]

        # The annotation file has no header line; column names are supplied.
        df = pd.read_csv(anot_path, sep=self.ANOT_SEP, names=self.ANOTE_COLS)
        df[self.DUR_k] = df[self.END_k] - df[self.BEGIN_k]

        # Repeat the sync row once per segment so both frames align by
        # position for the column-wise concat.
        info = pd.DataFrame([row] * len(df)).reset_index(drop=True)
        return pd.concat([df, info], axis=1)

    def make_dataframe(self) -> pd.DataFrame:
        """Stack the per-file annotation frames of every sync row.

        Returns:
            The concatenated annotation-level frame with a fresh 0..n-1
            index. When the sync table has no rows, an empty frame with the
            expected columns is returned.
        """
        anot_dfs = [
            self.extract_anots(row) for _, row in self.PNOI_SYNC_DF.iterrows()
        ]

        if not anot_dfs:
            # pd.concat raises ValueError on an empty list.
            return pd.DataFrame(
                columns=[*self.ANOTE_COLS, self.DUR_k, *self.PNOI_SYNC_DF.columns]
            )

        return pd.concat(anot_dfs, axis=0).reset_index(drop=True)

# Build the annotation-level frame from the sync csv.
pnoidata_anotframe = AnotFrame(PNOI_SYNC_CSV_PATH)

# Preview the first five annotation rows.
pnoidata_anotframe.PNOI_SYNC_ANOT_DF.head(5)

Unnamed: 0,begin,end,label,dur,fs,sub_id,file_class,audio--file_path,anot--file_path
0,0.2,1.723096,oo,1.523096,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...
1,3.722159,5.007272,ii,1.285113,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...
2,5.007272,5.959207,xx,0.951935,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...
3,5.959207,6.911141,ii,0.951934,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...
4,6.911141,8.10106,xx,1.189919,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...


In [None]:
class PnoiAnotDataset:
    """Annotation rows enriched with per-subject columns from the corpus CSV.

    Every annotation row is left-joined on ``sub_id`` with the ``col_order``
    columns of the corpus table, so annotation rows without a matching
    subject are kept with missing values.
    """

    # Private copy of the annotation-level frame passed to __init__.
    PNOI_SYNC_ANOT_DF: pd.DataFrame
    # Corpus table as read from the CSV.
    PNOI_MUSTER_DF: pd.DataFrame
    # Result of make_dataframe().
    PNOI_ANOT_DATASET_DF: pd.DataFrame

    # Corpus columns attached to each annotation row; "sub_id" is the join key.
    col_order = [
        'sub_id',

        'subjectName', 'subjectGender', 'subjectAge',
        'subjectType', 'subjectHeight', 'subjectWeight',

        'anot--META--file_path',

        'FEV1_ref_before', 'FEV1_val_before',
        'FVC_ref_before', 'FVC_val_before',
        'ratio_ref_before', 'ratio_val_before',

        'FEV1_ref_after', 'FEV1_val_after',
        'FVC_ref_after', 'FVC_val_after',
        'ratio_ref_after', 'ratio_val_after',
    ]

    def __init__(self, anot_df: pd.DataFrame, pnoi_muster_csv) -> None:
        """Load the corpus CSV and build the merged dataset frame.

        Args:
            anot_df: Annotation-level frame containing a ``sub_id`` column.
                It is copied, so the caller's frame is never modified.
            pnoi_muster_csv: Path of the corpus CSV. It must contain every
                column listed in ``col_order``.
        """
        self.PNOI_SYNC_ANOT_DF = anot_df.copy()
        # Read from the path given by the caller rather than a module global.
        self.PNOI_MUSTER_DF = pd.read_csv(pnoi_muster_csv)
        self.PNOI_ANOT_DATASET_DF = self.make_dataframe()

    def make_dataframe(self) -> pd.DataFrame:
        """Left-join the annotation rows with the selected corpus columns.

        Returns:
            A new frame with the annotation columns followed by the
            ``col_order`` columns (the join key appears once).

        Raises:
            KeyError: If the corpus table lacks a column from ``col_order``.
        """
        return self.PNOI_SYNC_ANOT_DF.merge(
            self.PNOI_MUSTER_DF[self.col_order], on="sub_id", how="left"
        )


In [92]:
# Corpus columns to attach to every annotation row; "sub_id" is the join key.
col_order = [
    "sub_id",
    "subjectName", "subjectGender", "subjectAge",
    "subjectType", "subjectHeight", "subjectWeight",
    "anot--META--file_path",
    "FEV1_ref_before", "FEV1_val_before",
    "FVC_ref_before", "FVC_val_before",
    "ratio_ref_before", "ratio_val_before",
    "FEV1_ref_after", "FEV1_val_after",
    "FVC_ref_after", "FVC_val_after",
    "ratio_ref_after", "ratio_val_after",
]

pnoi_muster = pd.read_csv(PNOI_CORPUS_CSV_PATH)

# Left join keeps every annotation row, including subjects that are absent
# from the corpus table.
pd.merge(
    pnoidata_anotframe.PNOI_SYNC_ANOT_DF,
    pnoi_muster[col_order],
    how="left",
    on="sub_id",
)

Unnamed: 0,begin,end,label,dur,fs,sub_id,file_class,audio--file_path,anot--file_path,subjectName,...,FVC_ref_before,FVC_val_before,ratio_ref_before,ratio_val_before,FEV1_ref_after,FEV1_val_after,FVC_ref_after,FVC_val_after,ratio_ref_after,ratio_val_after
0,0.200000,1.723096,oo,1.523096,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,Shreyam Gupta,...,-,3.13,-,87.0,-,-,-,-,-,-
1,3.722159,5.007272,ii,1.285113,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,Shreyam Gupta,...,-,3.13,-,87.0,-,-,-,-,-,-
2,5.007272,5.959207,xx,0.951935,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,Shreyam Gupta,...,-,3.13,-,87.0,-,-,-,-,-,-
3,5.959207,6.911141,ii,0.951934,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,Shreyam Gupta,...,-,3.13,-,87.0,-,-,-,-,-,-
4,6.911141,8.101060,xx,1.189919,16000,shreyamgupta_78aa423a,_before_LU,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,reports/pnoi_sync_data_v9/shreyamgupta_78aa423...,Shreyam Gupta,...,-,3.13,-,87.0,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946,18.554887,20.754090,ii,2.199203,16000,lokeshk_90b4871a,_before_RL,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0
1947,20.754090,22.699539,xx,1.945449,16000,lokeshk_90b4871a,_before_RL,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0
1948,22.699539,24.644988,ii,1.945449,16000,lokeshk_90b4871a,_before_RL,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0
1949,24.644988,26.252098,xx,1.607110,16000,lokeshk_90b4871a,_before_RL,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,reports/pnoi_sync_data_v9/lokeshk_90b4871a/pno...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0


In [90]:
# Report every subject whose number of sync rows is not a multiple of four.
sync_df = pnoidata_anotframe.PNOI_SYNC_DF
subjects = sync_df["sub_id"].unique()

for sub in subjects:
    sub_df = sync_df[sync_df["sub_id"] == sub]
    n_rows = len(sub_df)

    if n_rows % 4:
        print(n_rows, sub)

7 sujatan_bdd161b6


In [86]:
# Count how many annotation rows carry each label.
pnoidata_anotframe.PNOI_SYNC_ANOT_DF["label"].value_counts()


label
ii     676
xx     676
ee     136
uu     136
oo     134
aa     134
bb1     15
bb2     15
bb4     15
bb3     14
Name: count, dtype: int64

In [None]:
import librosa

# Bare attribute access: nothing is called here, so in a notebook this only
# displays the function object.
librosa.lpc