# Load Pnoi Sync data

In [2]:
import pandas as pd
import os
import numpy as np


def mkdir(p):
    """Create directory ``p`` (including missing parent folders) if it is absent.

    Args:
        p: Path of the directory to create.

    Returns:
        0 if ``p`` already existed, 1 if it was created by this call.
    """
    if os.path.exists(p):
        return 0
    # makedirs (unlike os.mkdir) also creates missing parents; exist_ok avoids
    # a FileExistsError if the folder appears between the check and the call.
    os.makedirs(p, exist_ok=True)
    return 1

### STATE VARIABLES

In [18]:
IS_TEST = True  # when True, the `test() if IS_TEST else None` lines below run test() and display its frame

## Paths to Dataset
> Paths are valid for Linux-based operating systems; adjust them if running on Windows.

In [10]:
# Version number shared by the ETL and experiment report folders.
EXP_VER = 15
REPORTS_DIR = "0_pnoi-reports"

# --- PNOI ETL: location of the muster CSV ---
DATAETL = f"dataetl_v{EXP_VER}"
ETL_REPORT_FOLDER = f"../{REPORTS_DIR}/{DATAETL}"
PNOI_CORPUS_CSV_NAME = "pnoicorpus_muster.csv"  # name of the pnoi muster csv file
MUSTER_CSV_IMPORT_PATH = f"{ETL_REPORT_FOLDER}/{PNOI_CORPUS_CSV_NAME}"

# --- PNOI EXP: experiment report folder and the sync CSV inside it ---
EXPERIMENT = f"experiment_v{EXP_VER}"
REPORT_FOLDER = f"../{REPORTS_DIR}/{EXPERIMENT}"
mkdir(REPORT_FOLDER)  # create the experiment report folder when it is absent
PNOI_BASYNC_CSV_NAME = "pnoi_sync_aud.csv"  # name of the sync BA csv file with metadata
PNOI_BASYNC_CSV_IMPORT_PATH = f"{REPORT_FOLDER}/{PNOI_BASYNC_CSV_NAME}"

# Bare last expression so the notebook displays the resolved path.
PNOI_BASYNC_CSV_IMPORT_PATH

'../0_pnoi-reports/experiment_v15/pnoi_sync_aud.csv'

### Data Static Variables
- List of name separators used in file nomenclature
- List of annotation labels
- List of file dataframe keys mapping in order

### Audio Static Variables
- File name separators
- Annotation headers
- Recording locations
- Dataframe keys mapping

In [9]:
class DataStaticInfo:
    """Static constants shared by the data classes in this notebook.

    Groups the separator characters, the list of annotation labels, and the
    ordered mapping from short field codes to dataframe column names.
    """

    # Single-character constants.
    VER = "*"
    SEP = "-"
    META_SEP = "_"
    EXT_SEP = "."

    # Annotation labels.
    ANOT_LABELS = [
        "aa", "ee", "uu", "oo", "ii",
        "xx",
        "bb1", "bb2", "bb3", "bb4",
    ]

    # Short field code -> dataframe column name (insertion order preserved).
    fkeys = dict(
        APP_CODE="app_code",
        SID="sub_id",
        FCLASS="file_class",
        FID="file_ID",
        COMNT="file_comment",
        FFMT="file_format",
        FNAME="file_name",
        FPATH="file_path",
        FMATCH="file_match",
    )

class AudStaticData(DataStaticInfo):
    """Static constants for audio files and their annotation files."""

    EMPTY_VAL = "-"
    FNAME_SEP = "-"
    ANOT_SEP = "\t"  # used as the field separator when annotation files are read

    # Dataframe column keys.
    FS_k = "fs"
    DUR_k = "dur"
    BEGIN_k = "begin"
    END_k = "end"
    LABEL_k = "label"
    # Column names assigned to annotation files, in file order.
    ANOTE_COLS = [BEGIN_k, END_k, LABEL_k]

    # Recording locations.
    LUNG_LOCS = ["LU", "RU", "LL", "RL"]

    LBA_k = "LBA"
    VBA_k = "VBA"
    BA_k = "BA"

    # NOTE(review): AUD_TAG is "aud--" while AUDIO_FPATH_k starts with "audio--";
    # confirm against the sync CSV header which prefix is intended.
    AUD_TAG = "aud--"
    ANOT_TAG = "anot--"
    AUDIO_FPATH_k = "audio--file_path"
    ANOT_FPATH_k = "anot--file_path"


## Annotation Frames Dataframe
Extract all annotation files and create a dataframe with all the annotation labels.

- input:

`PNOI_BASYNC_CSV_IMPORT_PATH`: path to the sync audio csv file (`pnoi_sync_aud.csv`) with metadata

- output:

`PNOI_SYNC_ANOT_DF`: dataframe with one row per annotation (`begin`, `end`, `label`, `dur`) for all subjects with no missing files

In [19]:
class AnotFrame(AudStaticData):
    """Expand the sync audio table into one row per annotation.

    The sync CSV is read into ``PNOI_SYNC_DF``; for each of its rows the
    annotation file referenced by the ``ANOT_FPATH_k`` column is loaded and the
    row's own fields are repeated next to every annotation.
    """

    # Table read from ``sync_aud_csv_path``.
    PNOI_SYNC_DF: pd.DataFrame
    # Annotations of every row of PNOI_SYNC_DF, stacked with a fresh 0..n-1 index.
    PNOI_SYNC_ANOT_DF: pd.DataFrame

    def __init__(self, sync_aud_csv_path: str) -> None:
        """Read the sync CSV and build the annotation dataframe from it.

        Args:
            sync_aud_csv_path: Path of the sync audio CSV file.
        """
        self.PNOI_SYNC_DF = pd.read_csv(sync_aud_csv_path)
        self.PNOI_SYNC_ANOT_DF = self.make_anot_dataframe(self.PNOI_SYNC_DF)

    def extract_anots(self, row: pd.Series) -> pd.DataFrame:
        """Load the annotation file of one sync row.

        Args:
            row: One row of the sync table; must contain the ``ANOT_FPATH_k`` key.

        Returns:
            A dataframe with the ``ANOTE_COLS`` columns, a ``DUR_k`` column
            (end minus begin), followed by the fields of ``row`` repeated on
            every annotation line.
        """
        anot_path = row[self.ANOT_FPATH_k]

        anots = pd.read_csv(anot_path, sep=self.ANOT_SEP, names=self.ANOTE_COLS)
        anots[self.DUR_k] = anots[self.END_k] - anots[self.BEGIN_k]

        # Passing columns= keeps the metadata columns in the result even when
        # the annotation frame has zero rows (an empty list carries no columns).
        info = pd.DataFrame([row] * len(anots), columns=row.index).reset_index(drop=True)
        return pd.concat([anots, info], axis=1)

    def make_anot_dataframe(self, df) -> pd.DataFrame:
        """Stack the annotations of every row of ``df`` into one dataframe.

        Args:
            df: Sync table; each row is passed to ``extract_anots``.

        Returns:
            The concatenated annotation frames with a fresh 0..n-1 index. For
            an empty ``df`` an empty frame with the expected columns is returned.
        """
        anot_dfs = [self.extract_anots(row) for _, row in df.iterrows()]

        if not anot_dfs:
            # pd.concat raises ValueError when given no objects; return an
            # empty frame with the same column layout instead.
            return pd.DataFrame(columns=[*self.ANOTE_COLS, self.DUR_k, *df.columns])

        return pd.concat(anot_dfs, axis=0).reset_index(drop=True)
    
def test():
    """Build an AnotFrame from the sync CSV and return its annotation dataframe."""
    return AnotFrame(PNOI_BASYNC_CSV_IMPORT_PATH).PNOI_SYNC_ANOT_DF


# Conditional expression (not an `if` statement) so the frame is the cell's displayed value.
test() if IS_TEST else None

Unnamed: 0,begin,end,label,dur,fs,sub_id,file_class,audio--file_path,anot--file_path
0,0.200000,21.961447,bb1,21.761447,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
1,0.200000,4.608721,oo,4.408721,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
2,4.608721,6.301670,ii,1.692949,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
3,6.301670,7.606652,xx,1.304982,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
4,7.606652,9.052712,ii,1.446060,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
...,...,...,...,...,...,...,...,...,...
2661,18.554887,20.754090,ii,2.199203,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
2662,20.754090,22.699539,xx,1.945449,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
2663,22.699539,24.644988,ii,1.945449,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...
2664,24.644988,26.252098,xx,1.607110,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...


## Pnoi Dataset
Dataframe with audio files, annotation files, pft values, and metadata.

- input:

`PNOI_SYNC_ANOT_DF`: dataframe of all subjects with no missing files

- output:

`PNOI_ANOT_DATASET_DF`: dataframe of all subjects sounds with metadata

In [35]:
class PnoiAnotDataset(AnotFrame):
    """Annotation dataframe joined with selected columns of the muster table.

    Building an instance reads both CSV files, left-joins the annotation rows
    with the ``col_order`` muster columns on ``sub_id``, and writes the result
    to ``pnoi_anot_dataset.csv`` inside ``REPORT_FOLDER``.
    """

    # Table read from ``pnoi_muster_csv_path``.
    PNOI_MUSTER_DF: pd.DataFrame
    # Annotation rows merged with the selected muster columns.
    PNOI_ANOT_DATASET_DF: pd.DataFrame

    # Muster columns carried into the dataset; "sub_id" is also the join key.
    col_order = [
        'sub_id',

        'subjectName', 'subjectGender', 'subjectAge',
        'subjectType', 'subjectHeight', 'subjectWeight',

        'anot--META--file_path',

        'FEV1_ref_before', 'FEV1_val_before',
        'FVC_ref_before', 'FVC_val_before',
        'ratio_ref_before', 'ratio_val_before',

        'FEV1_ref_after', 'FEV1_val_after',
        'FVC_ref_after', 'FVC_val_after',
        'ratio_ref_after', 'ratio_val_after',
    ]

    def __init__(self, sync_aud_csv_path: str, pnoi_muster_csv_path: str) -> None:
        """Load both CSV files and build the merged dataset.

        Args:
            sync_aud_csv_path: Path of the sync audio CSV, handed to ``AnotFrame``.
            pnoi_muster_csv_path: Path of the muster CSV.
        """
        super().__init__(sync_aud_csv_path)
        self.PNOI_MUSTER_DF = pd.read_csv(pnoi_muster_csv_path)
        self.PNOI_ANOT_DATASET_DF = self.make_full_dataframe(self.PNOI_SYNC_ANOT_DF)

    def make_full_dataframe(self, basync_anot_df) -> pd.DataFrame:
        """Left-join ``basync_anot_df`` with the muster columns and save the result.

        Args:
            basync_anot_df: Annotation dataframe containing a ``sub_id`` column.

        Returns:
            The merged dataframe, which is also written (without index) to
            ``pnoi_anot_dataset.csv`` in ``REPORT_FOLDER``.
        """
        muster_subset = self.PNOI_MUSTER_DF[self.col_order]
        merged = basync_anot_df.merge(muster_subset, on="sub_id", how="left")

        merged.to_csv(f"{REPORT_FOLDER}/pnoi_anot_dataset.csv", index=False)
        return merged

def test():
    """Build a PnoiAnotDataset from both CSV files and return its merged dataframe."""
    dataset = PnoiAnotDataset(PNOI_BASYNC_CSV_IMPORT_PATH, MUSTER_CSV_IMPORT_PATH)
    return dataset.PNOI_ANOT_DATASET_DF


# Disabled exploratory snippet, kept unchanged:
# pnoidata_anotdataset = PnoiAnotDataset(PNOI_BASYNC_CSV_IMPORT_PATH, MUSTER_CSV_IMPORT_PATH)
# cols_filt = pnoidata_anotdataset.PNOI_MUSTER_DF.columns.str.contains("after")
# cols = pnoidata_anotdataset.PNOI_MUSTER_DF.columns[~cols_filt]

# pnoidata_anotdataset.PNOI_MUSTER_DF[cols].replace("-", np.nan).dropna().apply(pd.to_numeric, errors='ignore')['subjectType'].value_counts()

# Conditional expression (not an `if` statement) so the frame is the cell's displayed value.
test() if IS_TEST else None

Unnamed: 0,begin,end,label,dur,fs,sub_id,file_class,audio--file_path,anot--file_path,subjectName,...,FVC_ref_before,FVC_val_before,ratio_ref_before,ratio_val_before,FEV1_ref_after,FEV1_val_after,FVC_ref_after,FVC_val_after,ratio_ref_after,ratio_val_after
0,0.200000,21.961447,bb1,21.761447,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Tasmiya P M,...,-,-,-,-,_NA,_NA,_NA,_NA,_NA,_NA
1,0.200000,4.608721,oo,4.408721,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Tasmiya P M,...,-,-,-,-,_NA,_NA,_NA,_NA,_NA,_NA
2,4.608721,6.301670,ii,1.692949,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Tasmiya P M,...,-,-,-,-,_NA,_NA,_NA,_NA,_NA,_NA
3,6.301670,7.606652,xx,1.304982,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Tasmiya P M,...,-,-,-,-,_NA,_NA,_NA,_NA,_NA,_NA
4,7.606652,9.052712,ii,1.446060,16000,tasmiyapm_57aac126,_before_LU,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Tasmiya P M,...,-,-,-,-,_NA,_NA,_NA,_NA,_NA,_NA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,18.554887,20.754090,ii,2.199203,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0
2662,20.754090,22.699539,xx,1.945449,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0
2663,22.699539,24.644988,ii,1.945449,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0
2664,24.644988,26.252098,xx,1.607110,16000,lokeshk_90b4871a,_before_RL,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,../0_pnoi-reports/experiment_v15/DATA_PNOI-SYN...,Lokesh K,...,3.96,3.21,79.0,78.0,3.11,2.53,3.96,3.2,80.0,79.0


### Test Data Count per Subject

In [39]:
# Reload both tables from disk: the sync audio CSV and the merged dataset
# that PnoiAnotDataset.make_full_dataframe wrote to REPORT_FOLDER.
# The sync path reuses the configured constant instead of re-spelling the
# file name, so the two cannot drift apart.
PNOI_BASYNC_DF = pd.read_csv(PNOI_BASYNC_CSV_IMPORT_PATH)
PNOI_DATASET_DF = pd.read_csv(f"{REPORT_FOLDER}/pnoi_anot_dataset.csv")

In [42]:
subjects = PNOI_BASYNC_DF["sub_id"].unique()

# There should be 4 or 8 audio files per subject. Checking membership (rather
# than `% 4`) also flags counts such as 12 that are multiples of 4 but not allowed.
EXPECTED_FILE_COUNTS = (4, 8)

# One pass over the table; sort=False keeps subjects in order of first appearance,
# matching the order of `subjects`.
files_per_subject = PNOI_BASYNC_DF.groupby("sub_id", sort=False).size()

for sub, n_files in files_per_subject.items():
    if n_files not in EXPECTED_FILE_COUNTS:
        print(n_files, sub)

3 udita_6618e247
7 sujatan_bdd161b6


In [45]:
# Number of rows per annotation label in the dataset reloaded from pnoi_anot_dataset.csv.
PNOI_DATASET_DF["label"].value_counts()

label
xx     922
ii     922
ee     179
oo     178
uu     178
aa     178
bb1     28
bb2     28
bb4     27
bb3     26
Name: count, dtype: int64