# PREPARE DATASET FOR PNOI EXPERIMENTS

In [4]:
import pandas as pd
import numpy as np
import functools
import matplotlib.pyplot as plt
import librosa
import os


mkdir = lambda p: 0 if os.path.exists(p) else (os.mkdir(p), 1)[1]

## PATHS to dataset

In [9]:
REPORTS = "reports"; mkdir(REPORTS) # Path to the reports folder

PNOI_CORPUS_CSV_NAME = "pnoicorpus_muster.csv" # Name of the csv file

PNOI_CORPUS_CSV_PATH = f"{REPORTS}/{PNOI_CORPUS_CSV_NAME}" # Path to the master csv file

## FILTER dataset

In [15]:
class DataFilter:

    EMPTY_VAL = '-'
    PNOI_MUSTER_DF: pd.DataFrame

    PNOI_FILT_DF: pd.DataFrame

    def __init__(self, muster_csv_path: str) -> None:
        pnoi_corpus_DF = pd.read_csv(muster_csv_path)
        pnoi_corpus_DF.columns = pnoi_corpus_DF.columns.str.replace("--file_path", ""); pnoi_corpus_DF.head()

        self.PNOI_MUSTER_DF: pd.DataFrame = pnoi_corpus_DF

        self.PNOI_FILT_DF: pd.DataFrame = self.filter_df()

    def filter_df(self):
        # BREATH AUDIO (BA) Columns
        _col_str_match = "anot--LBA_before|anot--VBA_before" # match string
        _ba_cols = self.PNOI_MUSTER_DF.columns[self.PNOI_MUSTER_DF.columns.str.contains(_col_str_match)]; print(_ba_cols) # get columns that match string
        _filts = [self.PNOI_MUSTER_DF[col] != self.EMPTY_VAL for col in _ba_cols] # filters for non-empty values
        _filt = functools.reduce(lambda p, c: p & c, _filts) # combine filters

        pnoi_corpus_filt_DF = self.PNOI_MUSTER_DF[_filt] # apply filter
        pnoi_corpus_filt_DF.to_csv(f"{REPORTS}/filtered_dataset.csv") # save filtered dataframe to csv
        print(pnoi_corpus_filt_DF.shape)
        
        return pnoi_corpus_filt_DF
    
pnoidata_filter = DataFilter(PNOI_CORPUS_CSV_PATH)

pnoidata_filter.PNOI_FILT_DF

Index(['anot--VBA_before', 'anot--LBA_before_LU', 'anot--LBA_before_RU',
       'anot--LBA_before_LL', 'anot--LBA_before_RL'],
      dtype='object')
(25, 47)


Unnamed: 0,index,app_code,sub_id,subjectName,subjectGender,subjectAge,subjectType,subjectHeight,subjectWeight,META,...,LBA_after_RU,anot--LBA_after_RU,LBA_after_LL,anot--LBA_after_LL,LBA_after_RL,anot--LBA_after_RL,PFT_before,anot--PFT_before,PFT_after,anot--PFT_after
0,0,pnoistor_feb2023,shreyamgupta_78aa423a,Shreyam Gupta,Female,19,Control,158,70,DATA_PNOISTOR/pnoistor_feb2023/shreyamgupta_78...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_feb2023/shreyamgupta_78...,DATA_PNOISTOR/pnoistor_feb2023/shreyamgupta_78...,-,-
2,2,pnoistor_dec01,sannashoukat_5213fe84,Sanna Shoukat,Female,21,Control,166,55,DATA_PNOISTOR/pnoistor_dec01/sannashoukat_5213...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_dec01/sannashoukat_5213...,DATA_PNOISTOR/pnoistor_dec01/sannashoukat_5213...,-,-
3,3,pnoistor_feb2023,sananaushad_80e84b51,Sana Naushad,Female,21,Control,158,60,DATA_PNOISTOR/pnoistor_feb2023/sananaushad_80e...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_feb2023/sananaushad_80e...,DATA_PNOISTOR/pnoistor_feb2023/sananaushad_80e...,-,-
4,4,pnoistor_dec01,saikeerthanaarun_3364bc1a,Sai Keerthana Arun,Female,22,Control,172,58,DATA_PNOISTOR/pnoistor_dec01/saikeerthanaarun_...,...,-,-,-,-,-,-,-,-,-,-
6,6,pnoistor_dec01,kumarchowdam_53f32e31,Kumar Chowdam,Male,21,Control,162,60,DATA_PNOISTOR/pnoistor_dec01/kumarchowdam_53f3...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_dec01/kumarchowdam_53f3...,DATA_PNOISTOR/pnoistor_dec01/kumarchowdam_53f3...,-,-
7,7,pnoistor_feb2023,darshanvshettar_843c416b,Darshan V Shettar,Male,21,Control,178,64,DATA_PNOISTOR/pnoistor_feb2023/darshanvshettar...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_feb2023/darshanvshettar...,DATA_PNOISTOR/pnoistor_feb2023/darshanvshettar...,-,-
8,8,pnoistor_feb2023,atifahmed_ebb5032c,Atif ahmed,Male,21,Control,170,70,DATA_PNOISTOR/pnoistor_feb2023/atifahmed_ebb50...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_feb2023/atifahmed_ebb50...,DATA_PNOISTOR/pnoistor_feb2023/atifahmed_ebb50...,-,-
9,9,pnoistor_oct07,johnkiranborugada_e390538c,John Kiran Borugada,Male,21,Control,172,70,DATA_PNOISTOR/pnoistor_oct07/johnkiranborugada...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_oct07/johnkiranborugada...,DATA_PNOISTOR/pnoistor_oct07/johnkiranborugada...,-,-
10,10,pnoistor_oct07,amartyaveer_81b8f33c,Amartyaveer,Male,22,Control,182,57,DATA_PNOISTOR/pnoistor_oct07/amartyaveer_81b8f...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_oct07/amartyaveer_81b8f...,DATA_PNOISTOR/pnoistor_oct07/amartyaveer_81b8f...,-,-
11,11,pnoistor_oct07,jesurajabandekar_7273cc8f,Jesuraja Bandekar,Male,24,Control,173,74,DATA_PNOISTOR/pnoistor_oct07/jesurajabandekar_...,...,-,-,-,-,-,-,DATA_PNOISTOR/pnoistor_oct07/jesurajabandekar_...,-,-,-


In [37]:
import soundfile as sf

In [None]:
class AudioDataProcess:
    

In [38]:
def export_audio_signals(aud_info: str, loc_i: int, export_path: str, dry_run=False) -> tuple[str]:
    loc = ["LU", "RU", "LL", "RL"] # location order
    fname = os.path.basename(aud_info["aud_fp"]) # get filename

    '''
    ["app_code 0", "sub_id 1", "file_class 2", "file_ID 3", "comment 4", "file_format 5"]'''

    fn_parts = fname.split('-') # split filename into parts
    n_fclass = fn_parts[2] # get filename class
    fn_parts[2] = n_fclass if 'LBA' in n_fclass else f"{n_fclass}_{loc[loc_i]}" # update filename class
    fn_parts[-1] = fn_parts[-1].lower() # lowercase file format
    n_aud_fname = '-'.join(fn_parts) # join filename parts

    #create export folder
    export_folder_path = os.path.join(export_path, "EXPORT_DATA"); mkdir(export_folder_path)
    sub_folder_path = os.path.join(export_folder_path, fn_parts[1]); mkdir(sub_folder_path)

    
    # export audio file
    n_aud_fpath = os.path.join(sub_folder_path, n_aud_fname)
    if not dry_run: sf.write(n_aud_fpath, aud_info['signal'], aud_info["fs"]) # export audio file
    
    # export annotation file
    n_anot_fname = n_aud_fname.replace(".wav", ".txt") # replace file extension
    n_anotpath = os.path.join(sub_folder_path, n_anot_fname)
    anot_df: pd.DataFrame = aud_info["label_DF"]
    if not dry_run: anot_df.to_csv(n_anotpath, sep='\t', index=False, header=False) # export annotation file

    return {
        "audio--file_path": n_aud_fpath, 
        "anot--file_path": n_anotpath,
        "file_class": fn_parts[2],
        "sub_id": fn_parts[1],
        }


In [39]:
def extract_signal_chunk(chunks_DF: pd.DataFrame, is_plot=False, PAD=0.2):

    # Get begin and end values of chunk
    begin = chunks_DF.iloc[0]["begin"]
    end = chunks_DF.iloc[-1]["end"]

    # Calculate offset (with Padding)
    offset = (begin - PAD)
    duration = (end - begin + 2*PAD)

    # Offset chunk DF
    chunks_DF.loc[:, ["begin", "end"]] -= offset

    # label DF
    label_DF = chunks_DF.loc[:, ['begin', 'end', 'label']]
    
    # Extract audio signal from audio file
    audio_fp = chunks_DF.iloc[0]["aud_fp"]
    signal, fs = librosa.load(audio_fp, sr=None, mono=True, offset=offset, duration=duration)

    # Normalize signal
    signal = librosa.util.normalize(signal)

    # PLOT signals
    if is_plot:
        plt.plot(np.linspace(0, duration, len(signal)), signal)
        plt.stem(chunks_DF.iloc[:]["begin"], np.ones(len(chunks_DF)))
        plt.stem(chunks_DF.iloc[:]["end"], np.ones(len(chunks_DF)), 'r')
        plt.show()

    return {
        "fs": fs, # sampling frequency
        "aud_fp": audio_fp, # audio file path
        "signal": signal, # audio signal
        "label_DF": label_DF, # label dataframe
        }

In [40]:
def anot_breath_loc_chunks(audio_fp: str, anot_fp: str, GAP: float=10.0) -> list[pd.DataFrame]:
    """
    Split audio file into chunks based on duration b/w breaths in the audio file.
    The breaths are annotated in the annotation file.
    The annotation file is a tsv file exported from audacity.
    """

    anot_df: pd.DataFrame = pd.read_csv(anot_fp, sep='\t', names=["begin", "end", "label"]) 
    # Read annotation labeled with audacity tsv file

    anot_df["aud_fp"] = audio_fp
    anot_df["dur"] = anot_df['end'] - anot_df["begin"] 
    # Calculate duration of each label marke in annotation

    anot_df["gap"] = anot_df['begin'].shift(-1) - anot_df["end"] 
    # Calculate gap between previous label end and next label start: give the gap between labels

    gap_filt = (anot_df['gap'] > GAP) | (anot_df['gap'].isnull()) 
    # Filter rows where gap b/w labels is above the threshold (GAP)
    anot_gap_df = anot_df.loc[gap_filt] # Apply filter

    i_splits = sorted(set([0] + list(anot_gap_df.index + 1))) 
    # Get row index of those breakpoints (breath location chunks)

    chunks = [anot_df.iloc[i_splits[n]:i_splits[n+1]] for n in range(len(i_splits) - 1)] 
    # Split chunks using the index of breakpoints

    # assert len(chunks) == 4 # check if there are 4 chunks

    return chunks


In [41]:
pnoi_corpus_filt_DF.reset_index()

Unnamed: 0,level_0,index,app-version-code,subject-ID,subjectName,subjectGender,subjectAge,subjectType,subjectHeight,subjectWeight,...,ratio_ref_after,FEV1_val_after,FVC_val_after,ratio_val_after,PFT_before,anot--PFT_before,PFT_after,anot--PFT_after,META,anot--META
0,0,0,pnoistor_feb2023,shreyamgupta_78aa423a,Shreyam Gupta,Female,19,Control,158,70,...,-,-,-,-,DATA_/pnoistor_feb2023/shreyamgupta_78aa423a/p...,DATA_/pnoistor_feb2023/shreyamgupta_78aa423a/p...,-,-,DATA_/pnoistor_feb2023/shreyamgupta_78aa423a/p...,DATA_/pnoistor_feb2023/shreyamgupta_78aa423a/p...
1,2,2,pnoistor_dec01,sannashoukat_5213fe84,Sanna Shoukat,Female,21,Control,166,55,...,-,-,-,-,DATA_/pnoistor_dec01/sannashoukat_5213fe84/pno...,DATA_/pnoistor_dec01/sannashoukat_5213fe84/pno...,-,-,DATA_/pnoistor_dec01/sannashoukat_5213fe84/pno...,DATA_/pnoistor_dec01/sannashoukat_5213fe84/pno...
2,3,3,pnoistor_feb2023,sananaushad_80e84b51,Sana Naushad,Female,21,Control,158,60,...,-,-,-,-,DATA_/pnoistor_feb2023/sananaushad_80e84b51/pn...,DATA_/pnoistor_feb2023/sananaushad_80e84b51/pn...,-,-,DATA_/pnoistor_feb2023/sananaushad_80e84b51/pn...,DATA_/pnoistor_feb2023/sananaushad_80e84b51/pn...
3,4,4,pnoistor_dec01,saikeerthanaarun_3364bc1a,Sai Keerthana Arun,Female,22,Control,172,58,...,-,-,-,-,-,-,-,-,DATA_/pnoistor_dec01/saikeerthanaarun_3364bc1a...,DATA_/pnoistor_dec01/saikeerthanaarun_3364bc1a...
4,6,6,pnoistor_dec01,kumarchowdam_53f32e31,Kumar Chowdam,Male,21,Control,162,60,...,-,-,-,-,DATA_/pnoistor_dec01/kumarchowdam_53f32e31/pno...,DATA_/pnoistor_dec01/kumarchowdam_53f32e31/pno...,-,-,DATA_/pnoistor_dec01/kumarchowdam_53f32e31/pno...,DATA_/pnoistor_dec01/kumarchowdam_53f32e31/pno...
5,7,7,pnoistor_feb2023,darshanvshettar_843c416b,Darshan V Shettar,Male,21,Control,178,64,...,-,-,-,-,DATA_/pnoistor_feb2023/darshanvshettar_843c416...,DATA_/pnoistor_feb2023/darshanvshettar_843c416...,-,-,DATA_/pnoistor_feb2023/darshanvshettar_843c416...,DATA_/pnoistor_feb2023/darshanvshettar_843c416...
6,8,8,pnoistor_feb2023,atifahmed_ebb5032c,Atif ahmed,Male,21,Control,170,70,...,-,-,-,-,DATA_/pnoistor_feb2023/atifahmed_ebb5032c/pnoi...,DATA_/pnoistor_feb2023/atifahmed_ebb5032c/pnoi...,-,-,DATA_/pnoistor_feb2023/atifahmed_ebb5032c/pnoi...,DATA_/pnoistor_feb2023/atifahmed_ebb5032c/pnoi...
7,9,9,pnoistor_oct07,johnkiranborugada_e390538c,John Kiran Borugada,Male,21,Control,172,70,...,-,-,-,-,DATA_/pnoistor_oct07/johnkiranborugada_e390538...,DATA_/pnoistor_oct07/johnkiranborugada_e390538...,-,-,DATA_/pnoistor_oct07/johnkiranborugada_e390538...,DATA_/pnoistor_oct07/johnkiranborugada_e390538...
8,10,10,pnoistor_oct07,amartyaveer_81b8f33c,Amartyaveer,Male,22,Control,182,57,...,-,-,-,-,DATA_/pnoistor_oct07/amartyaveer_81b8f33c/pnoi...,DATA_/pnoistor_oct07/amartyaveer_81b8f33c/pnoi...,-,-,DATA_/pnoistor_oct07/amartyaveer_81b8f33c/pnoi...,DATA_/pnoistor_oct07/amartyaveer_81b8f33c/pnoi...
9,11,11,pnoistor_oct07,jesurajabandekar_7273cc8f,Jesuraja Bandekar,Male,24,Control,173,74,...,-,-,-,-,DATA_/pnoistor_oct07/jesurajabandekar_7273cc8f...,-,-,-,DATA_/pnoistor_oct07/jesurajabandekar_7273cc8f...,DATA_/pnoistor_oct07/jesurajabandekar_7273cc8f...


In [42]:
# _i = np.random.randint(len(pnoi_corpus_filt_DF)); _i

# _rv = pnoi_corpus_filt_DF.iloc[_i]


_ba_str_match = '|'.join(['anot--VBA_before', 'anot--LBA_before'])
_ba_cols = pnoi_corpus_filt_DF.columns[pnoi_corpus_filt_DF.columns.str.contains(_ba_str_match)]; print(_ba_cols)

aud_info_dict = []
for _, _rv in pnoi_corpus_filt_DF.iterrows():
    get_chunks = lambda col: anot_breath_loc_chunks(_rv[col.replace("anot--", "")], _rv[col])

    chunks = [get_chunks(col) for col in _ba_cols]
    chunks = functools.reduce(lambda p, c: p + c, chunks)

    for ci, chk in enumerate(chunks):
        chunk_info = extract_signal_chunk(chk, is_plot=False)

        aud_info = export_audio_signals(chunk_info, ci, export_path=REPORTS, dry_run=True)

        aud_info_dict.append(aud_info)



pnoi_corpus_split_DF = pd.DataFrame(aud_info_dict); pnoi_corpus_split_DF

Index(['anot--VBA_before', 'anot--LBA_before_LU', 'anot--LBA_before_RU',
       'anot--LBA_before_LL', 'anot--LBA_before_RL'],
      dtype='object')


  return f(*args, **kwargs)


Unnamed: 0,audio--file_path,anot--file_path,file_class,sub_id
0,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,VBA_before_LU,shreyamgupta_78aa423a
1,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,VBA_before_RU,shreyamgupta_78aa423a
2,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,VBA_before_LL,shreyamgupta_78aa423a
3,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,VBA_before_RL,shreyamgupta_78aa423a
4,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,./reports/EXPORT_DATA/shreyamgupta_78aa423a/pn...,LBA_before_LU,shreyamgupta_78aa423a
...,...,...,...,...
195,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,VBA_before_RL,lokeshk_90b4871a
196,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,LBA_before_LU,lokeshk_90b4871a
197,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,LBA_before_RU,lokeshk_90b4871a
198,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,LBA_before_LL,lokeshk_90b4871a


In [44]:
all_subjects = pd.unique(pnoi_corpus_split_DF["sub_id"]); all_subjects
loc = ["LU", "RU", "LL", "RL"] # location order

def join_ba_signals(rv, export_path, fs=16000, is_plot=False, dry_run=False):

    read_anot = lambda fp: pd.read_csv(fp, sep='\t', names=["begin", "end", "label"]) # Read annotation labeled with audacity tsv file
    load_audio = lambda fp: librosa.load(fp, sr=fs, mono=True)[0] # Load audio signal
    pad_sig = lambda sig, max_len: np.pad(sig, (0, max_len - len(sig)), mode='constant') # Pad signal with zeros

    # Plot signal and annotation
    def plot_sig(aud, anot, fs, scale=1.0):
        t = np.linspace(0, len(aud)/fs, len(aud))
        plt.stem(anot["begin"], np.ones(len(anot))*1.5, 'r')
        plt.stem(anot["end"], np.ones(len(anot))*1.2, 'g')
        plt.plot(t, aud*scale)

    
    # Read annotation
    vba_anot, lba_anot = read_anot(rv["anot--file_path_VBA"]), read_anot(rv["anot--file_path_LBA"])

    # Load audio signal
    vba_sig, lba_sig = load_audio(rv["audio--file_path_VBA"]), load_audio(rv["audio--file_path_LBA"])


    # Pad signals with zeros to make them equal length
    sig_len = max(len(vba_sig), len(lba_sig)) # get max length of the two signals
    vba_sig, lba_sig = pad_sig(vba_sig, sig_len), pad_sig(lba_sig, sig_len)


    if is_plot: # plot signal and annotation
       plt.figure(figsize=(140, 40))
       plot_sig(vba_sig, vba_anot, fs)
       plot_sig(lba_sig, lba_anot, fs, scale=0.2)
       
       plt.show()
       print(rv["sub_id"], rv["match"])

    # Join the two signals
    ba_sig = np.array([vba_sig, lba_sig]).T

    #create export folder
    fname = os.path.basename(rv["audio--file_path_VBA"]) # get filename
    '''
    ["app_code 0", "sub_id 1", "file_class 2", "file_ID 3", "comment 4", "file_format 5"]'''
    fn_parts = fname.split('-') # split filename into parts

    n_filename = fname.replace("VBA", "BA").replace(".wav", "")
    export_folder_path = os.path.join(export_path, "EXPORT_DATA3"); mkdir(export_folder_path)
    sub_folder_path = os.path.join(export_folder_path, rv['sub_id']); mkdir(sub_folder_path)

    # Export audio file
    audio_filename = os.path.join(sub_folder_path, f"{n_filename}.wav")
    if not dry_run: sf.write(audio_filename, ba_sig, fs) # export audio file

    # Export annotation file
    anot_filepath = os.path.join(sub_folder_path, f"{n_filename}.txt")
    if not dry_run: vba_anot.to_csv(anot_filepath, sep='\t', index=False, header=False)

    return {
        "fs": fs, # sampling frequency
        "subject-ID": rv['sub_id'], # subject ID
        "audio_filepath": audio_filename, # audio file path
        "anot_filepath": anot_filepath, # label dataframe
        }

ba_file_dicts = []
for sub in all_subjects[:]:

    _filt = pnoi_corpus_split_DF["sub_id"] == sub
    sub_df = pnoi_corpus_split_DF.loc[_filt]
    sub_df["match"] = sub_df["file_class"].replace("VBA|LBA", "", regex=True)

    _filt = sub_df["file_class"].str.contains("VBA")
    sub_VBA = sub_df.loc[_filt]
    sub_LBA = sub_df.loc[~_filt]

    sub_df2 = sub_VBA.merge(sub_LBA, how="inner", on=["match", "sub_id"], suffixes=("_VBA", "_LBA"))

    ba_file_dicts += [join_ba_signals(rv, REPORTS, is_plot=False, dry_run=False) for _, rv in sub_df2.iterrows()]
    
    # break
    
    
pd.DataFrame(ba_file_dicts)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df["match"] = sub_df["file_class"].replace("VBA|LBA", "", regex=True)


Unnamed: 0,fs,subject-ID,audio_filepath,anot_filepath
0,16000,shreyamgupta_78aa423a,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...
1,16000,shreyamgupta_78aa423a,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...
2,16000,shreyamgupta_78aa423a,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...
3,16000,shreyamgupta_78aa423a,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...,./reports/EXPORT_DATA3/shreyamgupta_78aa423a/p...
4,16000,sannashoukat_5213fe84,./reports/EXPORT_DATA3/sannashoukat_5213fe84/p...,./reports/EXPORT_DATA3/sannashoukat_5213fe84/p...
...,...,...,...,...
95,16000,sujatan_bdd161b6,./reports/EXPORT_DATA3/sujatan_bdd161b6/pnoist...,./reports/EXPORT_DATA3/sujatan_bdd161b6/pnoist...
96,16000,lokeshk_90b4871a,./reports/EXPORT_DATA3/lokeshk_90b4871a/pnoist...,./reports/EXPORT_DATA3/lokeshk_90b4871a/pnoist...
97,16000,lokeshk_90b4871a,./reports/EXPORT_DATA3/lokeshk_90b4871a/pnoist...,./reports/EXPORT_DATA3/lokeshk_90b4871a/pnoist...
98,16000,lokeshk_90b4871a,./reports/EXPORT_DATA3/lokeshk_90b4871a/pnoist...,./reports/EXPORT_DATA3/lokeshk_90b4871a/pnoist...


In [202]:
sub_df2.iloc[0].to_dict()

{'audio--file_path_VBA': './reports/EXPORT_DATA/sujatan_bdd161b6/pnoistor_apr2023-sujatan_bdd161b6-VBA_before_LU-611f-comnt.wav',
 'anot--file_path_VBA': './reports/EXPORT_DATA/sujatan_bdd161b6/pnoistor_apr2023-sujatan_bdd161b6-VBA_before_LU-611f-comnt.txt',
 'file_class_VBA': 'VBA_before_LU',
 'sub_id': 'sujatan_bdd161b6',
 'match': '_before_LU',
 'audio--file_path_LBA': './reports/EXPORT_DATA/sujatan_bdd161b6/pnoistor_apr2023-sujatan_bdd161b6-LBA_before_LU-f46c-comnt.wav',
 'anot--file_path_LBA': './reports/EXPORT_DATA/sujatan_bdd161b6/pnoistor_apr2023-sujatan_bdd161b6-LBA_before_LU-f46c-comnt.txt',
 'file_class_LBA': 'LBA_before_LU'}

In [130]:
sub_VBA

Unnamed: 0_level_0,audio--file_path,anot--file_path,sub_id
file_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
VBA_before_LL,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,lokeshk_90b4871a
VBA_before_LU,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,lokeshk_90b4871a
VBA_before_RL,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,lokeshk_90b4871a
VBA_before_RU,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,./reports/EXPORT_DATA/lokeshk_90b4871a/pnoisto...,lokeshk_90b4871a


In [122]:
pnoi_corpus_split_DF["file_class"].value_counts()
pd.unique(pnoi_corpus_split_DF["sub_id"]).__len__()

25