In [None]:
# Cough data: per-cough feature columns (one list entry per extracted cough)
Audiodata = {
    'percentile_25': [],
    'percentile_50': [],
    'percentile_75': [],
    'percentile_90': [],
    'mfcc': [],
    'sc': [],
    'label': [],
}
all_cough_data = []  # Will be used to create mel spectrograms

# List both directories, keeping only the relevant file types
# (.wav recordings and .txt timestamp files)
audio_files = [
    name for name in os.listdir('./audio-records/cough')
    if name.endswith('.wav')
]
timestamp_files = [
    name for name in os.listdir('./audio-records/cough-timestamps')
    if name.endswith('.txt')
]

# Extract cough segments and their features from every recording.
#
# For each .wav file: locate its timestamp file, load and band-pass filter the
# audio, build a normalized moving-average envelope, cut a window of +/- 0.3 s
# around every labelled timestamp, and append the features of each window to
# Audiodata / all_cough_data.

# Reduce the timestamp file names to their 19-character prefix once (instead
# of on every iteration); that prefix is what gets compared with the audio
# file name stripped of ".wav".
timestamp_files = [name[0:19] for name in timestamp_files]
known_timestamp_prefixes = set(timestamp_files)

for file in audio_files:

    file_name = file.split(".wav")[0]

    # Finding corresponding timestamp file. A recording without one is skipped
    # with a message rather than aborting the whole run with a ValueError.
    if file_name not in known_timestamp_prefixes:
        print("No timestamp file found for {}, skipping".format(file))
        continue

    file_path = './audio-records/cough/' + file
    timestamp_path = './audio-records/cough-timestamps/' + file_name + '-label.txt'

    # Adding timestamps to the list
    real_timestamps = get_real_timestamps(timestamp_path)

    # Loading audio_file
    # Target sample rate: 48000
    data, sample_rate = librosa.load(file_path, sr=48000)

    # Audio duration (only reported in the diagnostics below)
    duration = librosa.get_duration(path=file_path)
    # Time axis in seconds for each loaded sample; not used further in this cell
    time = np.arange(0, len(data)) / sample_rate

    # Filtering data
    data = butter_bandpass_filter(data, 1000, 4000, sample_rate, 8)

    # Getting moving average of the rectified data
    moving_avg_data = compute_moving_average(np.abs(data))

    # Normalize the data
    moving_avg_data = normalize_data(moving_avg_data)

    # Integer length of the array that is actually sliced. Using
    # duration * sample_rate here would yield a float, which is not a valid
    # slice bound.
    total_samples = len(moving_avg_data)

    # *******************************
    # Find coughs in the data
    coughs = []

    for timestamp in real_timestamps:

        # Window of 0.3 s on each side of the timestamp, clamped to the signal
        start = max(int((timestamp - 0.3) * sample_rate), 0)
        finish = min(int((timestamp + 0.3) * sample_rate), total_samples)

        if finish <= start:
            # The timestamp lies outside the audio: nothing to extract
            print("Timestamp {} is outside {}, skipping".format(timestamp, file_path))
            continue

        coughs.append(moving_avg_data[start:finish])

    # *******************************
    for cough in coughs:
        # Removing DC component. This builds a new array instead of modifying
        # the slice in place: slices are views, so an in-place update would
        # also alter any other window that overlaps this one.
        cough = cough - np.mean(cough)
        # Stored regardless of whether the feature extraction below succeeds
        all_cough_data.append(cough)

        # Compute every feature before storing any of them, so a failure
        # cannot leave the Audiodata columns with different lengths.
        try:
            # Find percentile points
            per_25, per_50, per_75, per_90 = find_percentile_points(cough)

            # MFCC Feature extraction
            mfccs, delta_mfccs, delta2_mfccs = get_mfcc_features(cough)

            # Comprehensive MFCCs
            comprehensive_mfccs = np.concatenate((mfccs, delta_mfccs, delta2_mfccs))

            # Find spectral centroids
            sc = librosa.feature.spectral_centroid(y=cough, sr=sample_rate)
        except Exception as error:
            # Best effort: report the segment that failed and move on
            print("Skipping cough segment in {}: {}".format(file_path, error))
            continue

        # Diagnostics
        print(file_path)
        print(duration)
        print(mfccs.shape)
        print(delta_mfccs.shape)
        print(delta2_mfccs.shape)
        print("Comprehensive mfcc: {}".format(comprehensive_mfccs.shape))
        print(sc.shape)
        print()

        # Store one complete row of features
        Audiodata['percentile_25'].append(per_25)
        Audiodata['percentile_50'].append(per_50)
        Audiodata['percentile_75'].append(per_75)
        Audiodata['percentile_90'].append(per_90)
        Audiodata['mfcc'].append(comprehensive_mfccs)
        Audiodata['sc'].append(sc)
        Audiodata['label'].append('cough')
    