In [18]:
import pandas as pd
import numpy as np
import os

In [19]:
medical_data = pd.read_csv("data/Respiratory_Sound_Database/patient_diagnosis.csv", names = ["patient", "diagnosis"])
print(medical_data)
print()
value_counts = medical_data["diagnosis"].value_counts()
print(value_counts)


     patient  diagnosis
0        101       URTI
1        102    Healthy
2        103     Asthma
3        104       COPD
4        105       URTI
..       ...        ...
121      222       COPD
122      223       COPD
123      224    Healthy
124      225    Healthy
125      226  Pneumonia

[126 rows x 2 columns]

diagnosis
COPD              64
Healthy           26
URTI              14
Bronchiectasis     7
Pneumonia          6
Bronchiolitis      6
LRTI               2
Asthma             1
Name: count, dtype: int64


In [20]:
import glob

directory_path = 'data/Respiratory_Sound_Database/audio_and_txt_files'
text_files = glob.iglob(directory_path + '/*.txt', recursive=True)
text_files_list = list(text_files)

dfs = []
for file in text_files_list:
    content = pd.read_csv(file, names = ["start_time", "end_time", "crackles", "wheezes"], delimiter = "\t")
    #print(file)
    #print(file.split("_")[5])
    patient_number = file.split("_")[5][6:]
    #print(patient_number)
    content['patient'] = int(patient_number)
    dfs.append(content)
samples = pd.concat(dfs)

print("There are", len(samples), "cycles")
print("There are", len(text_files_list), "samples")

There are 6898 cycles
There are 920 samples


In [21]:
all_data = pd.merge(medical_data, samples, on = "patient", how = "outer")
all_data["diagnosis"].value_counts()

diagnosis
COPD              5746
Healthy            322
Pneumonia          285
URTI               243
Bronchiolitis      160
Bronchiectasis     104
LRTI                32
Asthma               6
Name: count, dtype: int64

In [22]:
resp_cycles_len = samples["end_time"] - samples["start_time"]
print("Mean is", resp_cycles_len.mean())
print("Standard deviation is", resp_cycles_len.std())



Mean is 2.7005090852420994
Standard deviation is 1.1725341343398874


In [43]:
import os

# clear output folder
output_folder = 'data/spectograms'
for file_name in os.listdir(output_folder):
    file_path = os.path.join(output_folder, file_name)
    # Remove the file
    if os.path.isfile(file_path):
        os.remove(file_path)

print(f"Contents of {output_folder} cleared.")

# how many files in output folder?
output_folder = 'data/spectograms/'
files_in_output_folder = os.listdir(output_folder)
num_files = len(files_in_output_folder)
print("num files in output folder:", num_files)



Contents of data/working cleared.
num files in output folder: 0


In [45]:
import librosa as librosa
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
%matplotlib inline
import librosa.display
from IPython.display import Audio
import time
import gc


# converts file, where file is a string (name of the file), into a string with the given format, where format is a string
# assumes the format already in the file is 3 characters long
# if cycle > 0, it adds "cycle _" to the end of the file name before the ending
# if cycle = 0, it leaves the rest of the file as is
def convert_type(file, format, cycle):
    if cycle > 0:
        file = file[0:len(file)-4] + "_cycle" + str(cycle) + "." + format
    else:
        file = file[0:len(file)-3] + format
    return file


# divide audio_file into its respiratory cycles
# takes the name of the audio file (.wav) as a string as the argument
def divide_into_cycles(audio_file_name):
    # take just the timestamps for the respiratory cycles- ignore crackles and wheezes
    audio_file_name_as_txt = directory_path + "/" + convert_type(audio_file_name, "txt", 0)
    timestamps = np.loadtxt(audio_file_name_as_txt, delimiter='\t')
    timestamps = timestamps[:, :2]
    
    return timestamps


def generate_spectogram(audio_file):
    # load the audio file
    y, sr = librosa.load(audio_file)
    
    audio_file_name = os.path.basename(audio_file)
    # find where the timestamps of audio_file are
    timestamps = divide_into_cycles(audio_file_name) # list
    
    # find the length of the shortest spectrogram
    min = -1
    for cycle in timestamps:
        start_time = cycle[0]
        end_time = cycle[1]
        length = end_time - start_time
        if min == -1 or length < min:
            min = length
    
    # make a mel db spectrogram for each respiratory cycle using the timestamps
    cycle_number = 1
    for cycle in timestamps:
        # take the portion of the audio that contains this respiratory cycle

        # start and end time in seconds
        start_time = cycle[0]
        end_time = cycle[1]
        
        # determine how much of the audio segment we need to crop
        diff = (end_time - start_time) - min
        crop_amt = int(diff / 2)
        
        # convert the start and end times to sample indices
        start_sample = int(start_time * sr) + crop_amt
        end_sample = int(end_time * sr) - crop_amt
        
        # extract the audio segment
        audio_segment = y[start_sample:end_sample]
        
        spectrogram_name = convert_type(audio_file_name, "jpg", cycle_number)
        
        # make the mel db spectrogram based on that audio segment
        D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_segment, n_fft=2048, hop_length=512)), ref=np.max)
        # display the spectrogram
        plt.figure(figsize=(5, 2))
        librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log', cmap='viridis')
        plt.axis('off')
        plt.subplots_adjust(left=0, right=1, top=1, bottom=0) # removes the white space around the spectrogram
        # plt.colorbar(format='%+2.0f dB')
        # plt.title(spectrogram_name)

        # save the spectrogram to the output folder
        output_path = os.path.join(output_folder, spectrogram_name)
        plt.savefig(output_path)
        plt.close()
        
        cycle_number += 1


In [52]:
audio_files = glob.iglob(directory_path + '/*.wav', recursive=True)

In [47]:
begin_time = time.time()
file_number = 0

for audio_file in audio_files:
    generate_spectogram(audio_file)
    file_number += 1

    # Print progress
    if file_number % 10 == 0:
        print(f"Processed {file_number} files so far in {time.time() - begin_time} seconds")
        gc.collect()

print(f"Processed a batch of {file_number} files in {time.time() - begin_time} seconds")

'''# TO REMOVE: just generate spectrograms for 1 file
for audio_file in audio_files:
    generate_spectogram(audio_file)
    break'''

0
1
2
3
4
5
6
7
8
9
Processed 10 files so far in 4.698668718338013 seconds
10
11
12
13
14
15
16
17
18
19
Processed 20 files so far in 9.230161905288696 seconds
20
21
22
23
24
25
26
27
28
29
Processed 30 files so far in 13.70967984199524 seconds
30
31
32
33
34
35
36
37
38
39
Processed 40 files so far in 17.655300855636597 seconds
40
41
42
43
44
45
46
47
48
49
Processed 50 files so far in 21.570308685302734 seconds
50
51
52
53
54
55
56
57
58
59
Processed 60 files so far in 25.333678722381592 seconds
60
61
62
63
64
65
66
67
68
69
Processed 70 files so far in 29.079599857330322 seconds
70
71
72
73
74
75
76
77
78
79
Processed 80 files so far in 32.902023792266846 seconds
80
81
82
83
84
85
86
87
88
89
Processed 90 files so far in 36.78122067451477 seconds
90
91
92
93
94
95
96
97
98
99
Processed 100 files so far in 41.501980781555176 seconds
100
101
102
103
104
105
106
107
108
109
Processed 110 files so far in 45.45787477493286 seconds
110
111
112
113
114
115
116
117
118
119
Processed 120 fil

'# TO REMOVE: just generate spectrograms for 1 file\nfor audio_file in audio_files:\n    generate_spectogram(audio_file)\n    break'

In [48]:
import shutil
shutil.copy("data/Respiratory_Sound_Database/patient_diagnosis.csv",
           "data/spectograms/")

'data/working/patient_diagnosis.csv'

In [49]:
from scipy.io.wavfile import write

def generate_audio_clips(audio_file, output_folder):
    # load the audio file
    y, sr = librosa.load(audio_file)
    
    audio_file_name = os.path.basename(audio_file)
    # find where the timestamps of audio_file are
    timestamps = divide_into_cycles(audio_file_name) # list
    
    # make an audio clip for each respiratory cycle using the timestamps
    cycle_number = 1
    for cycle in timestamps:
        # take the portion of the audio that contains this respiratory cycle

        # start and end time in seconds
        start_time = cycle[0]
        end_time = cycle[1]
        
        # convert the start and end times to sample indices
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)
        
        # extract the audio segment
        audio_segment = y[start_sample:end_sample]
        
        audio_clip_name = convert_type(audio_file_name, "wav", cycle_number)
        
        # save the audio clip to the output folder
        output_path = os.path.join(output_folder, audio_clip_name)
        write(output_path, sr, audio_segment)
        
        cycle_number += 1

In [50]:
directory_path = 'data/Respiratory_Sound_Database/audio_and_txt_files'
output_folder = 'data/Respiratory_Sound_Database/clips_by_cycle'

audio_files = glob.iglob(directory_path + '/*.wav', recursive=True)

for audio_file in audio_files:
    generate_audio_clips(audio_file, output_folder)

In [65]:
# get the list of sound files
sound_files = glob.glob('data/Respiratory_Sound_Database/clips_by_cycle/*.wav')

# get the list of spectrogram files
spectrogram_files = glob.glob('data/spectograms/*.jpg')

# create a DataFrame
df = pd.DataFrame({
    'filename': [os.path.splitext(os.path.basename(x))[0] for x in sound_files],
    'sound_file': sound_files,
    'spectrogram_file': spectrogram_files
})

df = df.sort_values('filename', ascending=True)
df = df.reset_index(drop=True)

# print the DataFrame
print(df)

                            filename  \
0      101_1b1_Al_sc_Meditron_cycle1   
1     101_1b1_Al_sc_Meditron_cycle10   
2     101_1b1_Al_sc_Meditron_cycle11   
3     101_1b1_Al_sc_Meditron_cycle12   
4      101_1b1_Al_sc_Meditron_cycle2   
...                              ...   
6893   226_1b1_Pl_sc_LittC2SE_cycle5   
6894   226_1b1_Pl_sc_LittC2SE_cycle6   
6895   226_1b1_Pl_sc_LittC2SE_cycle7   
6896   226_1b1_Pl_sc_LittC2SE_cycle8   
6897   226_1b1_Pl_sc_LittC2SE_cycle9   

                                             sound_file  \
0     data/Respiratory_Sound_Database/clips_by_cycle...   
1     data/Respiratory_Sound_Database/clips_by_cycle...   
2     data/Respiratory_Sound_Database/clips_by_cycle...   
3     data/Respiratory_Sound_Database/clips_by_cycle...   
4     data/Respiratory_Sound_Database/clips_by_cycle...   
...                                                 ...   
6893  data/Respiratory_Sound_Database/clips_by_cycle...   
6894  data/Respiratory_Sound_Database/clips_by_

In [66]:
# export the DataFrame to a CSV file
df.to_csv('data/df_spectogram_by_cycle.csv', index=True)