### Data Preparation
The purpose of this notebook is to prepare our data. The data will be downloaded from various sources and then we'll use SageMaker to run preprocessing and prepare the data for training.

This notebook should ideally only be run a couple of times; the result is written to your S3 bucket so that training can simply pull the prepared data each time, rather than preprocessing and training on every run.

In [None]:
!sudo apt-get install -y sox unzip

In [None]:
# AWS libs and setup
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session and the execution role attached to this notebook
sess = sagemaker.Session()
role = get_execution_role()

# Region of the default boto3 session, plus a low-level SageMaker client
boto_session = boto3.session.Session()
region = boto_session.region_name
sm = boto3.Session().client('sagemaker')

print(f"Region = {region}")

In [None]:
# 
# Download the data
# Note: You'll only want to do this once as it takes a while to download the source data
#
import os
import pandas as pd
import numpy as np
import subprocess
import numpy as np
import glob
import pandas as pd
import csv

# 
# Initialize folder structure
#

# Folder layout, relative to the notebook's working directory
coswara_data = 'rawdata/coswara'
coughvid_data = 'rawdata/coughvid'
combined_data = 'rawdata/combined'
combined_audio_data = 'rawdata/combined/audio'

# Create every folder independently of the others so that a partially created
# tree (e.g. 'rawdata' exists but 'rawdata/combined/audio' does not) is repaired
# on re-run. coswara_data is intentionally not created here: `git clone` below
# creates it.
for folder in (coughvid_data, combined_data, combined_audio_data):
    os.makedirs(folder, exist_ok=True)

#
# Coswara
#
# Note: These are already in wav format in an archive, the labeling is more complex it's not just coughs, also speaking
# Extract and processing: https://github.com/iiscleap/Coswara-Data/blob/master/extract_data.py
# Size: ~12.7GB

# Let's check in case we've already done this
extracted_data_dir = os.path.join(coswara_data, 'extracted_data')
if not os.path.exists(extracted_data_dir):
    print("no coswara dir")
    !git clone https://github.com/iiscleap/Coswara-Data.git {coswara_data}  

    if not os.path.exists(extracted_data_dir):
        os.makedirs(extracted_data_dir) # Creates the Extracted_data folder if it doesn't exist

    dirs_extracted = set(map(os.path.basename,glob.glob('{}/202*'.format(extracted_data_dir))))
    dirs_all = set(map(os.path.basename,glob.glob('{}/202*'.format(coswara_data))))

    dirs_to_extract = list(set(dirs_all) - dirs_extracted)

    for d in dirs_to_extract:
        p = subprocess.Popen('cat {}/{}/*.tar.gz.* |tar -xz -C {}/'.format(coswara_data, d, extracted_data_dir), shell=True)
        p.wait()
else:
    print("found exisiting coswara dataset")

# Format the extracted contents and csv
# I had issues with pandas and the word "true", just couldn't select it?
# Rewrite combined_data.csv as combined_data_upd.csv with every "True" replaced
# by "IsTrue". Context managers make sure both handles are closed (the input
# handle used to be left open); opening with "w" truncates any previous copy,
# so no explicit delete is needed.
with open(os.path.join(coswara_data, 'combined_data.csv'), "r") as source_csv:
    text = source_csv.read().replace("True", "IsTrue")
with open(os.path.join(coswara_data, 'combined_data_upd.csv'), "w") as x:
    x.write(text)

# Filter out records:
#   Covid Status = positive or negative
#   Cough = True
coswara_pd = pd.read_csv(os.path.join(coswara_data, 'combined_data_upd.csv'), header=0)
# Keep the two columns we need and drop nans; .copy() gives an independent frame
# so the column assignments below never write into a slice of the raw csv
coswara_pd = coswara_pd[['id', 'covid_status']].dropna().copy()

# Map participant id -> path of its cough-shallow.wav. The id is the folder
# directly above the wav file. One dictionary lookup per row replaces a full
# DataFrame scan per file.
shallow_paths = {
    filename.split('/')[-2]: filename
    for filename in glob.glob(extracted_data_dir + '/*/*/cough-shallow.wav')
}
# Rows without a recording keep "" in both columns
coswara_pd['file_path'] = coswara_pd['id'].map(lambda uid: shallow_paths.get(uid, ""))
# adding a source col
coswara_pd['source'] = coswara_pd['file_path'].map(lambda path: "coswara" if path else "")

# Collapse the status labels: every positive variant becomes "positive".
# At this point, we've identified the positive cases, everything else is a form of negative, for this
# set i am using healthy only. There are some others like recovered etc but small subset only.
status_map = {
    "COVID-19": "positive",
    "positive_mild": "positive",
    "positive_moderate": "positive",
    "positive_asymp": "positive",
    "healthy": "negative",
}
coswara_pd['covid_status'] = coswara_pd['covid_status'].replace(status_map)
coswara_pd = coswara_pd[coswara_pd.covid_status.isin(['negative', 'positive'])]
# standardize columns
coswara_pd = coswara_pd.rename(columns={"id": "uuid", "covid_status": "covid"})

# include heavy coughs
# copy the df over, replace the word for the filepath, join the two df's together
heavy = coswara_pd.copy()
# Assign the result back instead of calling replace(inplace=True) on the column:
# the chained in-place form does not update the frame under pandas Copy-on-Write
heavy["file_path"] = heavy["file_path"].str.replace("shallow", "heavy", regex=False)
# Keep only the heavy rows whose recording actually exists, otherwise the later
# file copy fails on the first missing cough-heavy.wav
heavy = heavy[heavy["file_path"].map(os.path.isfile).astype(bool)]
coswara_pd = pd.concat([coswara_pd, heavy])

# Export the stripped coswara table, replacing any previous export
coswara_csv = os.path.join(coswara_data, "tyhac_coswara_stripped.csv")
if os.path.isfile(coswara_csv):
    os.remove(coswara_csv)
coswara_pd.to_csv(coswara_csv, index=None)
# coswara_pd

print("coswara extraction process completed!")

#
# Coughvid - zenodo
#
# Note: Crowdsourced dataset with covid status, contains ogg, webm, csv, json
# Size: ~1.3GB

# Let's check in case we've already done this
if not os.path.exists(coughvid_data + '/public_dataset'):
    print("no coughvid dir")
    !wget -O "{coughvid_data}/public_dataset.zip" "https://zenodo.org/record/4498364/files/public_dataset.zip?download=1"
    # extract the dataset
    !unzip -q "{coughvid_data}/public_dataset.zip" -d {coughvid_data}
else:
    print("found exisiting coughvid dataset")
    
extracted_data_dir = os.path.join(coughvid_data, 'public_dataset')

#
# Coughvid - processing
# Reformat for the bits we want
#
# Filter out records:
#  Covid Status = positive or negative
#  Cough = True
coughvid_pd = pd.read_csv(os.path.join(extracted_data_dir, 'metadata_compiled.csv'), header=0)
# Filter the set for confident cough detections and the columns we want, then
# strip the nans; .copy() so later assignments act on an independent frame
coughvid_pd = coughvid_pd.loc[coughvid_pd['cough_detected'] >= 0.9, ['uuid', 'status']].dropna().copy()

# Healthy or COVID-19: every positive variant becomes "positive", "healthy"
# becomes "negative", and anything else is filtered out
status_map = {
    "COVID-19": "positive",
    "positive_mild": "positive",
    "positive_moderate": "positive",
    "positive_asymp": "positive",
    "healthy": "negative",
}
coughvid_pd['status'] = coughvid_pd['status'].replace(status_map)
coughvid_pd = coughvid_pd[coughvid_pd.status.isin(['negative', 'positive'])].copy()

# Map uuid -> audio file. os.path.splitext copes with entries that have no
# extension, which made split('.')[1] raise IndexError. One dictionary lookup
# per row replaces a full DataFrame scan per file.
audio_paths = {}
for filename in glob.glob(extracted_data_dir + '/*'):
    file_uid, file_ext = os.path.splitext(os.path.basename(filename))
    if file_ext in ('.ogg', '.webm'):
        audio_paths[file_uid] = filename

# Rows without a recording keep "" in both columns
coughvid_pd['file_path'] = coughvid_pd['uuid'].map(lambda uid: audio_paths.get(uid, ""))
coughvid_pd['source'] = coughvid_pd['file_path'].map(lambda path: 'coughvid' if path else "")

# standardize columns
coughvid_pd = coughvid_pd.rename(columns={"status": "covid"})

# Export the stripped coughvid table, replacing any previous export
coughvid_csv = os.path.join(coughvid_data, "tyhac_coughvid_stripped.csv")
if os.path.isfile(coughvid_csv):
    os.remove(coughvid_csv)
coughvid_pd.to_csv(coughvid_csv, index=None)

print("coughvid extraction process completed!")

#
# Merge the datasets to a single csv
#
combined = pd.concat([coswara_pd, coughvid_pd])
# Rows whose recording was never found carry file_path == "". An empty string is
# not NaN, so dropna() alone keeps them, and they come back as NaN once the csv
# is read again. Drop them explicitly here together with any real NaN.
combined = combined[combined["file_path"] != ""].dropna()
# Note: temporary to limit post processing during testing
# combined = combined.head(150)
combined_csv_path = os.path.join(combined_data, "tyhac_combined.csv")
if os.path.isfile(combined_csv_path):
    os.remove(combined_csv_path)
combined.to_csv(combined_csv_path, index=None)

print("download, extraction, csv completed")

In [None]:
# 
# Add in cough heavy and shallow, do some file conversion to wav format
#
import numpy as np
import pandas as pd
import os
import subprocess
from pathlib import Path
from shutil import copyfile
import torchaudio

# Locations of the combined dataset: csv index plus the folder receiving the audio
combined_data = 'rawdata/combined'
combined_audio_data = 'rawdata/combined/audio'
combined_csv = os.path.join(combined_data, "tyhac_combined.csv")

# Rows with any missing value are discarded right after loading
combined_pd = pd.read_csv(combined_csv).dropna()

# Some of the files are named cough-*.wav, we want to name them the uuid instead
def find_filename(file_path):
    """Return the file name a recording gets inside the combined audio folder.

    Paths containing 'cough-heavy' or 'cough-shallow' are renamed to
    '<parent folder>-cough-heavy.<ext>' / '<parent folder>-cough-shallow.<ext>',
    because one parent folder (uuid) can hold both a heavy and a shallow cough.
    Any other path keeps its own file name.
    """
    segments = file_path.split('/')
    ext = file_path.split('.')[-1]
    # 'cough-heavy' is tested first, mirroring the precedence of the two checks
    for cough_kind in ('cough-heavy', 'cough-shallow'):
        if cough_kind in file_path:
            return segments[-2] + '-' + cough_kind + '.' + ext
    # just send back the filename
    return segments[-1]

# Copy file from source data set into combined folder structure
def copy_file():
    """Copy every recording listed in combined_pd['file_path'] into combined_audio_data.

    The destination file name is chosen by find_filename().
    """
    for source_path in combined_pd['file_path']:
        destination = os.path.join(combined_audio_data, find_filename(source_path))
        copyfile(source_path, destination)

def convert_files(source, dest):
    """Convert the .webm / .ogg recordings of every uuid in combined_pd to .wav with ffmpeg.

    source: path prefix (including the trailing separator) under which
            '<uuid>.webm' or '<uuid>.ogg' is looked up
    dest:   path prefix (including the trailing separator) under which
            '<uuid>.wav' is written

    The source recording is deleted only when a usable wav exists afterwards,
    i.e. ffmpeg succeeded or the wav was already there ("-n" never overwrites).
    A failed conversion keeps the source and removes any partial output.
    """
    names_to_convert = combined_pd.uuid.to_numpy()
    for counter, name in enumerate(names_to_convert):
        if counter % 1000 == 0:
            print("Finished {0}/{1}".format(counter, len(names_to_convert)))
        wav_path = dest + name + ".wav"
        # .webm takes precedence over .ogg; only the first one found is handled
        for ext in ('.webm', '.ogg'):
            source_path = source + name + ext
            if not os.path.isfile(source_path):
                continue
            already_converted = os.path.isfile(wav_path)
            status = subprocess.call(
                ["ffmpeg", "-nostats", "-loglevel", "error", "-n", "-i", source_path, wav_path])
            if status == 0 or already_converted:
                os.remove(source_path)
            else:
                print("ffmpeg failed (exit status {0}), keeping {1}".format(status, source_path))
                if os.path.isfile(wav_path):
                    os.remove(wav_path)
            break

def resample(source, dest):
    """Resample, in place, every file matching source + '*.*' whose rate is not 48000.

    source: path prefix (including the trailing separator) of the files to check
    dest:   unused; kept so existing calls keep working

    The majority of the dataset is 48000, so everything else is resampled to
    48000 with sox into a temporary file. The original is replaced only when
    sox succeeded, so a failed run no longer deletes the recording.
    """
    for fn in glob.glob(source + '*.*'):
        # os.path.splitext only strips the final extension, so dots elsewhere
        # in the path (e.g. './rawdata/') no longer corrupt the temp name
        filetemp = os.path.splitext(fn)[0] + '_temp.wav'

        y, sr = torchaudio.load(fn)  # access sample sr etc
        if sr != 48000:
            print("resample: ", fn)
            status = subprocess.call(["sox", fn, "-r", "48000", filetemp])
            if status != 0 or not os.path.isfile(filetemp):
                print("resample failed, keeping original: ", fn)
                if os.path.isfile(filetemp):
                    os.remove(filetemp)
                continue
            # swap the resampled file in under the original name
            os.replace(filetemp, fn)

# The files have a new location, need to update the sheet
def update_csv(combined_csv, audio_dir=None):
    """Point the file_path column of the combined csv at the copied .wav files.

    combined_csv: path of the csv to read and rewrite in place
    audio_dir:    folder holding the .wav files; defaults to combined_audio_data

    Files named '<uid>-cough-<kind>.wav' are matched to the row whose current
    file_path contains '<uid>/cough-<kind>.wav'; every other file is matched by
    its name (minus extension) against the uuid column. Rows with missing values
    are dropped before matching. Files matching no row are ignored, so calling
    this again on an already updated csv changes nothing.
    """
    if audio_dir is None:
        audio_dir = combined_audio_data

    # Reset
    combined_pd = pd.read_csv(combined_csv).dropna()
    for filename in glob.glob(audio_dir + '/*.wav'):
        basename = filename.split('/')[-1]
        # handle our special naming and grab uuid
        file_uid, marker, cough_file = basename.partition('-cough-')
        if marker:
            # uid/cough-shallow.wav, find by path. A boolean mask handles zero
            # or several matching rows; comparing two index arrays of different
            # lengths does not. regex=False keeps '.' literal.
            old_file = file_uid + '/cough-' + cough_file
            mask = combined_pd.file_path.str.contains(old_file, regex=False)
        else:
            # this will be .wav; simple unique lookup
            mask = combined_pd.uuid == os.path.splitext(basename)[0]
        combined_pd.loc[mask, "file_path"] = filename

    # Write once, after all rows are updated, instead of once per file
    combined_pd.to_csv(combined_csv, index=None)
    print("csv combine updated")
    
# Run the steps in order: copy, convert to wav, resample, refresh the csv paths.
# Source and destination are the same folder for the convert and resample steps.
audio_folder = combined_audio_data + '/'
copy_file()
convert_files(audio_folder, audio_folder)
resample(audio_folder, audio_folder)
update_csv(combined_csv)

In [None]:
# 
# Remove zero length files. I found that during initial training there were files with an incorrect shape. I verified this by
# also trying to play back the audio, which failed. This simply identifies these records and cuts them from the list.
#
import pandas as pd
dataset_csv = pd.read_csv('rawdata/combined/tyhac_combined.csv')

# Collect the paths of zero-length recordings in one pass and filter once at
# the end, instead of re-querying and re-slicing the frame for every row
empty_paths = set()
for index, row in dataset_csv.iterrows():
    path = row['file_path']
    if path in empty_paths:
        continue  # another row of a recording that is already flagged
    try:
        y, sr = torchaudio.load(path)
    except Exception as err:
        # Identified during shallow / heavy split, non issue now.
        # The record is only reported and stays in the list. Catching Exception
        # rather than everything lets Ctrl-C still interrupt the loop.
        print("null record error: ", row['uuid'], " -> ", err)
        continue
    # Remove the 0 length files
    if y.shape[1] == 0:
        print("wav error: ", index, " uuid: ", row['uuid'])
        empty_paths.add(path)

files = dataset_csv[~dataset_csv.file_path.isin(empty_paths)]

# Write out the clean file so we don't have to keep doing this
# (to_csv overwrites an existing file, so no separate delete is needed)
files.to_csv('rawdata/combined/tyhac_combined_cleaned.csv', index=None)

In [None]:
# 
# Upload to s3
#
# Bucket: the session's default bucket (swap in a custom bucket name here if preferred)
rawbucket = sess.default_bucket()

# Every object written by this workshop lives under this prefix
prefix = 'sagemaker-covid-tyhac-fastai'

# Both the raw and the training data currently share the same key prefix
dataprefix = f"{prefix}/data"
traindataprefix = f"{prefix}/data"

In [None]:
# Upload all the files
# Upload the raw dataset
raw_data_location = sess.upload_data(
    path='rawdata/combined',
    bucket=rawbucket,
    key_prefix=dataprefix,
)
print(raw_data_location)

In [None]:
# This part is just data viz
import pathlib

from fastai.vision.all import *
from fastaudio.core.all import *
from fastaudio.augment.all import *
from fastaudio.ci import skip_if_ci
from IPython.display import Audio
import librosa
import numpy as np
import pandas as pd
import torchaudio

# configuration for audio processing
n_fft, hop_length = 2048, 512  # n_fft: 20*n_mels (128 default)
target_rate, n_mels = 48000, 128

# Setup fastaudio
cfg = AudioConfig.BasicMelSpectrogram(
    n_fft=n_fft,
    hop_length=hop_length,
    sample_rate=target_rate,
    n_mels=n_mels,
)
a2s = AudioToSpec.from_cfg(cfg)
print(f"Fastaudio hyperparameters: {cfg}")

# dataset
dataset = pd.read_csv('rawdata/combined/tyhac_combined_cleaned.csv')

# match the pipeline for training
crop1s = ResizeSignal(6000)  # milliseconds
pipe = Pipeline([AudioTensor.create, crop1s, a2s])

def stft_and_display(signal, n_fft=512, hop_length=128, to_db_scale=False, n_mels=128, mel_scale=False,
                     top_db=80, show_shape=False, sr=22050):
    """Plot the magnitude spectrogram of `signal`, optionally mel-scaled and/or in dB.

    signal:      audio samples passed to librosa.stft
    n_fft:       FFT window size for the STFT
    hop_length:  hop between STFT frames
    to_db_scale: convert the magnitudes to dB before plotting
    n_mels:      number of mel bands when mel_scale is set
    mel_scale:   project the spectrogram onto a mel filter bank
    top_db:      dynamic-range threshold used for the dB conversion
    show_shape:  print the shape of the array being plotted
    sr:          sample rate used to build the mel filter bank (22050 is
                 librosa's own default, so existing calls behave as before)
    """
    # n_fft / hop_length are passed by keyword: current librosa releases accept
    # only the signal positionally
    stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    real_portion = abs(stft)
    if mel_scale:
        real_portion = librosa.feature.melspectrogram(S=real_portion, sr=sr, n_fft=n_fft, n_mels=n_mels)
    if to_db_scale:
        # top_db must be a keyword: the second positional parameter of
        # amplitude_to_db is `ref`, so the old call changed the reference
        # amplitude instead of the dynamic range
        real_portion = librosa.amplitude_to_db(real_portion, top_db=top_db)
    if show_shape:
        print("Shape: {}x{}".format(*real_portion.shape))
    plt.imshow(real_portion)
    plt.show()
    
# This will show the actuals, e.g. what the algo will be learning from
spectrogram_options = dict(
    n_fft=n_fft,
    hop_length=hop_length,
    n_mels=n_mels,
    mel_scale=True,
    to_db_scale=True,
    show_shape=True,
)
# First three recordings of the dataset, loaded at the target rate
for fn in dataset.file_path.values[:3]:
    y, sr = librosa.load(fn, sr=target_rate)
    stft_and_display(y, **spectrogram_options)

In [None]:
# Show some samples
sample_paths = dataset.file_path.values[:3]
for fn in sample_paths:
    # y, sr = torchaudio.load(fn) # access sample sr etc
    # First the audio as loaded, then the output of the training pipeline
    audio = AudioTensor.create(fn)
    audio.show()
    pipe(fn).show()

In [None]:
# Stratified train/test split on the covid label, followed by the class counts
test_size = 0.2
labels = dataset.covid.values
splitter = TrainTestSplitter(
    test_size=test_size,
    random_state=42,
    stratify=labels,
)
Counter(labels)

In [None]:
# Audio in, category out; paths and labels are read from the dataframe columns
item_transforms = [ResizeSignal(6000), a2s]
db = DataBlock(
    blocks=(AudioBlock, CategoryBlock),
    get_x=ColReader('file_path'),
    get_y=ColReader('covid'),
    splitter=splitter,
    item_tfms=item_transforms,
)

dsets = db.datasets(dataset)

In [None]:
# Imbalance
count = Counter(labels)

# One weight per training item: the inverse of how often its class occurs.
# Items are (AudioTensor, CategoryTensor) pairs; only the category is needed.
wgts = [1 / count[dsets.vocab[category]] for _, category in dsets.train]
wgts[:10]

In [None]:
# Default bs = 64, won't work on small set, add bs=x.n
# num_workers greater than 1 had issues on local book
dls = db.dataloaders(
    dataset,
    dl_type=WeightedDL,
    wgts=wgts,
    num_workers=1,
)

In [None]:
# Checking the balance
x, y = dls.one_batch()
label_mean = sum(y) / len(y)
label_mean
## ~50% => we are fine here

In [None]:
# Grid of 2 rows x 3 columns from one batch
dls.show_batch(
    nrows=2,
    ncols=3,
    figsize=(20, 10),
)