In [None]:
import os
import pickle
import shutil
import subprocess
import sys

import librosa as lb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy.io.wavfile import write
from tqdm import tqdm

# Project-local helpers live in ./script, so it must be on sys.path before importing them.
sys.path.append(os.path.abspath('script'))
from feature_class import features
from DSP import classify_cough

In [None]:
# Output directory for the per-dataset "normalized" CSV files.
# makedirs(exist_ok=True) also creates a missing parent and is safe to re-run.
os.makedirs('csv_files/normalized_data/', exist_ok=True)

## Converting webm files to wav files

In [None]:
# Target directory for the WAV files converted from the Coughvid .webm recordings.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs('dataset/Coughvid-Data/wav', exist_ok=True)

In [None]:
# Merged metadata table; the cells below read its 'DataSet', 'DIR' and 'STATUS' columns.
# Despite the name, it holds rows for every dataset and is filtered by 'DataSet' later.
coughvid = pd.read_csv('csv_files/Merge_dataset.csv')

In [None]:
# Convert every Coughvid .webm recording listed in the merged metadata into a
# 32-bit float PCM WAV file under dataset/Coughvid-Data/wav.
for index, row in tqdm(coughvid.iterrows(), total=len(coughvid)):
    if row['DataSet'] != 'coughvid':
        continue
    paths = row['DIR']
    if not paths.endswith(".webm"):
        continue
    # File stem = text before the first dot of the last path component.
    fn = paths.split('/')[-1].split('.')[0]
    loc = os.path.join('dataset/Coughvid-Data/wav', fn + '.wav')
    if os.path.exists(loc):
        # Already converted on an earlier run; skipping keeps re-runs cheap.
        continue
    # Pass an argument list instead of a shell string so that paths containing
    # spaces or shell metacharacters reach ffmpeg unchanged. -nostdin stops
    # ffmpeg from waiting on interactive input inside the notebook kernel.
    result = subprocess.run(['ffmpeg', '-nostdin', '-i', paths, '-c:a', 'pcm_f32le', loc])
    if result.returncode != 0:
        print(f'ffmpeg failed for {paths} (exit code {result.returncode})')
        # Drop a partially written output so the next run retries this file.
        if os.path.exists(loc):
            os.remove(loc)

## Normalizing Coughvid Wav Files

In [None]:
# Target directory for the normalised, 16 kHz Coughvid WAV files.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs('dataset/Coughvid-Data/wav_normalized', exist_ok=True)

In [None]:
# Normalise every converted Coughvid WAV with librosa's default normalisation,
# resample it to 16 kHz and write it under wav_normalized/ with the same name.
for i in os.listdir('dataset/Coughvid-Data/wav'):
    path = os.path.join('dataset/Coughvid-Data/wav', i)
    try:
        # sr=None keeps the file's native sampling rate.
        y, sr = lb.load(path, sr=None)
        y_norm = lb.util.normalize(y)
        # The rates are keyword-only in librosa >= 0.10; a positional call
        # raises TypeError there, which previously made every file be skipped.
        y_re = lb.resample(y_norm, orig_sr=sr, target_sr=16000)
        loc = os.path.join('dataset/Coughvid-Data/wav_normalized', i)
        write(loc, 16000, y_re)
    except Exception as exc:
        # Best effort: report the unreadable file and keep going. Catching
        # Exception (not a bare except) leaves Ctrl-C / kernel interrupt working.
        print(f'Skipping {path}: {exc!r}')

## Creating Coughvid Normalized Data CSV

In [None]:
# Build the (filename, label) table for every Coughvid recording whose
# normalised WAV exists on disk, shuffle it and save it as a CSV.
records = []
for index, row in coughvid.iterrows():
    if row['DataSet'] != 'coughvid':
        continue
    paths = row['DIR']
    # File stem = text before the first dot of the last path component.
    fn = paths.split('/')[-1].split('.')[0]
    loc = os.path.join('dataset/Coughvid-Data/wav_normalized', fn + '.wav')
    if os.path.exists(loc):
        records.append({'filename': fn + '.wav', 'label': row['STATUS']})

# One DataFrame built from the records replaces concatenating a one-row or
# empty frame per metadata row; it also yields an empty, correctly-labelled
# table when nothing matches.
VidData = pd.DataFrame(records, columns=['filename', 'label'])
# Fixed seed so the shuffled order is reproducible across runs.
VidData = VidData.sample(frac=1, random_state=42).reset_index(drop=True)
VidData.to_csv('csv_files/normalized_data/Coughvid_dataset.csv', index=False)
VidData.head()

## Normalizing Coswara Wav Files

In [None]:
# Target directory for the normalised, 16 kHz Coswara WAV files.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs('dataset/Coswara-Data/wav_normalized', exist_ok=True)

In [None]:
# Normalise and resample (16 kHz) the Coswara 'cough-shallow' and 'cough-heavy'
# recordings listed in the merged metadata. Each output file is named after the
# recording's parent directory, with a '_heavy' suffix for heavy coughs.
coswara = pd.read_csv('csv_files/Merge_dataset.csv')
# Recording stem -> suffix appended to the parent directory name.
suffix_by_recording = {'cough-shallow': '', 'cough-heavy': '_heavy'}
for index, row in coswara.iterrows():
    if row['DataSet'] != 'coswara':
        continue
    paths = row['DIR']
    name = paths.split('/')[-1].split('.')[0]
    if name not in suffix_by_recording:
        continue
    fn = paths.split('/')[-2]
    loc = os.path.join('dataset/Coswara-Data/wav_normalized',
                       fn + suffix_by_recording[name] + '.wav')
    try:
        # sr=None keeps the file's native sampling rate.
        y, sr = lb.load(paths, sr=None)
        y_norm = lb.util.normalize(y)
        # The rates are keyword-only in librosa >= 0.10; a positional call
        # raises TypeError there, which previously made every file be skipped.
        y_re = lb.resample(y_norm, orig_sr=sr, target_sr=16000)
        write(loc, 16000, y_re)
    except Exception as exc:
        # Best effort: report the unreadable file and keep going. Catching
        # Exception (not a bare except) leaves Ctrl-C / kernel interrupt working.
        print(f'Skipping {paths}: {exc!r}')

## Create Coswara Normalized Data CSV

In [None]:
# Re-read the merged metadata. The name `coughvid` is reused here even though
# the next cell only selects rows whose 'DataSet' is 'coswara'.
coughvid = pd.read_csv('csv_files/Merge_dataset.csv')

In [None]:
# Build the (filename, label) table for every Coswara cough recording whose
# normalised WAV exists on disk, shuffle it and save it as a CSV.
# Recording stem -> suffix appended to the parent directory name.
suffix_by_recording = {'cough-shallow': '', 'cough-heavy': '_heavy'}
records = []
for index, row in coughvid.iterrows():  # `coughvid` holds the merged metadata here
    if row['DataSet'] != 'coswara':
        continue
    paths = row['DIR']
    name = paths.split('/')[-1].split('.')[0]
    if name not in suffix_by_recording:
        continue
    wav_name = paths.split('/')[-2] + suffix_by_recording[name] + '.wav'
    loc = os.path.join('dataset/Coswara-Data/wav_normalized', wav_name)
    if os.path.exists(loc):
        records.append({'filename': wav_name, 'label': row['STATUS']})

# One DataFrame built from the records replaces concatenating a one-row or
# empty frame per metadata row; it also yields an empty, correctly-labelled
# table when nothing matches.
CosData = pd.DataFrame(records, columns=['filename', 'label'])
# Fixed seed so the shuffled order is reproducible across runs.
CosData = CosData.sample(frac=1, random_state=42).reset_index(drop=True)
CosData.to_csv('csv_files/normalized_data/Coswara_dataset.csv', index=False)
CosData.head()

## Normalizing Compare Wav Files

In [None]:
# Target directory for the normalised, 16 kHz Compare WAV files.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs('dataset/Compare-Data/dist/wav_normalized', exist_ok=True)

In [None]:
# Normalise and resample (16 kHz) every Compare recording listed in the merged
# metadata and write it under dist/wav_normalized/ using the original file stem.
compare = pd.read_csv('csv_files/Merge_dataset.csv')
for index, row in compare.iterrows():
    if row['DataSet'] != 'compare':
        continue
    paths = row['DIR']
    # File stem = text before the first dot of the last path component.
    fn = paths.split('/')[-1].split('.')[0]
    try:
        # sr=None keeps the file's native sampling rate.
        y, sr = lb.load(paths, sr=None)
        y_norm = lb.util.normalize(y)
        # The rates are keyword-only in librosa >= 0.10; a positional call
        # raises TypeError there, which previously made every file be skipped.
        y_re = lb.resample(y_norm, orig_sr=sr, target_sr=16000)
        loc = os.path.join('dataset/Compare-Data/dist/wav_normalized', fn + '.wav')
        write(loc, 16000, y_re)
    except Exception as exc:
        # Best effort: report the unreadable file and keep going. Catching
        # Exception (not a bare except) leaves Ctrl-C / kernel interrupt working.
        print(f'Skipping {paths}: {exc!r}')

## Create Compare Normalized Data CSV

In [None]:
# Build the (filename, label) table for every Compare recording whose
# normalised WAV exists on disk, shuffle it and save it as a CSV.
records = []
for index, row in compare.iterrows():
    if row['DataSet'] != 'compare':
        continue
    paths = row['DIR']
    # File stem = text before the first dot of the last path component.
    fn = paths.split('/')[-1].split('.')[0]
    loc = os.path.join('dataset/Compare-Data/dist/wav_normalized', fn + '.wav')
    if os.path.exists(loc):
        records.append({'filename': fn + '.wav', 'label': row['STATUS']})

# One DataFrame built from the records replaces concatenating a one-row or
# empty frame per metadata row; it also yields an empty, correctly-labelled
# table when nothing matches.
CompData = pd.DataFrame(records, columns=['filename', 'label'])
# Fixed seed so the shuffled order is reproducible across runs.
CompData = CompData.sample(frac=1, random_state=42).reset_index(drop=True)
CompData.to_csv('csv_files/normalized_data/Compare_dataset.csv', index=False)
CompData.head()

## Cough Filtering 

In [None]:
# Load the pickled model and scaler that are handed to classify_cough below.
# `with` closes each file handle as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(os.path.join('models', 'cough_classifier'), 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(os.path.join('models','cough_classification_scaler'), 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [None]:
# Copy every normalised Compare recording that the classifier scores as a
# cough (probability >= 80 %) into the dataset's Cough/ folder.
folder = 'dataset/Compare-Data/Cough'
data = 'dataset/Compare-Data/dist/wav_normalized'
min_cough_pct = 80  # minimum cough probability, in percent, for a file to be kept

# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs(folder, exist_ok=True)

for file in os.listdir(data):
    if not file.lower().endswith('.wav'):
        # Skip stray non-audio entries (hidden files, checkpoint folders).
        continue
    filename = os.path.join(data, file)
    fs, x = wavfile.read(filename)
    probability = classify_cough(x, fs, loaded_model, loaded_scaler)
    value = round(probability*100, 2)
    if value >= min_cough_pct:
        loc = os.path.join(folder, file)
        shutil.copy(filename, loc)
    # Plain '%': the former "\%" is an invalid escape that printed a literal backslash.
    print("The file {0} has a {1}% probability of being a cough".format(file, value))

In [None]:
# Copy every normalised Coughvid recording that the classifier scores as a
# cough (probability >= 80 %) into the dataset's Cough/ folder.
folder = 'dataset/Coughvid-Data/Cough'
data = 'dataset/Coughvid-Data/wav_normalized'
min_cough_pct = 80  # minimum cough probability, in percent, for a file to be kept

# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs(folder, exist_ok=True)

# List `data` itself rather than repeating the path literal, so the listing
# and the join below can never point at different directories.
for file in os.listdir(data):
    if not file.lower().endswith('.wav'):
        # Skip stray non-audio entries (hidden files, checkpoint folders).
        continue
    filename = os.path.join(data, file)
    fs, x = wavfile.read(filename)
    probability = classify_cough(x, fs, loaded_model, loaded_scaler)
    value = round(probability*100, 2)
    if value >= min_cough_pct:
        loc = os.path.join(folder, file)
        shutil.copy(filename, loc)
    # Plain '%': the former "\%" is an invalid escape that printed a literal backslash.
    print("The file {0} has a {1}% probability of being a cough".format(file, value))

In [None]:
# Copy every normalised Coswara recording that the classifier scores as a
# cough (probability >= 80 %) into the dataset's Cough/ folder.
folder = 'dataset/Coswara-Data/Cough'
data = 'dataset/Coswara-Data/wav_normalized'
min_cough_pct = 80  # minimum cough probability, in percent, for a file to be kept

# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs(folder, exist_ok=True)

# List `data` itself rather than repeating the path literal, so the listing
# and the join below can never point at different directories.
for file in os.listdir(data):
    if not file.lower().endswith('.wav'):
        # Skip stray non-audio entries (hidden files, checkpoint folders).
        continue
    filename = os.path.join(data, file)
    fs, x = wavfile.read(filename)
    probability = classify_cough(x, fs, loaded_model, loaded_scaler)
    value = round(probability*100, 2)
    if value >= min_cough_pct:
        loc = os.path.join(folder, file)
        shutil.copy(filename, loc)
    # Plain '%': the former "\%" is an invalid escape that printed a literal backslash.
    print("The file {0} has a {1}% probability of being a cough".format(file, value))

## Create CSV

In [None]:
# Output directory for the CSVs that list only cough-filtered recordings.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs('csv_files/cough', exist_ok=True)

In [None]:
# Keep only the Coswara rows whose file survived cough filtering (i.e. exists
# in the Cough/ folder), shuffle them and save the result.
df_coswara = pd.read_csv('csv_files/normalized_data/Coswara_dataset.csv')
dataset = 'dataset/Coswara-Data/Cough'

# A boolean mask replaces the per-row DataFrame + concat, which raised
# ValueError ("No objects to concatenate") when no file passed the filter.
is_cough = [os.path.exists(os.path.join(dataset, name)) for name in df_coswara['filename']]
df_coswara = df_coswara.loc[is_cough, ['filename', 'label']]
# Fixed seed so the shuffled order is reproducible across runs.
df_coswara = df_coswara.sample(frac=1, random_state=42).reset_index(drop=True)
df_coswara.to_csv('csv_files/cough/Coswara_dataset.csv', index=False)
df_coswara.head()

In [None]:
# Keep only the Coswara rows labelled 'positive' whose file survived cough
# filtering (i.e. exists in the Cough/ folder), shuffle them and save the result.
df_coswara = pd.read_csv('csv_files/normalized_data/Coswara_dataset.csv')
dataset = 'dataset/Coswara-Data/Cough'

# A boolean mask replaces the per-row DataFrame + concat, which raised
# ValueError ("No objects to concatenate") when no row qualified.
keep = [
    bool(label == 'positive') and os.path.exists(os.path.join(dataset, name))
    for name, label in zip(df_coswara['filename'], df_coswara['label'])
]
df_coswara = df_coswara.loc[keep, ['filename', 'label']]
# Fixed seed so the shuffled order, and the positional devel/train split
# taken from this CSV later, is reproducible across runs.
df_coswara = df_coswara.sample(frac=1, random_state=42).reset_index(drop=True)
df_coswara.to_csv('csv_files/cough/Coswara_positive_dataset.csv', index=False)
df_coswara.head()

In [None]:
# Keep only the Coughvid rows whose file survived cough filtering (i.e. exists
# in the Cough/ folder), shuffle them and save the result.
df_coughvid = pd.read_csv('csv_files/normalized_data/Coughvid_dataset.csv')
dataset = 'dataset/Coughvid-Data/Cough'

# A boolean mask replaces the per-row DataFrame + concat, which raised
# ValueError ("No objects to concatenate") when no file passed the filter.
is_cough = [os.path.exists(os.path.join(dataset, name)) for name in df_coughvid['filename']]
df_coughvid = df_coughvid.loc[is_cough, ['filename', 'label']]
# Fixed seed so the shuffled order is reproducible across runs.
df_coughvid = df_coughvid.sample(frac=1, random_state=42).reset_index(drop=True)
df_coughvid.to_csv('csv_files/cough/Coughvid_dataset.csv', index=False)
df_coughvid.head()

In [None]:
# Keep only the Coughvid rows labelled 'positive' whose file survived cough
# filtering (i.e. exists in the Cough/ folder), shuffle them and save the result.
df_coughvid = pd.read_csv('csv_files/normalized_data/Coughvid_dataset.csv')
dataset = 'dataset/Coughvid-Data/Cough'

# A boolean mask replaces the per-row DataFrame + concat, which raised
# ValueError ("No objects to concatenate") when no row qualified.
keep = [
    bool(label == 'positive') and os.path.exists(os.path.join(dataset, name))
    for name, label in zip(df_coughvid['filename'], df_coughvid['label'])
]
df_coughvid = df_coughvid.loc[keep, ['filename', 'label']]
# Fixed seed so the shuffled order, and the positional devel/train split
# taken from this CSV later, is reproducible across runs.
df_coughvid = df_coughvid.sample(frac=1, random_state=42).reset_index(drop=True)
df_coughvid.to_csv('csv_files/cough/Coughvid_positive_dataset.csv', index=False)
df_coughvid.head()

In [None]:
# Keep only the Compare rows whose file survived cough filtering (i.e. exists
# in the Cough/ folder), shuffle them and save the result.
df_compare = pd.read_csv('csv_files/normalized_data/Compare_dataset.csv')
dataset = 'dataset/Compare-Data/Cough'

# A boolean mask replaces the per-row DataFrame + concat, which raised
# ValueError ("No objects to concatenate") when no file passed the filter.
is_cough = [os.path.exists(os.path.join(dataset, name)) for name in df_compare['filename']]
df_compare = df_compare.loc[is_cough, ['filename', 'label']]
# Fixed seed so the shuffled order is reproducible across runs.
df_compare = df_compare.sample(frac=1, random_state=42).reset_index(drop=True)
df_compare.to_csv('csv_files/cough/Compare_dataset.csv', index=False)
df_compare.head()

## Merge Normalized Data

In [None]:
# Single folder that collects the cough-filtered files of all three datasets.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs('dataset/Merge-Data', exist_ok=True)

In [None]:
# Copy each cough-filtered Coughvid file into the merged folder under the same name.
coughvid_cough_dir = 'dataset/Coughvid-Data/Cough'
for wav_name in os.listdir(coughvid_cough_dir):
    shutil.copyfile(
        os.path.join(coughvid_cough_dir, wav_name),
        os.path.join('dataset/Merge-Data', wav_name),
    )

In [None]:
# Copy each cough-filtered Coswara file into the merged folder under the same name.
coswara_cough_dir = 'dataset/Coswara-Data/Cough'
for wav_name in os.listdir(coswara_cough_dir):
    shutil.copyfile(
        os.path.join(coswara_cough_dir, wav_name),
        os.path.join('dataset/Merge-Data', wav_name),
    )

In [None]:
# Copy each cough-filtered Compare file into the merged folder under the same name.
compare_cough_dir = 'dataset/Compare-Data/Cough'
for wav_name in os.listdir(compare_cough_dir):
    shutil.copyfile(
        os.path.join(compare_cough_dir, wav_name),
        os.path.join('dataset/Merge-Data', wav_name),
    )

## Create CSV for Experiment

In [None]:
# Output directory for the train / devel / test CSVs used by the experiments.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs('csv_files/experiment_data', exist_ok=True)

In [None]:
# Inputs for the experiment splits, as written by the "Create CSV" cells above.
# Coughvid and Coswara use their positive-only CSVs; Compare uses the CSV with
# all labels. These assignments rebind `coughvid`, `coswara` and `compare`,
# which earlier cells used for the merged metadata table.
coughvid = pd.read_csv('csv_files/cough/Coughvid_positive_dataset.csv')
coswara = pd.read_csv('csv_files/cough/Coswara_positive_dataset.csv')
compare = pd.read_csv('csv_files/cough/Compare_dataset.csv')

In [None]:
# Fraction of rows that goes into the devel split of the Coughvid and Coswara
# data; the remaining rows form the train split. ("partisi" = "partition".)
partisi = 0.2

In [None]:
# Split the Coughvid rows by position: the first `partisi` fraction becomes the
# devel set and the rest the train set. The CSV was shuffled when it was written.
vid_rows = coughvid[['filename', 'label']]
num = int(partisi*len(vid_rows))

# Positional slicing replaces pd.concat over lists of one-row frames, which
# raised ValueError when a slice was empty (e.g. fewer than 1/partisi rows).
VidDevData = vid_rows.iloc[:num].reset_index(drop=True)
VidDevData.to_csv('csv_files/experiment_data/coughvid_devel.csv', index=False)

VidTrainData = vid_rows.iloc[num:].reset_index(drop=True)
VidTrainData.to_csv('csv_files/experiment_data/coughvid_train.csv', index=False)

In [None]:
# Split the Coswara rows by position: the first `partisi` fraction becomes the
# devel set and the rest the train set. The CSV was shuffled when it was written.
# NOTE(review): the normalisation step writes '<id>.wav' and '<id>_heavy.wav'
# from the same source folder, so a row-level split can place recordings from
# one folder in both sets - confirm whether a grouped split is needed.
cos_rows = coswara[['filename', 'label']]
num = int(partisi*len(cos_rows))

# Positional slicing replaces pd.concat over lists of one-row frames, which
# raised ValueError when a slice was empty (e.g. fewer than 1/partisi rows).
CosDevData = cos_rows.iloc[:num].reset_index(drop=True)
CosDevData.to_csv('csv_files/experiment_data/coswara_devel.csv', index=False)

CosTrainData = cos_rows.iloc[num:].reset_index(drop=True)
CosTrainData.to_csv('csv_files/experiment_data/coswara_train.csv', index=False)

In [None]:
# Select the Compare rows whose file name starts with the 'devel' prefix
# (text before the first '_' of the stem) and save them as the Compare devel set.
# NOTE(review): this cell used to iterate over `MergeData`, which is not defined
# anywhere in the notebook (NameError on a fresh kernel). `compare` is used
# instead, matching the 'train' cell below - confirm this is the intended source.
name_prefix = [fn.split('.')[0].split('_')[0] for fn in compare['filename']]
is_devel = [prefix == 'devel' for prefix in name_prefix]
# A boolean mask replaces concatenating one (mostly empty) frame per row.
ComDevData = compare.loc[is_devel, ['filename', 'label']].reset_index(drop=True)
ComDevData.to_csv('csv_files/experiment_data/compare_devel.csv', index=False)
ComDevData.head()

In [None]:
# Select the Compare rows whose file name starts with the 'test' prefix
# (text before the first '_' of the stem), shuffle them and save the test set.
# NOTE(review): this cell used to iterate over `MergeData`, which is not defined
# anywhere in the notebook (NameError on a fresh kernel). `compare` is used
# instead, matching the 'train' cell below - confirm this is the intended source.
name_prefix = [fn.split('.')[0].split('_')[0] for fn in compare['filename']]
is_test = [prefix == 'test' for prefix in name_prefix]
# A boolean mask replaces concatenating one (mostly empty) frame per row.
TestData = compare.loc[is_test, ['filename', 'label']]
# Fixed seed so the shuffled order is reproducible across runs.
TestData = TestData.sample(frac=1, random_state=42).reset_index(drop=True)
TestData.to_csv('csv_files/experiment_data/test.csv', index=False)
TestData.head()

In [None]:
# Select the Compare rows whose file name starts with the 'train' prefix
# (text before the first '_' of the stem) and save them as the Compare train set.
name_prefix = [fn.split('.')[0].split('_')[0] for fn in compare['filename']]
is_train = [prefix == 'train' for prefix in name_prefix]
# A boolean mask replaces concatenating one (mostly empty) frame per row, which
# relied on deprecated empty-frame concat and failed outright on an empty table.
ComTrainData = compare.loc[is_train, ['filename', 'label']].reset_index(drop=True)
ComTrainData.to_csv('csv_files/experiment_data/compare_train.csv', index=False)
ComTrainData.head()

In [None]:
# Combined devel set: Coswara, Coughvid and Compare devel rows stacked in that order.
devel_parts = [CosDevData, VidDevData, ComDevData]
DevData = pd.concat(devel_parts)
DevData.to_csv('csv_files/experiment_data/devel.csv', index=False)
DevData.head()

In [None]:
# Combined train set: Coswara, Coughvid and Compare train rows stacked in that order.
train_parts = [CosTrainData, VidTrainData, ComTrainData]
TrainData = pd.concat(train_parts)
TrainData.to_csv('csv_files/experiment_data/train.csv', index=False)
TrainData.head()