In [None]:
import glob
import os
import shutil
import subprocess

import librosa
import librosa.display
import numpy as np
import pandas as pd
from tqdm import tqdm

# 1-Prepare Data

## Create the Dataset Folder

In [None]:
# Create the working folder that holds every download. `exist_ok=True` keeps
# the cell safe to re-run and avoids a separate check-then-create step.
os.makedirs('dataset', exist_ok=True)

## Download Data

In [None]:
cd dataset/

### Download Coswara Dataset

In [None]:
# Clone the Coswara data repository into the current working directory.
# The cells below expect the resulting checkout at `Coswara-Data/`.
!git clone https://github.com/iiscleap/Coswara-Data.git

### Extracting Coswara Dataset

In [None]:
rt_dir = 'Coswara-Data/'
import glob
for each in os.listdir(rt_dir):
    if os.path.isdir(os.path.join(rt_dir,each)) and each != '.git':
        print(each)
        !cat {os.path.join(rt_dir, each,'')}*.tar.gz.* > {os.path.join(rt_dir, each,'')}combined_file.tar.gz
        !tar -xzf {os.path.join(rt_dir, each,'')}combined_file.tar.gz -C {rt_dir}
subset = glob.glob('Coswara-Data/*/*.tar.gz.*') + glob.glob('Coswara-Data/*/combined_data.tar.gz')
[os.remove(x) for x in subset];

### Download Coughvid Dataset

In [None]:
# Download the Coughvid public dataset archive from Zenodo into the current directory.
!wget https://zenodo.org/record/4048312/files/public_dataset.zip

### Extracting Coughvid Dataset

In [None]:
# Target folder for the unpacked Coughvid archive; `exist_ok=True` keeps the
# cell safe to re-run.
os.makedirs('Coughvid-Data', exist_ok=True)

In [None]:
# Unpack the downloaded archive under `Coughvid-Data/`.
# NOTE(review): a later cell reads `metadata_compiled.csv` from `public_dataset/`
# relative to the working directory, not from below `Coughvid-Data/` - confirm
# which location is intended.
!unzip "public_dataset.zip" -d "Coughvid-Data/"

In [None]:
# Build one row per Coswara `cough-shallow.wav` recording and attach the
# participant metadata from `combined_data.csv`.
names   = ['ID','Fever/MP','ORC','STATUS','DIR','DataSet']  # not referenced in this cell
join_by = pd.read_csv('Coswara-Data/combined_data.csv')

# The participant id is the name of the folder that contains the recording.
wav_paths = glob.glob('Coswara-Data/*/*/cough-shallow.wav')
recordings = pd.DataFrame({
    'id': [os.path.basename(os.path.dirname(path)) for path in wav_paths],
    'DIR': wav_paths,
})

# A single inner merge replaces one tiny merge per file; recordings whose id
# is missing from the metadata are dropped either way.
merged = recordings.merge(join_by, on='id', how='inner')

# Collapse the individual symptom flags into two 0/1 indicator columns.
merged['fomp'] = (merged['fever'] | merged['mp']).astype(int)
merged['oths'] = (
    merged['cld'] | merged['asthma'] | merged['cold'] | merged['st'] | merged['pneumonia']
).astype(int)

CosData = (
    merged[['id', 'covid_status', 'DIR', 'fomp', 'oths']]
    .rename(columns={'id': 'ID', 'covid_status': 'STATUS', 'fomp': 'Fever/MP', 'oths': 'ORC'})
    .assign(DataSet='coswara')
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
CosData.head()

In [None]:
# Persist the Coswara table to CSV (row index omitted) so it can be reloaded later.
CosData.to_csv('Coswara_dataset.csv', index=False)

In [None]:
# Reload the Coswara table from the CSV file written by the previous cell.
CosData = pd.read_csv('Coswara_dataset.csv')

## Prepare dataset

In [None]:
# Create the `image/` folder. `exist_ok=True` lets the cell be re-run without
# raising FileExistsError when the folder is already there.
os.makedirs('image/', exist_ok=True)

In [None]:
# Build the Coughvid part of the table and stack it under the Coswara rows.
# NOTE(review): the archive is unzipped with `-d "Coughvid-Data/"` in an earlier
# cell, while this path is relative to the working directory - confirm location.
coughvid = 'public_dataset/'

COUGH_THRESHOLD = 0.9    # minimum `cough_detected` score for a recording to be kept
NON_COVID_LIMIT = 1000   # number of non-COVID recordings kept (first rows in file order)

metadata = pd.read_csv(os.path.join(coughvid, 'metadata_compiled.csv'), header=0)

# Select rows and columns in one `.loc` call and drop incomplete rows without
# mutating a filtered slice in place.
VidData = metadata.loc[
    metadata['cough_detected'] >= COUGH_THRESHOLD,
    ['uuid', 'fever_muscle_pain', 'respiratory_condition', 'status'],
].dropna()

# Keep every COVID-19 recording but only the first NON_COVID_LIMIT of the others.
is_covid  = VidData['status'] == 'COVID-19'
extradata = VidData.loc[is_covid]
notradata = VidData.loc[~is_covid].iloc[:NON_COVID_LIMIT]

coughvid_rows = (
    pd.concat([extradata, notradata], ignore_index=True)
    .assign(
        DIR=lambda frame: coughvid + frame['uuid'] + '.webm',
        DataSet='coughvid',
        fever_muscle_pain=lambda frame: frame['fever_muscle_pain'].astype(int),
        respiratory_condition=lambda frame: frame['respiratory_condition'].astype(int),
    )
    .rename(columns={'uuid': 'ID', 'status': 'STATUS',
                     'fever_muscle_pain': 'Fever/MP', 'respiratory_condition': 'ORC'})
)

# Combine with the Coswara rows and shuffle.
TotData = pd.concat([CosData, coughvid_rows]).sample(frac=1).reset_index(drop=True)
TotData.head()

In [None]:
# Persist the combined Coswara + Coughvid table to CSV (row index omitted).
TotData.to_csv('Total_Dataset.csv', index=False)

In [None]:
# Reload the combined table from the CSV written by the previous cell and preview it.
TotData = pd.read_csv('Total_Dataset.csv')
TotData.head()

In [None]:
# Build the ComParE part of the table: one row per file found in the wav
# folder that also has an entry in `metaData_CCS.csv`.
# NOTE(review): no visible cell downloads `ComParE2021-CCS-CSS-Data/`; it is
# presumably expected to exist already - confirm.
compare = 'ComParE2021-CCS-CSS-Data/dist/wav/'
join_by = pd.read_csv('ComParE2021-CCS-CSS-Data/metaData_CCS.csv')

wav_names = os.listdir(compare)
compare_files = pd.DataFrame({
    'filename': wav_names,
    'DIR': [compare + wav_name for wav_name in wav_names],
})

# One inner merge replaces a one-row merge per file, and the result gets a
# proper running index instead of every row being labelled 0.
ComData = (
    compare_files.merge(join_by, on='filename', how='inner')[['Uid', 'label', 'DIR']]
    .rename(columns={'Uid': 'ID', 'label': 'STATUS'})
    .assign(DataSet='compare')
)
ComData.head(10)


In [None]:
# Stack the ComParE rows on top of the Coswara/Coughvid rows, keep only the
# four shared columns, shuffle, and renumber the index.
FullData = (
    pd.concat([ComData, TotData])
    .loc[:, ['ID', 'STATUS', 'DIR', 'DataSet']]
    .sample(frac=1)
    .reset_index(drop=True)
)
FullData.head(20)

In [None]:
# Persist the merged table of all three datasets to CSV (row index omitted).
FullData.to_csv('Merge_Dataset.csv', index=False)

In [None]:
# List the distinct STATUS values in first-seen order; dict keys preserve
# insertion order, which makes `dict.fromkeys` an order-keeping de-duplicator.
diag = FullData['STATUS'].tolist()
key = list(dict.fromkeys(diag))
print(key)

In [None]:
# Collapse the dataset-specific STATUS values into two labels. Rows whose
# status is not listed here are dropped.
STATUS_TO_LABEL = {
    'healthy': 'negative',
    'negative': 'negative',
    'positive_mild': 'positive',
    'positive_moderate': 'positive',
    'positive': 'positive',
    'COVID-19': 'positive',
}

# A single vectorised map replaces building (and concatenating) a one-row
# DataFrame per input row; unmapped statuses become NaN and are removed.
CovidData = (
    FullData[['ID', 'STATUS', 'DIR', 'DataSet']]
    .assign(STATUS=FullData['STATUS'].map(STATUS_TO_LABEL))
    .dropna(subset=['STATUS'])
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
CovidData.head()
    

In [None]:
# Persist the table with normalised positive/negative labels to CSV (row index omitted).
CovidData.to_csv('Covid_Dataset.csv', index=False)

In [None]:
# Reload the labelled table from the CSV written by the previous cell.
CovidData = pd.read_csv('Covid_Dataset.csv')

In [None]:
# Apply the positive/negative mapping to the reloaded table, drop rows with
# any other status, and reorder the columns to STATUS, DIR, DataSet, ID.
STATUS_TO_LABEL = {
    'healthy': 'negative',
    'negative': 'negative',
    'positive_mild': 'positive',
    'positive_moderate': 'positive',
    'positive': 'positive',
    'COVID-19': 'positive',
}

# Vectorised map instead of one single-row DataFrame per input row.
CovidData = (
    CovidData.assign(STATUS=CovidData['STATUS'].map(STATUS_TO_LABEL))
    .dropna(subset=['STATUS'])
    .loc[:, ['STATUS', 'DIR', 'DataSet', 'ID']]
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
CovidData.head()
    

In [None]:
# Select the test split: rows whose file name (last path component, text
# before the first '.' and then before the first '_') equals 'test'.
file_prefix = (
    CovidData['DIR']
    .str.split('/').str[-1]
    .str.split('.').str[0]
    .str.split('_').str[0]
)

# Boolean-mask selection replaces one single-row DataFrame per input row.
TestData = (
    CovidData.loc[file_prefix == 'test', ['STATUS', 'DIR', 'DataSet']]
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
TestData.head(20)
    

In [None]:
# Persist the test split to CSV (row index omitted).
TestData.to_csv('Test_Dataset.csv', index=False)

In [None]:
# Select the training rows: everything whose file-name prefix (text before the
# first '.' and then before the first '_') is neither 'test' nor 'devel'.
file_prefix = (
    CovidData['DIR']
    .str.split('/').str[-1]
    .str.split('.').str[0]
    .str.split('_').str[0]
)

# Boolean-mask selection replaces one single-row DataFrame per input row.
TrainData = (
    CovidData.loc[~file_prefix.isin(['test', 'devel']), ['STATUS', 'DIR', 'DataSet', 'ID']]
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
TrainData.head(20)
    

In [None]:
# Build the (filename, label) table for training.
# Coswara and Coughvid rows are named `<ID>.wav`; every other row keeps the
# last component of its DIR path as the file name.
named_by_id = TrainData['DataSet'].isin(['coswara', 'coughvid'])
id_wav_name = TrainData['ID'].astype(str) + '.wav'
dir_basename = TrainData['DIR'].str.split('/').str[-1]

# Every row is emitted. The row-wise version only appended inside its final
# `else` branch, which silently discarded all Coswara and Coughvid rows.
Train = (
    pd.DataFrame({
        'filename': id_wav_name.where(named_by_id, dir_basename),
        'label': TrainData['STATUS'],
    })
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
Train.head(20)
    

In [None]:
# Write the (filename, label) training table to `train.csv` (row index omitted).
Train.to_csv('train.csv', index=False)

In [None]:
# Rebuild `TrainData` as (filename, label), where filename is the file name
# cut at its first '.'; rows whose prefix is 'test' or 'devel' are excluded.
file_stem = CovidData['DIR'].str.split('/').str[-1].str.split('.').str[0]
file_prefix = file_stem.str.split('_').str[0]
is_train = ~file_prefix.isin(['test', 'devel'])

# Boolean-mask selection replaces one single-row DataFrame per input row.
TrainData = (
    pd.DataFrame({
        'filename': file_stem[is_train],
        'label': CovidData.loc[is_train, 'STATUS'],
    })
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
TrainData.head(20)
    

In [None]:
# Rebuild `TrainData` as (filename, label, dir) for the rows whose file-name
# prefix is none of 'test', 'devel' or 'train'. `filename` is the last
# component of the path and `dir` is the full path.
file_name = CovidData['DIR'].str.split('/').str[-1]
file_prefix = file_name.str.split('.').str[0].str.split('_').str[0]
keep = ~file_prefix.isin(['test', 'devel', 'train'])

# Boolean-mask selection replaces one single-row DataFrame per input row.
TrainData = (
    pd.DataFrame({
        'filename': file_name[keep],
        'label': CovidData.loc[keep, 'STATUS'],
        'dir': CovidData.loc[keep, 'DIR'],
    })
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
TrainData.head(20)
    

In [None]:
import shutil
import os

In [None]:
# Copy every listed recording that exists on disk into the normalised-wav
# folder, keeping its file name; missing sources are skipped silently.
dest_root = 'SPIRA-ComParE2021 (copy)/Tosse/dist/wav_normalized'
for record in tqdm(TrainData.itertuples(index=False)):
    if not os.path.exists(record.dir):
        continue
    shutil.copyfile(record.dir, os.path.join(dest_root, record.filename))

In [None]:
# Rebuild `TrainData` as (filename, label) for the rows whose file exists on
# disk and whose file-name prefix is neither 'test' nor 'devel'.
file_name = CovidData['DIR'].str.split('/').str[-1]
file_prefix = file_name.str.split('.').str[0].str.split('_').str[0]
on_disk = CovidData['DIR'].map(os.path.exists).astype(bool)
keep = on_disk & ~file_prefix.isin(['test', 'devel'])

# Boolean-mask selection replaces one single-row DataFrame per input row.
TrainData = (
    pd.DataFrame({
        'filename': file_name[keep],
        'label': CovidData.loc[keep, 'STATUS'],
    })
    .sample(frac=1)            # shuffle the rows
    .reset_index(drop=True)
)
TrainData.head(20)
    

In [None]:
# Write the current `TrainData` to `train.csv` (row index omitted).
# NOTE(review): this is the same file name an earlier cell used for `Train`,
# so that earlier output is overwritten here - confirm this is intended.
TrainData.to_csv('train.csv', index=False)

In [None]:
# One-off conversion of a single Coughvid `.webm` recording to `out.wav`,
# using the `pcm_f32le` audio codec.
!ffmpeg -i 'public_dataset/63f4e572-e693-4265-8b77-07f552fc63e9.webm' -c:a pcm_f32le 'out.wav'

In [None]:
!pip install ffmpeg-python

In [None]:
# Convert every existing `.webm` training recording to `<ID>.wav` in the
# normalised-wav folder.
WAV_DIR = 'SPIRA-ComParE2021 (copy)/Tosse/dist/wav_normalized'

# `TrainData` was last rebuilt with only `filename`/`label` columns, so the
# `ID` and `DIR` values needed here are taken from `CovidData`, restricted to
# the same non-test / non-devel rows.
file_prefix = (
    CovidData['DIR']
    .str.split('/').str[-1]
    .str.split('.').str[0]
    .str.split('_').str[0]
)
is_webm = CovidData['DIR'].str.endswith('.webm', na=False)
webm_rows = CovidData.loc[is_webm & ~file_prefix.isin(['test', 'devel']), ['ID', 'DIR']]

for row in tqdm(webm_rows.itertuples(index=False), total=len(webm_rows)):
    if not os.path.exists(row.DIR):
        continue
    target = os.path.join(WAV_DIR, row.ID + '.wav')
    # Pass the arguments as a list (no shell involved) so the spaces and
    # parentheses in the destination path cannot break the command line.
    command = ['ffmpeg', '-i', row.DIR, '-c:a', 'pcm_f32le', target]
    print(' '.join(command))
    result = subprocess.run(command)
    if result.returncode != 0:
        print(f'ffmpeg failed ({result.returncode}) for {row.DIR}')

In [None]:
# Read `train.csv` back from disk and preview the first rows as a sanity check.
data = pd.read_csv('train.csv')
data.head()