In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

In [3]:
if os.path.exists('csv_files')==False:
    os.mkdir('csv_files')

## Create Coswara's CSV File 

In [7]:
names   = ['ID','STATUS','DIR','DataSet']
join_by = pd.read_csv('dataset/Coswara-Data/combined_data.csv')

import glob
df_list = []
for path in tqdm(glob.glob('dataset/Coswara-Data/*/*/cough-shallow.wav')):
    temp = pd.DataFrame(columns=['id','DIR'])
    temp['id'] = [path.split('/')[-2]]
    temp['DIR'] = [path]
    temp = pd.merge(left=temp,right=join_by,on='id',how='inner')
    temp = temp[['id','covid_status','DIR']]
    df_list.append(temp.rename(columns={'id':'ID','covid_status':'STATUS','DIR':'DIR'}))
CosData=pd.concat(df_list)
CosData['DataSet'] = 'coswara'
CosData   = CosData.sample(frac=1).reset_index(drop=True)
CosData.to_csv('csv_files/Coswara_dataset.csv', index=False)
CosData.head()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2232/2232 [00:11<00:00, 200.46it/s]


Unnamed: 0,ID,STATUS,DIR,DataSet
0,PMVjl7cZfEaQD3QaKjvAG4Kidt92,resp_illness_not_identified,dataset/Coswara-Data/20200415/PMVjl7cZfEaQD3Qa...,coswara
1,0zUxdfS5aoS6SiEoF1x6JXOk25B3,healthy,dataset/Coswara-Data/20200820/0zUxdfS5aoS6SiEo...,coswara
2,uiMtR0VGFIaO3CQ1Tl8znyjlyHF2,healthy,dataset/Coswara-Data/20200417/uiMtR0VGFIaO3CQ1...,coswara
3,GMjhTrVkYsfH4RWjGFLCskzNnys2,healthy,dataset/Coswara-Data/20200419/GMjhTrVkYsfH4RWj...,coswara
4,D31ydMgq8wMpO9OOGwxpMfOp9qg1,healthy,dataset/Coswara-Data/20200814/D31ydMgq8wMpO9OO...,coswara


## Create Coughvid's CSV File

In [16]:
coughvid  = 'dataset/Coughvid-Data/public_dataset/'

VidData   = pd.read_csv(os.path.join(coughvid,'metadata_compiled.csv'),header=0)
VidData   = VidData.loc[VidData['cough_detected'] >= 0.9][['uuid','status']]
VidData.dropna(inplace=True)

extradata = VidData.loc[VidData['status']=='COVID-19']
notradata = VidData.loc[VidData['status']!='COVID-19'][0:1000]

VidData   = pd.concat([extradata,notradata],ignore_index= True)
VidData['DIR'] = coughvid + VidData['uuid'] + '.webm'
VidData['DataSet'] = 'coughvid'
VidData.rename(columns={'uuid': 'ID', 'status': 'STATUS'}, inplace=True)
VidData   = VidData.sample(frac=1).reset_index(drop=True)
VidData.to_csv('csv_files/Coughvid_dataset.csv', index=False)
VidData.head()

Unnamed: 0,ID,STATUS,DIR,DataSet
0,2d46ffd8-4f17-426d-af12-bafbf7dac732,healthy,dataset/Coughvid-Data/public_dataset/2d46ffd8-...,coughvid
1,23997166-603e-48dd-8fad-454256261613,healthy,dataset/Coughvid-Data/public_dataset/23997166-...,coughvid
2,25890602-367a-4831-ad1a-8b47735e23da,healthy,dataset/Coughvid-Data/public_dataset/25890602-...,coughvid
3,1f0d8f3b-c4fb-4f3e-b814-4b89f5cb03be,healthy,dataset/Coughvid-Data/public_dataset/1f0d8f3b-...,coughvid
4,1a496dea-1bd3-412e-af9e-09799afcad72,healthy,dataset/Coughvid-Data/public_dataset/1a496dea-...,coughvid


## Create Compare's CSV File

In [24]:
compare = 'dataset/Compare-Data/dist/wav'
df = pd.read_csv('dataset/Compare-Data/metaData_CCS.csv')
df_list = []
for index,row in tqdm(df.iterrows()):
    temp = pd.DataFrame(columns=['ID','STATUS','DIR','DataSet'])
    temp['ID'] = [row['Uid']]
    temp['STATUS'] = [row['label']]
    fn = row['filename']
    temp['DIR'] = [os.path.join(compare,fn)]
    temp['DataSet'] = ['compare']
    df_list.append(temp)
CompData = pd.concat(df_list)
CompData = CompData.sample(frac=1).reset_index(drop=True)
CompData.to_csv('csv_files/Compare_dataset.csv', index=False)
CompData.head()
    

725it [00:01, 643.15it/s]


Unnamed: 0,ID,STATUS,DIR,DataSet
0,hMYhXSGUrf,negative,dataset/Compare-Data/dist/wav/test_035.wav,compare
1,viXxWnJCQK,positive,dataset/Compare-Data/dist/wav/test_011.wav,compare
2,ssLllto5IN,negative,dataset/Compare-Data/dist/wav/test_038.wav,compare
3,2020-04-10-22_47_24_172784,positive,dataset/Compare-Data/dist/wav/train_175.wav,compare
4,CnXbgT7othVm,negative,dataset/Compare-Data/dist/wav/train_244.wav,compare


## Merge CSV 

In [25]:
MergeData = pd.concat([CosData,VidData,CompData])
MergeData   = MergeData.sample(frac=1).reset_index(drop=True)
df_list = []
for index,row in tqdm(MergeData.iterrows()):
    temp = pd.DataFrame(columns=['ID','STATUS','DIR','DataSet'])
    diagnose = row['STATUS']
    if diagnose == 'healthy' or diagnose == 'negative':
        status = 'negative'
        temp['ID'] = [row['ID']]
        temp['STATUS'] = [status]
        temp['DIR'] = [row['DIR']]
        temp['DataSet'] = [row['DataSet']] 
    elif diagnose == 'positive_mild' or diagnose == 'positive_moderate' or diagnose == 'positive' or diagnose == 'COVID-19':
        status = 'positive'
        temp['ID'] = [row['ID']]
        temp['STATUS'] = [status]
        temp['DIR'] = [row['DIR']]
        temp['DataSet'] = [row['DataSet']]
    df_list.append(temp)
MergeData = pd.concat(df_list)
MergeData = MergeData.sample(frac=1).reset_index(drop=True)
MergeData.to_csv('csv_files/Merge_dataset.csv', index=False)
MergeData.head()

4398it [00:06, 697.82it/s]


Unnamed: 0,ID,STATUS,DIR,DataSet
0,02b80927-ead6-492b-aca9-33a009050c27,negative,dataset/Coughvid-Data/public_dataset/02b80927-...,coughvid
1,5q8gLM0yCrgGCT8F9fWlH4ycl1D3,negative,dataset/Coswara-Data/20200820/5q8gLM0yCrgGCT8F...,coswara
2,14ce9bf1-2d6c-458c-93a6-98e24c8b67ca,negative,dataset/Coughvid-Data/public_dataset/14ce9bf1-...,coughvid
3,4X4N5BUaiG,positive,dataset/Compare-Data/dist/wav/train_259.wav,compare
4,78UvO8TmtycKb98kr2eXeDdLpST2,negative,dataset/Coswara-Data/20200415/78UvO8TmtycKb98k...,coswara
