In [1]:
#Libraries for files preparation

import os
import shutil
import pandas as pd
from sklearn.utils import shuffle

COVID-19 Radiography Database: https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database

In [2]:
#Reading .xlsx files needs openpyxl (pip install openpyxl)
#Load the metadata sheet that lists the COVID-19 images
covid_metadata_path = '../COVID-19_Radiography_Dataset/COVID.metadata.xlsx'
covid = pd.read_excel(covid_metadata_path)
covid.head()

Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,COVID-1,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
1,COVID-2,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
2,COVID-3,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
3,COVID-4,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
4,COVID-5,PNG,256*256,https://sirm.org/category/senza-categoria/covi...


In [3]:
#Load the metadata sheet that lists the Normal images
normal_metadata_path = '../COVID-19_Radiography_Dataset/Normal.metadata.xlsx'
normal = pd.read_excel(normal_metadata_path)
normal.head()

Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,NORMAL-1,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
1,NORMAL-2,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
2,NORMAL-3,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
3,NORMAL-4,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
4,NORMAL-5,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...


In [4]:
#Report how many rows each metadata frame holds
print(f"COVID-19 has {len(covid)} cases")
print(f"Normal has {len(normal)} cases")

COVID-19 has 3616 cases
Normal has 10192 cases


In [5]:
#Number of rows drawn from each class so that both classes end up the same size.
#Derived from the loaded frames instead of hard-coding 3616 (the COVID-19 row
#count printed above), so sampling cannot request more rows than a class holds
#if the metadata sheets change.
SAMPLE_SIZE = min(len(covid), len(normal))

In [6]:
#Tag every row with its class: 0 for normal, 1 for covid
normal = normal.assign(label=0)
covid = covid.assign(label=1)

In [7]:
#Keep only the file name and the class label; the other metadata columns are not used
kept_columns = ['FILE NAME', 'label']
normal = normal[kept_columns]
covid = covid[kept_columns]

In [8]:
#Preview the first rows of the COVID-19 frame (file name and label columns)
covid.head()

Unnamed: 0,FILE NAME,label
0,COVID-1,1
1,COVID-2,1
2,COVID-3,1
3,COVID-4,1
4,COVID-5,1


In [9]:
#Preview the first rows of the Normal frame (file name and label columns)
normal.head()

Unnamed: 0,FILE NAME,label
0,NORMAL-1,0
1,NORMAL-2,0
2,NORMAL-3,0
3,NORMAL-4,0
4,NORMAL-5,0


In [10]:
#Draw SAMPLE_SIZE rows from each class with a fixed seed
df_normal = normal.sample(n=SAMPLE_SIZE, random_state=26)
df_covid = covid.sample(n=SAMPLE_SIZE, random_state=26)

#Stack the two samples row-wise into a single frame
sampled_frames = [df_normal, df_covid]
data = pd.concat(sampled_frames)

#Row count per label
data['label'].value_counts()

0    3616
1    3616
Name: label, dtype: int64

In [11]:
#Shuffle data
#A fixed random_state keeps the row order identical between runs. The later
#train/validation split selects rows by position, so an unseeded shuffle would
#give a different split on every run even though the split itself is seeded.
data = shuffle(data, random_state=26)
data.head()

Unnamed: 0,FILE NAME,label
979,COVID-980,1
6416,NORMAL-6417,0
8394,NORMAL-8395,0
7161,NORMAL-7162,0
3656,NORMAL-3657,0


Train/validation split of the data (80:20, stratified by label)

In [12]:
from sklearn.model_selection import train_test_split

#Hold out 20% of the rows for validation, keeping the label proportions in both parts
df_train, df_val = train_test_split(
    data,
    test_size=0.2,
    random_state=26,
    stratify=data['label'],
)

#Show (rows, columns) of the training part, then of the validation part
for split_frame in (df_train, df_val):
    print(split_frame.shape)

(5785, 2)
(1447, 2)


In [13]:
#Row count per label in the training frame
df_train['label'].value_counts()

0    2893
1    2892
Name: label, dtype: int64

In [14]:
#Row count per label in the validation frame
df_val['label'].value_counts()

1    724
0    723
Name: label, dtype: int64

Separating images into folders

In [15]:
#Root folder that will hold the copied images; created only when it is missing
data_dir = 'dataset'
root_missing = not os.path.exists(data_dir)
if root_missing:
    os.mkdir(data_dir)
    print(" The root dataset folder created!")

 The root dataset folder created!


In [16]:
#Build the train/val folder tree inside 'data_dir', one sub-folder per class:

#/dataset
#../train
#../../covid
#../../normal
#../val
#../../covid
#../../normal

#Paths of every folder in the tree, joined onto 'data_dir'
train_dir = os.path.join(data_dir, 'train')
train_covid = os.path.join(train_dir, 'covid')
train_normal = os.path.join(train_dir, 'normal')

val_dir = os.path.join(data_dir, 'val')
val_covid = os.path.join(val_dir, 'covid')
val_normal = os.path.join(val_dir, 'normal')

#Each folder paired with the message printed when it is created.
#Parents are listed before their children so os.mkdir always finds the parent.
folder_plan = [
    (train_dir, "\t-train folder created on dataset..."),
    (train_covid, '\t\t-covid folder created on train folder...'),
    (train_normal, '\t\t-normal folder created on train folder...'),
    (val_dir, "\t-val folder created on dataset..."),
    (val_covid, '\t\t-covid folder created on val folder...'),
    (val_normal, '\t\t-normal folder created on val folder...'),
]

#Create only the folders that do not exist yet
for folder_path, created_message in folder_plan:
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
        print(created_message)

	-train folder created on dataset...
		-covid folder created on train folder...
		-normal folder created on train folder...
	-val folder created on dataset...
		-covid folder created on val folder...
		-normal folder created on val folder...


In [17]:
#List the entries of train_dir to confirm the class folders were created
os.listdir(train_dir)

['covid', 'normal']

Copy the images into the class folders

In [18]:
#File names (without extension) belonging to the training and validation frames
train_list = [name for name in df_train['FILE NAME']]
val_list = [name for name in df_val['FILE NAME']]

In [19]:
#Copying images to train_dir folder

#Build the file name -> label lookup once. Filtering `data` for every image
#rescans the whole frame each time, and calling int() on the resulting
#(1, 1) array is deprecated in recent NumPy releases.
label_by_name = dict(zip(data['FILE NAME'], data['label']))

for image in train_list:
    filename = image + '.png'
    #Get the label for a certain image
    target = int(label_by_name[image])
    #Match the target with the folder's name and source path of the image
    if target == 1:
        label = 'covid'
        src = os.path.join('../COVID-19_Radiography_Dataset/COVID/images', filename)
        print(f"Moving {filename} image to train folder")

    elif target == 0:
        label = 'normal'
        #Metadata lists names as 'NORMAL-<n>'; capitalize() turns that into the
        #'Normal-<n>.png' form used for the source path
        filename = filename.capitalize()
        src = os.path.join('../COVID-19_Radiography_Dataset/Normal/images', filename)
        print(f"Moving {filename} image to train folder")

    else:
        #Fail loudly instead of silently reusing label/src from the previous iteration
        raise ValueError(f"Unexpected label {target!r} for image {image!r}")

    #The source file is copied, not moved; the original dataset stays intact
    dest = os.path.join(train_dir, label, filename)
    shutil.copyfile(src=src, dst=dest)

Moving COVID-2095.png image to train folder
Moving Normal-8918.png image to train folder
Moving COVID-976.png image to train folder
Moving Normal-8039.png image to train folder
Moving COVID-3233.png image to train folder
Moving COVID-2083.png image to train folder
Moving COVID-1586.png image to train folder
Moving Normal-7381.png image to train folder
Moving COVID-3165.png image to train folder
Moving Normal-4294.png image to train folder
Moving Normal-7124.png image to train folder
Moving Normal-1476.png image to train folder
Moving COVID-346.png image to train folder
Moving Normal-6605.png image to train folder
Moving Normal-4193.png image to train folder
Moving Normal-10096.png image to train folder
Moving Normal-2222.png image to train folder
Moving Normal-8704.png image to train folder
Moving COVID-82.png image to train folder
Moving COVID-3067.png image to train folder
Moving Normal-6308.png image to train folder
Moving Normal-10090.png image to train folder
Moving COVID-2655.png

In [20]:
#Copying images to val_dir folder

#Build the file name -> label lookup once. Filtering `data` for every image
#rescans the whole frame each time, and calling int() on the resulting
#(1, 1) array is deprecated in recent NumPy releases.
label_by_name = dict(zip(data['FILE NAME'], data['label']))

for image in val_list:
    filename = image + '.png'
    #Get the label for a certain image
    target = int(label_by_name[image])

    #Match the target with the folder's name and source path of the image
    if target == 1:
        label = 'covid'
        src = os.path.join('../COVID-19_Radiography_Dataset/COVID/images', filename)
        print(f"Moving {filename} image to val folder")

    elif target == 0:
        label = 'normal'
        #Metadata lists names as 'NORMAL-<n>'; capitalize() turns that into the
        #'Normal-<n>.png' form used for the source path
        filename = filename.capitalize()
        src = os.path.join('../COVID-19_Radiography_Dataset/Normal/images', filename)
        print(f"Moving {filename} image to val folder")

    else:
        #Fail loudly instead of silently reusing label/src from the previous iteration
        raise ValueError(f"Unexpected label {target!r} for image {image!r}")

    #The source file is copied, not moved; the original dataset stays intact
    dest = os.path.join(val_dir, label, filename)
    shutil.copyfile(src=src, dst=dest)

Moving COVID-3383.png image to val folder
Moving Normal-3311.png image to val folder
Moving COVID-1087.png image to val folder
Moving COVID-774.png image to val folder
Moving COVID-1006.png image to val folder
Moving COVID-2651.png image to val folder
Moving Normal-4158.png image to val folder
Moving Normal-9678.png image to val folder
Moving COVID-1814.png image to val folder
Moving Normal-9423.png image to val folder
Moving COVID-3474.png image to val folder
Moving COVID-248.png image to val folder
Moving COVID-1755.png image to val folder
Moving Normal-9871.png image to val folder
Moving Normal-8765.png image to val folder
Moving Normal-452.png image to val folder
Moving COVID-3012.png image to val folder
Moving Normal-6526.png image to val folder
Moving COVID-1166.png image to val folder
Moving COVID-1286.png image to val folder
Moving COVID-2353.png image to val folder
Moving COVID-2035.png image to val folder
Moving COVID-865.png image to val folder
Moving COVID-2039.png image to

In [21]:
#Count the files that ended up in each split/class folder
folder_captions = [
    ('Train - COVID: ', 'dataset/train/covid'),
    ('Train - NORMAL: ', 'dataset/train/normal'),
    ('Validation - COVID: ', 'dataset/val/covid'),
    ('Validation - NORMAL: ', 'dataset/val/normal'),
]
for caption, folder in folder_captions:
    print(caption, len(os.listdir(folder)))

Train - COVID:  2892
Train - NORMAL:  2893
Validation - COVID:  724
Validation - NORMAL:  723
