In [7]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
from tqdm import tqdm

In [2]:
# Notebook-wide configuration.
MAXVAL = 255  # Range [0 255] (presumably the 8-bit pixel intensity range -- TODO confirm)
seed = 0
# Seed both the stdlib and the NumPy global generators so all runs are the same.
random.seed(seed)
np.random.seed(seed)

In [3]:
def get_clahe(img, clip_limit=2.0, tile_grid_size=(16, 16)):
    """Apply CLAHE to the first (lightness) plane of a BGR image.

    The image is converted from BGR to LAB, contrast-limited adaptive
    histogram equalization is applied to the first LAB plane only, and the
    result is converted back to BGR.

    Note: this function needs ``cv2`` (OpenCV) to be available as a
    module-level name; the import cell visible in this notebook does not
    import it.

    Args:
        img: Image in BGR channel order, as accepted by ``cv2.cvtColor``
            (presumably an 8-bit array -- TODO confirm against callers).
        clip_limit: ``clipLimit`` passed to ``cv2.createCLAHE``.
        tile_grid_size: ``tileGridSize`` passed to ``cv2.createCLAHE``.

    Returns:
        The equalized image, converted back to BGR.
    """
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    # cv2.split can return an immutable tuple (current OpenCV Python
    # bindings), so copy into a list before replacing the first plane.
    lab_planes = list(cv2.split(lab))
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    lab_planes[0] = clahe.apply(lab_planes[0])
    lab = cv2.merge(lab_planes)
    return cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)

In [5]:
# Load the pickled COVID metadata table and drop three images by filename
# (the reason for excluding them is not recorded in this notebook).
# NOTE: pd.read_pickle unpickles the file, so '../covid_df.pkl' must be trusted.
excluded_filenames = [
    'kjr-21-e24-g001-l-b.jpg',
    'kjr-21-e24-g002-l-c.jpg',
    'kjr-21-e24-g003-l-b.jpg',
]
covid_front_df = pd.read_pickle('../covid_df.pkl')
keep_row = ~covid_front_df['filename'].isin(excluded_filenames)
covid_front_df = covid_front_df[keep_row]
covid_front_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 0 to 125
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Patientid       73 non-null     int64  
 1   offset          56 non-null     float64
 2   sex             67 non-null     object 
 3   age             63 non-null     float64
 4   finding         73 non-null     object 
 5   survival        29 non-null     object 
 6   view            73 non-null     object 
 7   date            73 non-null     object 
 8   location        60 non-null     object 
 9   filename        73 non-null     object 
 10  doi             41 non-null     object 
 11   url            73 non-null     object 
 12  license         11 non-null     object 
 13  clinical notes  68 non-null     object 
 14  other notes     32 non-null     object 
 15  Unnamed: 15     0 non-null      float64
dtypes: float64(3), int64(1), object(12)
memory usage: 9.7+ KB


In [26]:
# Distinct patient ids left after the filename filter; there are fewer ids
# than rows, so some patients contribute more than one image.
covid_front_df['Patientid'].unique()

array([ 2,  4,  6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
       32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65])

In [27]:
# Patient-level split: draw 47 distinct patient ids for the training set so
# that all images of one patient end up on the same side of the split.
unique_patient_ids = list(covid_front_df['Patientid'].unique())
train_patients = random.sample(unique_patient_ids, 47)
len(train_patients)

47

In [30]:
# Turn the patient-level split into filename arrays: rows whose patient was
# sampled go to train, every other row goes to test.
is_train_patient = covid_front_df['Patientid'].isin(train_patients)
covid_train = covid_front_df.loc[is_train_patient, 'filename'].values
covid_test = covid_front_df.loc[~is_train_patient, 'filename'].values
len(covid_train), len(covid_test)

(68, 5)

### Replicate the COVIDx dataset

In [32]:
# Copy every training-split COVID image into the COVIDx-style folder.
covid_src_dir = '../covid-chestxray-dataset-master/images/'
covid_dst_dir = './images_covidx/covid/'
# copyfile does not create missing parent directories, so make sure the
# destination exists before copying.
os.makedirs(covid_dst_dir, exist_ok=True)
for filename in tqdm(covid_train):
    copyfile(os.path.join(covid_src_dir, filename),
             os.path.join(covid_dst_dir, filename))

100%|██████████| 68/68 [00:00<00:00, 235.03it/s]


In [11]:
# Copy all NORMAL training images into the three-class image folder.
normal_src_dir = './chest_xray/train/NORMAL/'
normal_dst_dir = './images/normal/'
# copyfile does not create missing parent directories, so make sure the
# destination exists before copying.
os.makedirs(normal_dst_dir, exist_ok=True)
for filename in os.listdir(normal_src_dir):
    copyfile(os.path.join(normal_src_dir, filename),
             os.path.join(normal_dst_dir, filename))

In [15]:
# Number of entries in the PNEUMONIA training folder (bacterial and viral together).
pneumonia_dir = './chest_xray/train/PNEUMONIA/'
len(os.listdir(pneumonia_dir))

3875

In [16]:
# Split the PNEUMONIA training images into bacterial and viral folders based
# on the filename.
pneumonia_dir = './chest_xray/train/PNEUMONIA/'
bacterial_dir = './images/bacterial/'
viral_dir = './images/viral/'
# copyfile does not create missing parent directories, so make sure both
# destinations exist before copying.
for target_dir in (bacterial_dir, viral_dir):
    os.makedirs(target_dir, exist_ok=True)
for filename in tqdm(os.listdir(pneumonia_dir)):
    # Names containing 'bacteria' are bacterial; every other name is treated
    # as viral.
    target_dir = bacterial_dir if 'bacteria' in filename else viral_dir
    copyfile(os.path.join(pneumonia_dir, filename),
             os.path.join(target_dir, filename))

100%|██████████| 3875/3875 [00:06<00:00, 555.89it/s]


### COVIDx, but binary (COVID vs. non-COVID)

In [34]:
# Add 40 randomly chosen NORMAL images to the non-COVID class.
normal_dir = './chest_xray/train/NORMAL/'
non_covid_dir = './images_binary_covidx/non-covid/'
# copyfile does not create missing parent directories, so make sure the
# destination exists before copying.
os.makedirs(non_covid_dir, exist_ok=True)
# os.listdir returns names in arbitrary order; sort first so the seeded
# sample picks the same files on every machine.
normal_files = sorted(os.listdir(normal_dir))
for filename in tqdm(random.sample(normal_files, 40)):
    copyfile(os.path.join(normal_dir, filename),
             os.path.join(non_covid_dir, filename))

100%|██████████| 40/40 [00:00<00:00, 132.95it/s]


In [35]:
# PNEUMONIA training files whose name contains 'bacteria'.
# os.listdir returns names in arbitrary order; sorting keeps any later seeded
# sampling from this list reproducible across machines.
bacteria_files = sorted(
    filename
    for filename in os.listdir('./chest_xray/train/PNEUMONIA/')
    if 'bacteria' in filename
)
len(bacteria_files)

2530

In [37]:
# Add 40 randomly chosen bacterial-pneumonia images to the non-COVID class.
pneumonia_dir = './chest_xray/train/PNEUMONIA/'
non_covid_dir = './images_binary_covidx/non-covid/'
sampled_bacteria = random.sample(bacteria_files, 40)
for filename in tqdm(sampled_bacteria):
    copyfile(pneumonia_dir + filename, non_covid_dir + filename)

100%|██████████| 40/40 [00:00<00:00, 175.40it/s]


In [39]:
# PNEUMONIA training files whose name does NOT contain 'bacteria'; every such
# file is treated as viral.
# os.listdir returns names in arbitrary order; sorting keeps any later seeded
# sampling from this list reproducible across machines.
viral_files = sorted(
    filename
    for filename in os.listdir('./chest_xray/train/PNEUMONIA/')
    if 'bacteria' not in filename
)
len(viral_files)

1345

In [40]:
# Add 40 randomly chosen viral-pneumonia images to the non-COVID class.
pneumonia_dir = './chest_xray/train/PNEUMONIA/'
non_covid_dir = './images_binary_covidx/non-covid/'
sampled_viral = random.sample(viral_files, 40)
for filename in tqdm(sampled_viral):
    copyfile(pneumonia_dir + filename, non_covid_dir + filename)

100%|██████████| 40/40 [00:00<00:00, 225.84it/s]


In [46]:
# Share of the chest_xray training images (PNEUMONIA + NORMAL) that 120
# images represent; 120 matches the three samples of 40 copied into the
# non-covid folder.
n_train_images = (len(os.listdir('./chest_xray/train/PNEUMONIA/'))
                  + len(os.listdir('./chest_xray/train/NORMAL/')))
120 / n_train_images

0.023006134969325152

In [47]:
# Share of the chest_xray training images (PNEUMONIA + NORMAL) that a single
# 40-image sample represents.
n_train_images = (len(os.listdir('./chest_xray/train/PNEUMONIA/'))
                  + len(os.listdir('./chest_xray/train/NORMAL/')))
40 / n_train_images

0.007668711656441718