In [None]:
# Mount Google Drive at /content/drive; the project paths used later in the
# notebook all live under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import os
import cv2
from google.colab.patches import cv2_imshow
import torch.optim as optim
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
import albumentations as A
import glob
from sklearn.model_selection import train_test_split

# --- Raw dataset roots on the mounted Drive ---
MONTGOMERY_PATH = 'drive/MyDrive/course_project/Montgomery/MontgomerySet/'
CHINA_PATH = 'drive/MyDrive/course_project/ChinaSet_AllFiles/ChinaSet_AllFiles/'

# Chest X-ray images of each dataset
MONTGOMERY_XRAYS_PATH = f'{MONTGOMERY_PATH}CXR_png'
CHINA_XRAYS_PATH = f'{CHINA_PATH}CXR_png'

# Clinical readings of each dataset
MONTGOMERY_INFO_PATH = f'{MONTGOMERY_PATH}ClinicalReadings'
CHINA_INFO_PATH = f'{CHINA_PATH}ClinicalReadings'

# Lung masks: Montgomery ships separate right/left masks, the China set a single folder
MONTGOMERY_RIGHTMASK_PATH = f'{MONTGOMERY_PATH}ManualMask/rightMask'
MONTGOMERY_LEFTMASK_PATH = f'{MONTGOMERY_PATH}ManualMask/leftMask'
CHINA_MASKS_PATH = f'{CHINA_PATH}mask/'

# --- Output folders ---
IMAGES_PATH = 'drive/MyDrive/course_project/images/'  # prepared images from both datasets
MASKS_PATH = 'drive/MyDrive/course_project/masks/'  # masks for both datasets

# Resized images of the U-Net splits
TRAIN_UNET_PATH = 'drive/MyDrive/course_project/train_unet'
VAL_UNET_PATH = 'drive/MyDrive/course_project/val_unet'
TEST_UNET_PATH = 'drive/MyDrive/course_project/test_unet'

# Resized images of the train / validation / test splits
TRAIN_PATH = 'drive/MyDrive/course_project/train'
VAL_PATH = 'drive/MyDrive/course_project/val'
TEST_PATH = 'drive/MyDrive/course_project/test'

weights_dir = 'drive/MyDrive/course_project/weight'

# --- Numeric settings ---
IMG_SIZE = 512  # images and masks are resized to IMG_SIZE x IMG_SIZE
BN_EPS = 1e-3
# Per-channel mean / std; presumably the ImageNet normalisation statistics -- TODO confirm
MEAN = np.array([0.485, 0.456, 0.406])
STD = np.array([0.229, 0.224, 0.225])
BATCH_SIZE = 3

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

device(type='cpu')

In [None]:
# Create every output folder up front. os.makedirs(..., exist_ok=True) also
# creates missing parent folders and does nothing for folders that already
# exist, so this cell can be re-run safely.
directories = [TRAIN_UNET_PATH, VAL_UNET_PATH, TEST_UNET_PATH, TRAIN_PATH, VAL_PATH, TEST_PATH, IMAGES_PATH, MASKS_PATH, weights_dir]
for directory in directories:  # named `directory` so the builtin dir() is not shadowed
  os.makedirs(directory, exist_ok=True)

In [None]:
# Sanity check: confirm that the masks output folder exists.
os.path.exists(MASKS_PATH)

True

Now merge the left and right lung masks of every Montgomery image into a single mask and write the results to the `masks` folder.

In [None]:
# Merge the left- and right-lung masks of each Montgomery image into one mask,
# resize it to IMG_SIZE x IMG_SIZE and save it under MASKS_PATH.
for filename in os.listdir(MONTGOMERY_LEFTMASK_PATH):
  if not filename.startswith('MCUCXR'):
    continue
  left = cv2.imread(os.path.join(MONTGOMERY_LEFTMASK_PATH, filename))
  right = cv2.imread(os.path.join(MONTGOMERY_RIGHTMASK_PATH, filename))
  # cv2.imread returns None (it does not raise) for a missing or unreadable
  # file, so check before touching .shape.
  if left is None or right is None:
    print('Could not read left/right mask for', filename)
    continue
  assert left.shape == right.shape, 'Left and right mask shapes differ for ' + filename
  # Element-wise maximum keeps a pixel set if it is set in either half
  # (the union of the two masks when they are binary).
  # NOTE(review): the default resize interpolation can produce intermediate
  # grey values on mask borders; consider cv2.INTER_NEAREST if downstream
  # code expects strictly binary masks.
  new_mask = cv2.resize(np.maximum(left, right), (IMG_SIZE, IMG_SIZE))
  status = cv2.imwrite(os.path.join(MASKS_PATH, filename), new_mask)
  if not status:
    print('There is error while saving mask for', filename)
    break

In [None]:
def _empty_table(*column_names):
  """Return a new, empty DataFrame that has the given column names."""
  return pd.DataFrame(columns=list(column_names))


# Start with empty tables; later cells append to or rebind several of these names.
all_data = _empty_table('filename', 'label')

# Tables that only hold file names
all_unet = _empty_table('filename')
train_unet_df = _empty_table('filename')
val_unet_df = _empty_table('filename')
test_unet_df = _empty_table('filename')

# Tables that hold file names together with their labels
train_val_model_data = _empty_table('filename', 'label')
train_df = _empty_table('filename', 'label')
val_df = _empty_table('filename', 'label')
test_df = _empty_table('filename', 'label')

Not every X-ray in the Shenzhen set has a lung mask. We still have the labels of these images, so they can be used in the test part of the data. Let's find them and write their names to `test_df` (the code below sends the first 80 of them to `test_df` and the remaining ones to `train_df`).

In [None]:
def _append_records(frame, records):
  """Return `frame` extended with `records` (a list of row dicts) as new rows.

  DataFrame.append was removed in pandas 2.0 and copied the whole frame on
  every call, so the rows are collected in a list and added in one step.
  """
  if not records:
    return frame
  new_rows = pd.DataFrame(records, columns=frame.columns)
  # Skip the concat for an empty frame so pandas does not have to reconcile
  # the dtypes of an empty table with those of the new rows.
  if frame.empty:
    return new_rows
  return pd.concat([frame, new_rows], ignore_index=True)


# Pass 1: every Shenzhen mask is resized into MASKS_PATH and its file name is
# registered in all_data. The label is the character right before the
# 4-character extension (filename[-5]).
china_masks_names = []
masked_records = []

for filename in os.listdir(CHINA_MASKS_PATH):
  if not filename.startswith('CHNCXR'):
    continue
  china_mask_img = cv2.imread(os.path.join(CHINA_MASKS_PATH, filename))
  # cv2.imread returns None (it does not raise) for an unreadable file; such a
  # mask is not registered, so its X-ray is treated as unmasked in pass 2.
  if china_mask_img is None:
    print("Error while reading mask {}.".format(filename))
    continue
  china_masks_names.append(filename)
  masked_records.append({"filename": filename, "label": int(filename[-5])})
  status = cv2.imwrite(os.path.join(MASKS_PATH, filename),
                       cv2.resize(china_mask_img, (IMG_SIZE, IMG_SIZE)))
  if not status:
    print("Error while saving mask {}.".format(filename))

all_data = _append_records(all_data, masked_records)

# Pass 2: X-rays without a mask. The first 80 encountered are resized into
# TEST_PATH and listed in test_df; the remaining ones are listed in train_df.
# NOTE: os.listdir returns entries in arbitrary order, so which files count as
# "the first 80" may differ between runs.
masked_names = set(china_masks_names)  # constant-time membership test
unmasked_test_records = []
unmasked_train_records = []

counter_test = 0
for filename in os.listdir(CHINA_XRAYS_PATH):
  if not filename.startswith('CHNCXR') or filename in masked_names:
    continue
  record = {"filename": filename, "label": int(filename[-5])}
  if counter_test < 80:
    china_image = cv2.imread(os.path.join(CHINA_XRAYS_PATH, filename))
    if china_image is None:
      print("Error while reading image {}.".format(filename))
      continue
    unmasked_test_records.append(record)
    status = cv2.imwrite(os.path.join(TEST_PATH, filename),
                         cv2.resize(china_image, (IMG_SIZE, IMG_SIZE)))
    if not status:
      print("Error while saving image {}.".format(filename))
  else:
    unmasked_train_records.append(record)
  counter_test += 1

test_df = _append_records(test_df, unmasked_test_records)
train_df = _append_records(train_df, unmasked_train_records)

Let's save the filenames of the X-rays from the Montgomery dataset (they all have masks) in the `all_data` dataframe, so that they can then be split into train, validation and test sets. The first 10 files encountered go straight to `test_df` instead.

In [None]:
# The first 10 Montgomery X-rays encountered are resized into TEST_PATH and
# listed in test_df; all remaining ones are listed in all_data.
# Rows are collected in lists and added once at the end, because
# DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop).
# NOTE: os.listdir returns entries in arbitrary order, so which files count as
# "the first 10" may differ between runs.
counter = 0
montgomery_test_records = []
montgomery_rest_records = []

for filename in os.listdir(MONTGOMERY_XRAYS_PATH):
  if not filename.startswith('MCUCXR'):
    continue
  # The label is the character right before the 4-character extension.
  record = {"filename": filename, "label": int(filename[-5])}
  if counter < 10:
    image = cv2.imread(os.path.join(MONTGOMERY_XRAYS_PATH, filename))
    # cv2.imread returns None (it does not raise) for an unreadable file.
    if image is None:
      print("Error while reading image {}.".format(filename))
      continue
    montgomery_test_records.append(record)
    status = cv2.imwrite(os.path.join(TEST_PATH, filename),
                         cv2.resize(image, (IMG_SIZE, IMG_SIZE)))
    if not status:
      print("Error while saving image {}.".format(filename))
  else:
    montgomery_rest_records.append(record)
  counter += 1

if montgomery_test_records:
  test_df = pd.concat(
      [test_df, pd.DataFrame(montgomery_test_records, columns=['filename', 'label'])],
      ignore_index=True)
if montgomery_rest_records:
  all_data = pd.concat(
      [all_data, pd.DataFrame(montgomery_rest_records, columns=['filename', 'label'])],
      ignore_index=True)

In [None]:
# Step 1: hold out 27.3% of all_data as `unet_data`; the remaining rows form
# `model_train_data`. random_state is fixed at 42.

model_train_data, unet_data = train_test_split(all_data, test_size=0.273, random_state=42)

In [None]:
# Number of rows in each part produced by the first split.
len(model_train_data), len(unet_data)

(504, 190)

In [None]:
# Step 2: hold out 17.85% of model_train_data as `val_df`; the remaining rows
# form `model_train`. random_state is fixed at 41.
model_train, val_df = train_test_split(model_train_data, test_size=0.1785, random_state=41)

In [None]:
# Number of rows in each part produced by the second split.
len(model_train), len(val_df)

(414, 90)

In [None]:
# Add the rows of model_train to the rows already collected in train_df.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: not idempotent -- re-running this cell adds model_train a second time.
train_df = pd.concat([train_df, model_train], ignore_index=True)

In [None]:
# Final number of rows in the train, validation and test tables.
len(train_df), len(val_df), len(test_df)

(430, 90, 90)

In [None]:
# Write the train / validation / test tables to CSV files in the project folder.
basic_path = 'drive/MyDrive/course_project/'

for csv_name, frame in (('train.csv', train_df), ('val.csv', val_df), ('test.csv', test_df)):
  frame.to_csv(basic_path + csv_name)


In [None]:
# Step 3: hold out 12.7% of unet_data as `test_unet_df`; the remaining rows
# form `train_val_unet`. random_state is fixed at 40.

train_val_unet, test_unet_df = train_test_split(unet_data, test_size=0.127, random_state=40)

In [None]:
# Step 4: hold out 15% of train_val_unet as `val_unet_df`; the remaining rows
# form `train_unet_df`. random_state is fixed at 39.

train_unet_df, val_unet_df = train_test_split(train_val_unet, test_size=0.15, random_state=39)

In [None]:
# Number of rows in the U-Net train, validation and test tables.
len(train_unet_df), len(val_unet_df), len(test_unet_df)

(140, 25, 25)

In [None]:
# Write the U-Net train / validation / test tables to CSV files in the project folder.
basic_path = 'drive/MyDrive/course_project/'

for csv_name, frame in (('unet_train.csv', train_unet_df),
                        ('unet_val.csv', val_unet_df),
                        ('unet_test.csv', test_unet_df)):
  frame.to_csv(basic_path + csv_name)

Let's save resized ($512 \times 512$) copies of the images to the train, validation and test directories.

In [None]:
# Save an IMG_SIZE x IMG_SIZE copy of every image listed in train_df to TRAIN_PATH.
for filename in train_df['filename']:
  # The file-name prefix tells which dataset folder holds the original image.
  if filename.startswith('CHNCXR'):
    source_dir = CHINA_XRAYS_PATH
  elif filename.startswith('MCUCXR'):
    source_dir = MONTGOMERY_XRAYS_PATH
  else:
    print('Unknown file in train:', filename)
    # Skip it: there is nothing to save, and the status check below would
    # otherwise read an undefined (or stale) `status`.
    continue
  image = cv2.imread(os.path.join(source_dir, filename))
  # cv2.imread returns None (it does not raise) for a missing or unreadable file.
  if image is None:
    print("Error while reading image {}.".format(filename))
    continue
  status = cv2.imwrite(os.path.join(TRAIN_PATH, filename),
                       cv2.resize(image, (IMG_SIZE, IMG_SIZE)))
  if not status:
    print("Error while saving image {}.".format(filename))

In [None]:
# Save an IMG_SIZE x IMG_SIZE copy of every image listed in val_df to VAL_PATH.
for filename in val_df['filename']:
  # The file-name prefix tells which dataset folder holds the original image.
  if filename.startswith('CHNCXR'):
    source_dir = CHINA_XRAYS_PATH
  elif filename.startswith('MCUCXR'):
    source_dir = MONTGOMERY_XRAYS_PATH
  else:
    print('Unknown file in val:', filename)
    # Skip it: there is nothing to save, and the status check below would
    # otherwise read an undefined (or stale) `status`.
    continue
  image = cv2.imread(os.path.join(source_dir, filename))
  # cv2.imread returns None (it does not raise) for a missing or unreadable file.
  if image is None:
    print("Error while reading image {}.".format(filename))
    continue
  status = cv2.imwrite(os.path.join(VAL_PATH, filename),
                       cv2.resize(image, (IMG_SIZE, IMG_SIZE)))
  if not status:
    print("Error while saving image {}.".format(filename))

In [None]:
# Save an IMG_SIZE x IMG_SIZE copy of every image listed in train_unet_df to TRAIN_UNET_PATH.
for filename in train_unet_df['filename']:
  # The file-name prefix tells which dataset folder holds the original image.
  if filename.startswith('CHNCXR'):
    source_dir = CHINA_XRAYS_PATH
  elif filename.startswith('MCUCXR'):
    source_dir = MONTGOMERY_XRAYS_PATH
  else:
    print('Unknown file in train_unet:', filename)
    # Skip it: there is nothing to save, and the status check below would
    # otherwise read an undefined (or stale) `status`.
    continue
  image = cv2.imread(os.path.join(source_dir, filename))
  # cv2.imread returns None (it does not raise) for a missing or unreadable file.
  if image is None:
    print("Error while reading image {}.".format(filename))
    continue
  status = cv2.imwrite(os.path.join(TRAIN_UNET_PATH, filename),
                       cv2.resize(image, (IMG_SIZE, IMG_SIZE)))
  if not status:
    print("Error while saving image {}.".format(filename))

In [None]:
# Save an IMG_SIZE x IMG_SIZE copy of every image listed in val_unet_df to VAL_UNET_PATH.
for filename in val_unet_df['filename']:
  # The file-name prefix tells which dataset folder holds the original image.
  if filename.startswith('CHNCXR'):
    source_dir = CHINA_XRAYS_PATH
  elif filename.startswith('MCUCXR'):
    source_dir = MONTGOMERY_XRAYS_PATH
  else:
    print('Unknown file in val_unet:', filename)
    # Skip it: there is nothing to save, and the status check below would
    # otherwise read an undefined (or stale) `status`.
    continue
  image = cv2.imread(os.path.join(source_dir, filename))
  # cv2.imread returns None (it does not raise) for a missing or unreadable file.
  if image is None:
    print("Error while reading image {}.".format(filename))
    continue
  status = cv2.imwrite(os.path.join(VAL_UNET_PATH, filename),
                       cv2.resize(image, (IMG_SIZE, IMG_SIZE)))
  if not status:
    print("Error while saving image {}.".format(filename))

In [None]:
# Save an IMG_SIZE x IMG_SIZE copy of every image listed in test_unet_df to TEST_UNET_PATH.
for filename in test_unet_df['filename']:
  # The file-name prefix tells which dataset folder holds the original image.
  if filename.startswith('CHNCXR'):
    source_dir = CHINA_XRAYS_PATH
  elif filename.startswith('MCUCXR'):
    source_dir = MONTGOMERY_XRAYS_PATH
  else:
    print('Unknown file in test_unet:', filename)
    # Skip it: there is nothing to save, and the status check below would
    # otherwise read an undefined (or stale) `status`.
    continue
  image = cv2.imread(os.path.join(source_dir, filename))
  # cv2.imread returns None (it does not raise) for a missing or unreadable file.
  if image is None:
    print("Error while reading image {}.".format(filename))
    continue
  status = cv2.imwrite(os.path.join(TEST_UNET_PATH, filename),
                       cv2.resize(image, (IMG_SIZE, IMG_SIZE)))
  if not status:
    print("Error while saving image {}.".format(filename))

In [None]:
# Number of rows per label value in the training table.
train_df['label'].value_counts()

1    219
0    211
Name: label, dtype: int64

In [None]:
# Number of rows per label value in the validation table.
val_df['label'].value_counts()

0    46
1    44
Name: label, dtype: int64

In [None]:
# Number of rows per label value in the test table.
test_df['label'].value_counts()

0    57
1    33
Name: label, dtype: int64