In [10]:
# Necessary/extra dependencies. 
import os
import gc
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from shutil import copyfile
from sklearn.model_selection import train_test_split

#customize iPython writefile so we can write variables
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
    with open(line, 'w') as f:
        f.write(cell.format(**globals()))

os.chdir("/Users/siyiwei/Desktop/COVID_XRay_Detection/")

In [11]:
TRAIN_PATH = './images/512jpg/'
IMG_SIZE = 512

In [12]:
# Load image level csv file
df = pd.read_csv('./image_info/train_image_level.csv')

# Modify values in the id column
df['id'] = df.apply(lambda row: row.id.split('_')[0], axis=1)
# Add absolute path
df['path'] = df.apply(lambda row: TRAIN_PATH+row.id+'.jpg', axis=1)
# Get image level labels
df['image_level'] = df.apply(lambda row: row.label.split(' ')[0], axis=1)

df.head(5)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,./images/512jpg/000a312787f2.jpg,opacity
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed,./images/512jpg/000c3a3f293f.jpg,none
2,0012ff7358bc,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,./images/512jpg/0012ff7358bc.jpg,opacity
3,001398f4ff4f,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,./images/512jpg/001398f4ff4f.jpg,opacity
4,001bd15d1891,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e,./images/512jpg/001bd15d1891.jpg,opacity


In [13]:
meta_df = pd.read_csv('./image_info/size.csv')
train_meta_df = meta_df.loc[meta_df.split == 'train']
train_meta_df = train_meta_df.drop('split', axis=1)
train_meta_df.columns = ['id', 'dim0', 'dim1']
train_meta_df["id"] = train_meta_df["id"].str.replace(".jpg", "")

train_meta_df.head(2)

  train_meta_df["id"] = train_meta_df["id"].str.replace(".jpg", "")


Unnamed: 0,id,dim0,dim1
1263,d8ba599611e5,2336,2836
1264,29b23a11d1e4,3488,4256


In [14]:
df = df.merge(train_meta_df, on='id',how="left")
df.head(2)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,dim0,dim1
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,./images/512jpg/000a312787f2.jpg,opacity,3488,4256
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed,./images/512jpg/000c3a3f293f.jpg,none,2320,2832


In [15]:
# Create train and validation split.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df.image_level.values)

train_df.loc[:, 'split'] = 'train'
valid_df.loc[:, 'split'] = 'valid'

df = pd.concat([train_df, valid_df]).reset_index(drop=True)

print(f'Size of dataset: {len(df)}, training images: {len(train_df)}. validation images: {len(valid_df)}')

Size of dataset: 6334, training images: 5067. validation images: 1267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [16]:
# Get the raw bounding box by parsing the row value of the label column.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
def get_bbox(row):
    bboxes = []
    bbox = []
    for i, l in enumerate(row.label.split(' ')):
        if (i % 6 == 0) | (i % 6 == 1):
            continue
        bbox.append(float(l))
        if i % 6 == 5:
            bboxes.append(bbox)
            bbox = []  
            
    return bboxes

# Scale the bounding boxes according to the size of the resized image. 
def scale_bbox(row, bboxes):
    # Get scaling factor
    scale_x = IMG_SIZE/row.dim1
    scale_y = IMG_SIZE/row.dim0
    
    scaled_bboxes = []
    for bbox in bboxes:
        x = int(np.round(bbox[0]*scale_x, 4))
        y = int(np.round(bbox[1]*scale_y, 4))
        x1 = int(np.round(bbox[2]*(scale_x), 4))
        y1= int(np.round(bbox[3]*scale_y, 4))

        scaled_bboxes.append([x, y, x1, y1]) # xmin, ymin, xmax, ymax
        
    return scaled_bboxes

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    yolo_boxes = []
    for bbox in bboxes:
        w = bbox[2] - bbox[0] # xmax - xmin
        h = bbox[3] - bbox[1] # ymax - ymin
        xc = bbox[0] + int(np.round(w/2)) # xmin + width/2
        yc = bbox[1] + int(np.round(h/2)) # ymin + height/2
        
        yolo_boxes.append([xc/img_w, yc/img_h, w/img_w, h/img_h]) # x_center y_center width height
    
    return yolo_boxes

In [8]:
df.head(2)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,dim0,dim1,split
0,badf2d31cdbd,"[{'x': 484.0363, 'y': 955.76643, 'width': 587....",opacity 1 484.0363 955.76643 1071.75999 1657.7...,c508e7ff4063,./images/512jpg/badf2d31cdbd.jpg,opacity,2336,2836,train
1,8d95766f633e,"[{'x': 1821.70335, 'y': 880.30619, 'width': 66...",opacity 1 1821.70335 880.30619 2485.89476 1875...,2c78ef584129,./images/512jpg/8d95766f633e.jpg,opacity,2400,2880,train


In [17]:
for i in tqdm(range(len(df))):
    row = df.loc[i]
    # Get image id
    img_id = row.id
    # Get image-level label
    label = row.image_level
    
    file_name = f'./images/box_locations/{row.id}.txt'
        
    
    if label=='opacity':
        # Get bboxes
        bboxes = get_bbox(row)
        # Scale bounding boxes
        scale_bboxes = scale_bbox(row, bboxes)
        # Format for YOLOv5
        yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)
        
        with open(file_name, 'w') as f:
            for bbox in yolo_bboxes:
                bbox = [1]+bbox
                bbox = [str(i) for i in bbox]
                bbox = ' '.join(bbox)
                f.write(bbox)
                f.write('\n')
    else:
        with open(file_name, 'w') as f:
            f.write('\n')
    

100%|██████████| 6334/6334 [00:04<00:00, 1411.06it/s]
