In [43]:
import pandas as pd

#### Dev data contains 1,000 images - 500 with objects and 500 without objects.

In [34]:
dev_csv = u'../data/raw/dev.csv'
orig_dev_df = pd.read_csv(dev_csv)
orig_dev_df.head()

Unnamed: 0,image_name,annotation
0,08001.jpg,0 539 1036 897 1460
1,08002.jpg,
2,08003.jpg,
3,08004.jpg,0 828 1607 1181 2021
4,08005.jpg,


The original format of the "annotation" column is
```
ANNO_TYPE_IDX x1 y1 x2 y2 ... xn yn;...
```
where: 
- ANNO_TYPE_IDX: 0, 1, 2 for rectangle, ellipse and polygon
- x1 y1 x2 y2: coordinates of the bounding shape
     - for rectangle, ellipse, it's a box where x1<x2 and y1<y2
     - for polygon, it's a sequence of coordinates (it can easily be translated to a box, but for the sake of this project, we are removing these)
-`;`: marks/separates multiple objects, we are also removing it for simplicity

Conclusion: we are only keeping `(0 or 1) x1 y1 x2 y2`, same training

In [35]:
orig_dev_df.annotation.dtype

dtype('O')

In [36]:
# separate images with and without object
dev_w_obj_df = orig_dev_df[orig_dev_df.annotation.notnull()]
dev_wo_obj_df = orig_dev_df[orig_dev_df.annotation.isnull()]
dev_w_obj_img = dev_w_obj_df.image_name.tolist()
dev_wo_obj_img = dev_wo_obj_df.image_name.tolist()

In [37]:
dev_w_obj_df.shape

(500, 2)

In [38]:
dev_wo_obj_df.shape

(500, 2)

In [39]:
# remove anything with lens>5
dev_w_obj_df['annotation'] = dev_w_obj_df['annotation'].astype(str)
dev_w_obj_df['annotation_len'] = dev_w_obj_df['annotation'].apply(lambda x: len(x.split(' ')))
dev_w_obj_df = dev_w_obj_df[dev_w_obj_df.annotation_len <= 5]
# drop the annotation_len column
dev_w_obj_df = dev_w_obj_df.drop('annotation_len', axis=1)
dev_w_obj_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_w_obj_df['annotation'] = dev_w_obj_df['annotation'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_w_obj_df['annotation_len'] = dev_w_obj_df['annotation'].apply(lambda x: len(x.split(' ')))


(228, 2)

In [40]:
dev_w_obj_img = dev_w_obj_df.image_name.tolist()
len(dev_w_obj_img)

228

In [41]:
filtered_dev_img = dev_w_obj_img + dev_wo_obj_img
len(filtered_dev_img)

728

In [42]:
import shutil
import os

src = u'../data/raw/dev/'
dst = u'../data/filtered/dev/'

# iterate
for img in filtered_dev_img: 
    src_path = os.path.join(src, img)
    dst_path = os.path.join(dst, img)
    # move
    shutil.copy(src_path, dst_path)
    print(f'copy {src_path} to {dst_path}')

copy ../data/raw/dev/08001.jpg to ../data/filtered/dev/08001.jpg
copy ../data/raw/dev/08004.jpg to ../data/filtered/dev/08004.jpg
copy ../data/raw/dev/08010.jpg to ../data/filtered/dev/08010.jpg
copy ../data/raw/dev/08011.jpg to ../data/filtered/dev/08011.jpg
copy ../data/raw/dev/08021.jpg to ../data/filtered/dev/08021.jpg
copy ../data/raw/dev/08022.jpg to ../data/filtered/dev/08022.jpg
copy ../data/raw/dev/08025.jpg to ../data/filtered/dev/08025.jpg
copy ../data/raw/dev/08035.jpg to ../data/filtered/dev/08035.jpg
copy ../data/raw/dev/08036.jpg to ../data/filtered/dev/08036.jpg
copy ../data/raw/dev/08046.jpg to ../data/filtered/dev/08046.jpg
copy ../data/raw/dev/08051.jpg to ../data/filtered/dev/08051.jpg
copy ../data/raw/dev/08059.jpg to ../data/filtered/dev/08059.jpg
copy ../data/raw/dev/08063.jpg to ../data/filtered/dev/08063.jpg
copy ../data/raw/dev/08065.jpg to ../data/filtered/dev/08065.jpg
copy ../data/raw/dev/08066.jpg to ../data/filtered/dev/08066.jpg
copy ../data/raw/dev/0806

KeyboardInterrupt: 

In [None]:
# from orig_dev_df, all images in the filtered_dev_img
# crete a new dataframe
filtered_dev_df = orig_dev_df[orig_dev_df.image_name.isin(filtered_dev_img)]
filtered_dev_df.shape

(728, 2)

In [None]:
# save to csv
filtered_dev_df.to_csv('../data/filtered/dev.csv', index=False)

#### Train data contains 8,000 images - 4,000 with objects and 4,000 without objects.

In [44]:
train_csv = u'../data/raw/train.csv'
train_df = pd.read_csv(train_csv)
train_df.head()

Unnamed: 0,image_name,annotation
0,00001.jpg,
1,00002.jpg,
2,00003.jpg,
3,00004.jpg,
4,00005.jpg,0 2076 1559 2369 1695


In [45]:
# image name where annotation is not NaN
train_w_obj_df = train_df[train_df.annotation.notnull()]
train_wo_obj_df = train_df[train_df.annotation.isnull()]
train_w_obj_img = train_w_obj_df.image_name.tolist()
train_wo_obj_img = train_wo_obj_df.image_name.tolist()

In [46]:
len(train_w_obj_img), len(train_wo_obj_img)

(4000, 4000)

In [47]:
# take 10%
num_of_elem = int(len(train_w_obj_img)/10)
num_of_elem

400

repeat the same process for removing images with objects that doesn't conform to our task

In [48]:
# remove anything with lens>5
train_w_obj_df['annotation'] = train_w_obj_df['annotation'].astype(str)
train_w_obj_df['annotation_len'] = train_w_obj_df['annotation'].apply(lambda x: len(x.split(' ')))
train_w_obj_df = train_w_obj_df[train_w_obj_df.annotation_len <= 5]
# drop the annotation_len column
train_w_obj_df = train_w_obj_df.drop('annotation_len', axis=1)
train_w_obj_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_w_obj_df['annotation'] = train_w_obj_df['annotation'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_w_obj_df['annotation_len'] = train_w_obj_df['annotation'].apply(lambda x: len(x.split(' ')))


(1783, 2)

In [49]:
train_w_obj_img = train_w_obj_df.image_name.tolist()
len(train_w_obj_img)

1783

In [50]:
filtered_train_img = train_w_obj_img + train_wo_obj_img
len(filtered_train_img)

5783

In [None]:
src = u'../data/raw/train/'
dst = u'../data/filtered/train/'

# iterate
for img in filtered_train_img: 
    src_path = os.path.join(src, img)
    dst_path = os.path.join(dst, img)
    # move
    shutil.copy(src_path, dst_path)
    print(f'copy {src_path} to {dst_path}')

move ../data/raw/train/00005.jpg to ../data/processed/train/00005.jpg
move ../data/raw/train/00006.jpg to ../data/processed/train/00006.jpg
move ../data/raw/train/00007.jpg to ../data/processed/train/00007.jpg
move ../data/raw/train/00008.jpg to ../data/processed/train/00008.jpg
move ../data/raw/train/00009.jpg to ../data/processed/train/00009.jpg
move ../data/raw/train/00011.jpg to ../data/processed/train/00011.jpg
move ../data/raw/train/00013.jpg to ../data/processed/train/00013.jpg
move ../data/raw/train/00015.jpg to ../data/processed/train/00015.jpg
move ../data/raw/train/00017.jpg to ../data/processed/train/00017.jpg
move ../data/raw/train/00021.jpg to ../data/processed/train/00021.jpg
move ../data/raw/train/00022.jpg to ../data/processed/train/00022.jpg
move ../data/raw/train/00034.jpg to ../data/processed/train/00034.jpg
move ../data/raw/train/00035.jpg to ../data/processed/train/00035.jpg
move ../data/raw/train/00036.jpg to ../data/processed/train/00036.jpg
move ../data/raw/tra

In [51]:
filtered_train_df = train_df[train_df.image_name.isin(filtered_train_img)]
filtered_train_df.shape

(5783, 2)

In [52]:
# save to csv   
filtered_train_df.to_csv('../data/filtered/train.csv', index=False)

Sample data based - we are going to take 10% of the training data (balanced), that means 400 with objects and 400 without objects

In [53]:
import random
sample_train_w_obj_img = random.sample(train_w_obj_img, num_of_elem)
sample_train_wo_obj_img = random.sample(train_wo_obj_img, num_of_elem)
sample_train_img = sample_train_w_obj_img + sample_train_wo_obj_img

In [54]:
len(sample_train_img)

800

Go to train folder, move these to the new folder.

In [55]:

src = u'../data/filtered/train/'
dst = u'../data/filtered/sample_train/'

# iterate
for img in sample_train_img: 
    src_path = os.path.join(src, img)
    dst_path = os.path.join(dst, img)
    # move
    shutil.copy(src_path, dst_path)
    print(f'copy {src_path} to {dst_path}')

copy ../data/filtered/train/03947.jpg to ../data/filtered/sample_train/03947.jpg
copy ../data/filtered/train/04617.jpg to ../data/filtered/sample_train/04617.jpg
copy ../data/filtered/train/06265.jpg to ../data/filtered/sample_train/06265.jpg
copy ../data/filtered/train/00866.jpg to ../data/filtered/sample_train/00866.jpg
copy ../data/filtered/train/01441.jpg to ../data/filtered/sample_train/01441.jpg
copy ../data/filtered/train/04263.jpg to ../data/filtered/sample_train/04263.jpg
copy ../data/filtered/train/01328.jpg to ../data/filtered/sample_train/01328.jpg
copy ../data/filtered/train/00427.jpg to ../data/filtered/sample_train/00427.jpg
copy ../data/filtered/train/02930.jpg to ../data/filtered/sample_train/02930.jpg
copy ../data/filtered/train/05071.jpg to ../data/filtered/sample_train/05071.jpg
copy ../data/filtered/train/05974.jpg to ../data/filtered/sample_train/05974.jpg
copy ../data/filtered/train/06165.jpg to ../data/filtered/sample_train/06165.jpg
copy ../data/filtered/train/

In [56]:
# create a new dataframe
sample_train_df = train_df[train_df.image_name.isin(sample_train_img)]
sample_train_df.shape

(800, 2)

In [None]:
# to csv
sample_train_df.to_csv('../data/filtered/sample_train.csv', index=False)

#### Test if the original boxes are valid

In [2]:
%pwd

'/nfs/home/zle2435/dl_proj/notebook'

In [12]:
import os
image_dir = r'../data/raw/train'
image_names = os.listdir(image_dir)
len(image_names)

8000

In [13]:
import pandas as pd

csv_path = r'../data/raw/train.csv'
df = pd.read_csv(csv_path)
annotations = df['annotation'].fillna('').values

In [14]:
annotations

array(['', '', '', ..., '0 1072 696 1375 1432', '0 1578 1234 1943 1599',
       ''], dtype=object)

In [None]:
import cv2
from PIL import Image, ImageDraw

images = []
valid_annotations = []
counter = 0


for i, name in enumerate(image_names):
    if counter>5: break
    if annotations[i] == '' and len(annotations[i]) > 25:
        continue
    else:
        path = os.path.join(image_dir, name)
        img = cv2.imread(path)
        # Draw the bounding box with red lines
        image = Image.open(path)
        print(name)
        draw = ImageDraw.Draw(image)
        annotation_list = annotations[i].split(' ')[1:]
        x1, y1, x2, y2 = [int(x) for x in annotation_list]
        outline_color = (255, 0, 0)  # Red in RGB format
        line_width = 5
        # draw.rectangle([(x1, x2), (y1, y2)], width=line_width)
        draw.rectangle([(x1, y1), (x2, y2)], width=line_width)
        image.show()
        counter += 1