# Data preprocessing

The dataset contains a huge number of image files—231,723—which is far too many for me to train on. I'm going to create a smaller subset of these files for my initial training purposes.

In [1]:
import pandas as pd
import os
import shutil

In [3]:
# Roughly two-thirds of the photos contain no ships. Those rows are expected
# to carry a missing value (presumably in EncodedPixels), so dropping rows
# with NaNs removes them from the dataset.
df = pd.read_csv('./data/train_ship_segmentations_v2.csv')
df = df.dropna()

In [4]:
# (rows, columns) remaining after rows with missing values were dropped
df.shape

(81723, 2)

In [5]:
# Preview the first rows of the filtered frame
df.head()

Unnamed: 0,ImageId,EncodedPixels
2,000155de5.jpg,264661 17 265429 33 266197 33 266965 33 267733...
3,000194a2d.jpg,360486 1 361252 4 362019 5 362785 8 363552 10 ...
4,000194a2d.jpg,51834 9 52602 9 53370 9 54138 9 54906 9 55674 ...
5,000194a2d.jpg,198320 10 199088 10 199856 10 200624 10 201392...
6,000194a2d.jpg,55683 1 56451 1 57219 1 57987 1 58755 1 59523 ...


In [6]:
# Some images span multiple rows (see EDA). Join the EncodedPixels strings of
# each ImageId with a space so that every ship present in an image ends up in
# that image's single mask string.
df = (
    df.groupby('ImageId')[['EncodedPixels']]
    .agg(' '.join)
    .reset_index()
)

In [7]:
# (rows, columns) after merging: one row per ImageId
df.shape

(42556, 2)

## Make a sample of 1000 images

In [8]:
# Draw 1000 images at random. The seed is fixed so that re-running the
# notebook selects the same images; without it, the exported CSV and the
# copied image files would differ on every run.
sample = df.sample(n=1000, random_state=42)

In [9]:
# Confirm the sample has the requested number of rows
sample.shape

(1000, 2)

In [10]:
# Preview the first sampled rows
sample.head()

Unnamed: 0,ImageId,EncodedPixels
8813,35abc052e.jpg,75533 1 76300 3 77066 5 77833 7 78600 9 79367 ...
15149,5afa11f0c.jpg,137060 1 137826 4 138593 5 139360 7 140126 10 ...
17700,6a5b250ab.jpg,35813 8 36581 8 37349 8 38117 8 38885 8 39653 ...
19187,7345fdccd.jpg,317977 1 318744 4 319512 5 320279 8 321047 9 3...
6090,24b20063a.jpg,445483 2 446249 4 447015 7 447781 9 448547 12 ...


Export the sample dataframe

In [11]:
# Create the output folder first: to_csv does not create missing directories
# and raises if './data/train_small' is absent.
os.makedirs('./data/train_small', exist_ok=True)
# The DataFrame index is written as the first (unnamed) column of the CSV.
sample.to_csv('./data/train_small/segmentations.csv')

### Copy the sample images to their own folder

In [17]:
# Folder the sampled images are copied from
orig_path = './data/train_v2/'
# Folder the sampled images are copied to
new_path = './data/train_small/images/'

In [18]:
# Make sure the destination folder exists before copying; shutil.copyfile
# does not create missing directories.
os.makedirs(new_path, exist_ok=True)

# Copy (not move) every sampled image, keeping its file name. The originals
# stay in orig_path.
for image_id in sample['ImageId']:
    shutil.copyfile(
        os.path.join(orig_path, image_id),
        os.path.join(new_path, image_id),
    )