In [39]:
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd 
import os
from helpers import PATHS
from sklearn.model_selection import train_test_split

In [40]:
def get_dataframes():
    """
    Make the dataframes for the dataset. Check `eda.ipynb` for a more detailed explanation.
    Basically we're returning a df that has been cleaned of corrupted images.
    """

    #corrupted images
    exclude_list = ['6384c3e78.jpg','13703f040.jpg', '14715c06d.jpg',  '33e0ff2d5.jpg',
                '4d4e09f2a.jpg', '877691df8.jpg', '8b909bb20.jpg', 'a8d99130e.jpg', 
                'ad55c3143.jpg', 'c8260c541.jpg', 'd6c7f17c7.jpg', 'dc3e7c901.jpg',
                'e44dffe88.jpg', 'ef87bad36.jpg', 'f083256d8.jpg']

    # Get all
    masks = pd.read_csv(os.path.join(PATHS['root'], 'train_ship_segmentations_v2.csv'))
    # Remove corrupted file
    masks = masks[~masks['ImageId'].isin(exclude_list)]

    # Split between train and validation sets
    # We use stratify to balance the number of ships per image between the two df
    unique_img_ids = masks.groupby('ImageId').size().reset_index(name='counts')
    train_ids, val_ids = train_test_split(unique_img_ids, test_size=0.05, stratify=unique_img_ids['counts'], random_state=42)

    # Inner join masks with the ids
    train_df = pd.merge(masks, train_ids)
    valid_df = pd.merge(masks, val_ids)

    # Set has_ship =  if ship is present, otherwise 0
    train_df['has_ship'] = train_df.apply(lambda c_row: 1 if isinstance(c_row['EncodedPixels'], str) else 0, 1)
    valid_df['has_ship'] = valid_df.apply(lambda c_row: 1 if isinstance(c_row['EncodedPixels'], str) else 0, 1)

    train_df = train_df.drop(columns=['EncodedPixels', 'counts'])
    valid_df = valid_df.drop(columns=['EncodedPixels', 'counts'])

    # Remove duplicate lines
    train_df = train_df.groupby('ImageId').agg({ 'has_ship': 'first' }).reset_index()
    valid_df = valid_df.groupby('ImageId').agg({ 'has_ship': 'first' }).reset_index()

    return train_df, valid_df


In [41]:
df = get_dataframes()[0]
df

Unnamed: 0,ImageId,has_ship
0,00003e153.jpg,0
1,0001124c7.jpg,0
2,000155de5.jpg,1
3,000194a2d.jpg,1
4,0001b1832.jpg,0
...,...,...
182922,fffedbb6b.jpg,0
182923,ffff2aa57.jpg,0
182924,ffff6e525.jpg,0
182925,ffffc50b4.jpg,0


In [42]:
grp = list(df.groupby('ImageId'))
grp

[('00003e153.jpg',
           ImageId  has_ship
  0  00003e153.jpg         0),
 ('0001124c7.jpg',
           ImageId  has_ship
  1  0001124c7.jpg         0),
 ('000155de5.jpg',
           ImageId  has_ship
  2  000155de5.jpg         1),
 ('000194a2d.jpg',
           ImageId  has_ship
  3  000194a2d.jpg         1),
 ('0001b1832.jpg',
           ImageId  has_ship
  4  0001b1832.jpg         0),
 ('00021ddc3.jpg',
           ImageId  has_ship
  5  00021ddc3.jpg         1),
 ('0002756f7.jpg',
           ImageId  has_ship
  6  0002756f7.jpg         1),
 ('0002d0f32.jpg',
           ImageId  has_ship
  7  0002d0f32.jpg         0),
 ('000303d4d.jpg',
           ImageId  has_ship
  8  000303d4d.jpg         0),
 ('00031f145.jpg',
           ImageId  has_ship
  9  00031f145.jpg         1),
 ('00052ed46.jpg',
            ImageId  has_ship
  10  00052ed46.jpg         0),
 ('000532683.jpg',
            ImageId  has_ship
  11  000532683.jpg         1),
 ('00053c6ba.jpg',
            ImageId  has_ship

In [47]:
# [e['ImageId'] for e in df] 
list(df['ImageId'])

['00003e153.jpg',
 '0001124c7.jpg',
 '000155de5.jpg',
 '000194a2d.jpg',
 '0001b1832.jpg',
 '00021ddc3.jpg',
 '0002756f7.jpg',
 '0002d0f32.jpg',
 '000303d4d.jpg',
 '00031f145.jpg',
 '00052ed46.jpg',
 '000532683.jpg',
 '00053c6ba.jpg',
 '00057a50d.jpg',
 '000592296.jpg',
 '0005d01c8.jpg',
 '0005d6d95.jpg',
 '0006c52e8.jpg',
 '0007b8229.jpg',
 '000805313.jpg',
 '000811bb6.jpg',
 '0008d6216.jpg',
 '000913daf.jpg',
 '000952d51.jpg',
 '000969125.jpg',
 '0009a46e1.jpg',
 '000a4d4fd.jpg',
 '000aed819.jpg',
 '000baef0c.jpg',
 '000bd9ac4.jpg',
 '000c150b0.jpg',
 '000c34352.jpg',
 '000d26c17.jpg',
 '000db3e62.jpg',
 '000e37fc6.jpg',
 '000e6378b.jpg',
 '000e64855.jpg',
 '000f1f959.jpg',
 '000f7e728.jpg',
 '000f7f2fd.jpg',
 '000fd9827.jpg',
 '00104f16f.jpg',
 '0010e88ce.jpg',
 '0010eefb1.jpg',
 '00113a75c.jpg',
 '00119e4ba.jpg',
 '0011a9ccb.jpg',
 '0011c31b7.jpg',
 '00122061c.jpg',
 '001234638.jpg',
 '0012669a8.jpg',
 '0012a8fa1.jpg',
 '0012b8981.jpg',
 '0012be3bb.jpg',
 '0013b2222.jpg',
 '00140e59